Compare commits: a82ec96eb7...e5a82ff1f4 (11 commits)

Author | SHA1
---|---
pictuga | e5a82ff1f4
pictuga | f3d1f92b39
pictuga | 7691df5257
pictuga | 0ae0dbc175
pictuga | f1d0431e68
pictuga | a09831415f
pictuga | bfad6b7a4a
pictuga | 6b8c3e51e7
pictuga | dc9e425247
pictuga | 2f48e18bb1
pictuga | 31cac921c7

@@ -24,15 +24,13 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
 Please mind that feeds based on html files may stop working unexpectedly, due to
 html structure changes on the target website.
 
-Additionally morss can grab the source xml feed of iTunes podcast, and detect
-rss feeds in html pages' `<meta>`.
+Additionally morss can detect rss feeds in html pages' `<meta>`.
 
 You can use this program online for free at **[morss.it](https://morss.it/)**.
 
 Some features of morss:
 - Read RSS/Atom feeds
 - Create RSS feeds from json/html pages
-- Convert iTunes podcast links into xml links
 - Export feeds as RSS/JSON/CSV/HTML
 - Fetch full-text content of feed items
 - Follow 301/meta redirects

@@ -75,6 +73,8 @@ The arguments are:
 
 - Change what morss does
 - `json`: output as JSON
+- `html`: output as HTML
+- `csv`: output as CSV
 - `proxy`: doesn't fill the articles
 - `clip`: stick the full article content under the original feed content (useful for twitter)
 - `search=STRING`: does a basic case-sensitive search in the feed

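For illustration only (not part of the diff): assuming the `:argument/FEEDURL` web-server syntax described elsewhere in the README, the two new switches would be requested as, e.g., `https://morss.example/:html/https://example.com/feed.xml` or `https://morss.example/:csv/https://example.com/feed.xml`; `morss.example` and the feed URL are placeholders.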
@@ -88,11 +88,9 @@ The arguments are:
 - `mono`: disable multithreading while fetching, makes debugging easier
 - `theforce`: force download the rss feed and ignore cached http errors
 - `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
-- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
 - http server only
 - `callback=NAME`: for JSONP calls
 - `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
-- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
 - `txt`: changes the http content-type to txt (for faster "`view-source:`")
 - Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
 - `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries

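A hypothetical custom-feed request (not part of the diff), assuming the same web-server URL syntax: an xpath rule such as `//div[@class="post"]` would be passed as `items` with each `/` turned into `|`, e.g. `https://morss.example/:items=||div[@class="post"]/https://blog.example/`; host, target page and rule are placeholders.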
@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 
+def get(*args, **kwargs):
+    return adv_get(*args, **kwargs)[0]
+
+
+def adv_get(url, timeout=None, *args, **kwargs):
+    if timeout is None:
+        con = custom_handler(*args, **kwargs).open(url)
+
+    else:
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
+
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    encoding = detect_encoding(data, con)
+
+    return data, con, contenttype, encoding
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
 
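A minimal usage sketch of the two new helpers (not part of the diff); it assumes the module is importable as `morss.crawler` and uses a placeholder URL:

```python
from morss import crawler

# adv_get() bundles what callers previously did by hand: open the URL,
# read the body, and extract content-type and encoding from the response.
data, con, contenttype, encoding = crawler.adv_get('https://example.com/feed.xml', timeout=4)

print(con.geturl())   # final URL after redirects
print(contenttype)    # e.g. 'application/rss+xml'
print(encoding)       # whatever detect_encoding() reported

# get() keeps the old "just give me the bytes" behaviour
body = crawler.get('https://example.com/feed.xml')
```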
@@ -199,7 +218,6 @@ class BrowserlyHeaderHandler(BaseHandler):
     """ Add more headers to look less suspicious """
 
     def http_request(self, req):
-        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
         req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
         req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
         return req

@@ -15,7 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 
 import lxml.html
-from bs4 import BeautifulSoup
+from .readabilite import parse as html_parse
 
 json.encoder.c_make_encoder = None
 
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()

@@ -67,14 +67,14 @@ def parse(data, url=None, mimetype=None):
                 for path in ruleset['path']:
                     if fnmatch(url, path):
                         parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                        return parser(data, ruleset)
+                        return parser(data, ruleset, encoding=encoding)
 
     # 2) Look for a parser based on mimetype
 
     if mimetype is not None:
         parser_candidates = [x for x in parsers if mimetype in x.mimetype]
 
-    if mimetype is None or parser_candidates is None:
+    if mimetype is None or len(parser_candidates) == 0:
         parser_candidates = parsers
 
     # 3) Look for working ruleset for given parser

@@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
         # 'path' as they should have been caught beforehands
 
         try:
-            feed = parser(data)
+            feed = parser(data, encoding=encoding)
 
         except (ValueError):
             # parsing did not work

@@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
 
 
 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None):
+    def __init__(self, data=None, rules=None, parent=None, encoding=None):
         if rules is None:
             rules = parse_rules()[self.default_ruleset]
 
@@ -122,9 +122,10 @@ class ParserBase(object):
         if data is None:
             data = rules['base']
 
-        self.root = self.parse(data)
         self.parent = parent
+        self.encoding = encoding
 
+        self.root = self.parse(data)
 
     def parse(self, raw):
         pass

@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
     mimetype = ['text/html', 'application/xhtml+xml']
 
     def parse(self, raw):
-        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
+        return html_parse(raw, encoding=self.encoding)
 
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)

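A hedged sketch of how the reworked parser entry point would now be driven (not part of the diff): the encoding detected by the crawler is handed to `feeds.parse()`, which forwards it to the ruleset parser (`ParserBase.encoding`, used by `ParserHTML.parse()` above). Import path and URL are assumptions:

```python
from morss import crawler, feeds

data, con, contenttype, encoding = crawler.adv_get('https://example.com/feed.xml')

# mimetype picks the parser, encoding tells it how to decode the raw bytes
rss = feeds.parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)
```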
@@ -10,7 +10,6 @@ import re
 
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
 
 from . import feeds
 from . import crawler

@@ -251,19 +250,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
     if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
     if out is not None:
         item.content = out

@@ -324,18 +321,14 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-                     .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml)
+        rss = feeds.FeedHTML(xml, encoding=encoding)
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'

@@ -355,7 +348,7 @@ def FeedFetch(url, options):
 
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype)
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
@@ -469,7 +462,7 @@ def FeedFormat(rss, options, encoding='utf-8'):
     elif options.csv:
         return rss.tocsv(encoding=encoding)
 
-    elif options.reader:
+    elif options.html:
         if options.indent:
             return rss.tohtml(encoding=encoding, pretty_print=True)
 
@@ -547,7 +540,7 @@ def cgi_app(environ, start_response):
     if options.cors:
         headers['access-control-allow-origin'] = '*'
 
-    if options.html or options.reader:
+    if options.html:
         headers['content-type'] = 'text/html'
     elif options.txt or options.silent:
         headers['content-type'] = 'text/plain'

@@ -652,13 +645,10 @@ def cgi_page(environ, start_response):
         if urlparse(url).scheme not in ['http', 'https']:
             url = 'http://' + url
 
-        con = crawler.custom_handler().open(url)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=url)
 
-        contenttype = con.info().get('Content-Type', '').split(';')[0]
-
         if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-            html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+            html = readabilite.parse(data, encoding=encoding)
             html.make_links_absolute(con.geturl())
 
             kill_tags = ['script', 'iframe', 'noscript']

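The same pattern in miniature (a sketch, not part of the diff): fetch a page, let `readabilite.parse()` build the lxml tree using the detected encoding, absolutize links, then drop active content much like `cgi_page()` does. URL and tag list are illustrative:

```python
from morss import crawler, readabilite

data, con, contenttype, encoding = crawler.adv_get('https://example.com/article')

html = readabilite.parse(data, encoding=encoding)  # lxml.html element
html.make_links_absolute(con.geturl())

# strip scripts and frames before handing the page back to the browser
for tag in ['script', 'iframe', 'noscript']:
    for item in list(html.iterfind('.//' + tag)):
        item.getparent().remove(item)
```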
@@ -6,11 +6,14 @@ import re
 
 def parse(data, encoding=None):
     if encoding:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
-    else:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
 
-    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
+    else:
+        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
+
+    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+
+    return lxml.html.fromstring(data, parser=parser)
 
 
 def count_words(string):

@@ -44,6 +47,12 @@ def count_content(node):
     return count_words(node.text_content()) + len(node.findall('.//img'))
 
 
+def percentile(N, P):
+    # https://stackoverflow.com/a/7464107
+    n = max(int(round(P * len(N) + 0.5)), 2)
+    return N[n-2]
+
+
 class_bad = ['comment', 'community', 'extra', 'foot',
     'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
     'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',

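For reference, a quick self-contained check (not part of the diff) of how the new `percentile()` helper behaves on a ranked score list; `get_article()` later calls it with `P = 0.1` to derive a keep-threshold from the scores sorted largest-first:

```python
def percentile(N, P):
    # copied from the hunk above
    n = max(int(round(P * len(N) + 0.5)), 2)
    return N[n-2]

scores = [40.0, 12.0, 9.5, 3.0, 1.0, 0.5]  # ranked, largest score first

print(percentile(scores, 0.5))  # 9.5  -> n = round(3.5) = 4, so N[2]
print(percentile(scores, 0.1))  # 40.0 -> n = max(round(1.1), 2) = 2, so N[0]
```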
@@ -123,33 +132,42 @@ def score_node(node):
     return score
 
 
-def score_all(node, grades=None):
+def score_all(node):
     " Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
 
-    if grades is None:
-        grades = {}
-
     for child in node:
         score = score_node(child)
         child.attrib['seen'] = 'yes, ' + str(int(score))
 
-        if score > 0 or not len(grades):
-            spread_score(child, score, grades)
-            score_all(child, grades)
+        if score > 0 or len(list(child.iterancestors())) <= 2:
+            spread_score(child, score)
+            score_all(child)
 
-    return grades
-
 
-def spread_score(node, score, grades):
+def set_score(node, value):
+    node.attrib['morss_score'] = str(float(value))
+
+
+def get_score(node):
+    return float(node.attrib.get('morss_score', 0))
+
+
+def incr_score(node, delta):
+    set_score(node, get_score(node) + delta)
+
+
+def get_all_scores(node):
+    return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
+
+
+def spread_score(node, score):
     " Spread the node's score to its parents, on a linear way "
 
     delta = score / 2
 
     for ancestor in [node,] + list(node.iterancestors()):
         if score >= 1 or ancestor is node:
-            try:
-                grades[ancestor] += score
-
-            except KeyError:
-                grades[ancestor] = score
+            incr_score(ancestor, score)
 
             score -= delta
 
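A small self-contained sketch (not part of the diff) of the new bookkeeping: scores now live in a `morss_score` attribute on each element rather than in an external `grades` dict, which is what later lets `get_all_scores()` walk the tree and recover them. The helpers are repeated here verbatim so the snippet runs on its own:

```python
import lxml.html

def set_score(node, value):
    node.attrib['morss_score'] = str(float(value))

def get_score(node):
    return float(node.attrib.get('morss_score', 0))

def incr_score(node, delta):
    set_score(node, get_score(node) + delta)

root = lxml.html.fromstring('<div><p>hello world</p></div>')
p = root.find('.//p')

incr_score(p, 3)
incr_score(p, 1.5)

print(get_score(p))             # 4.5
print(p.attrib['morss_score'])  # '4.5'
```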
@@ -157,26 +175,24 @@ def spread_score(node, score, grades):
             break
 
 
-def write_score_all(root, grades):
-    " Useful for debugging "
-
-    for node in root.iter():
-        node.attrib['score'] = str(int(grades.get(node, 0)))
-
-
-def clean_root(root):
+def clean_root(root, keep_threshold=None):
     for node in list(root):
-        clean_root(node)
-        clean_node(node)
+        # bottom-up approach, i.e. starting with children before cleaning current node
+        clean_root(node, keep_threshold)
+        clean_node(node, keep_threshold)
 
 
-def clean_node(node):
+def clean_node(node, keep_threshold=None):
     parent = node.getparent()
 
     if parent is None:
         # this is <html/> (or a removed element waiting for GC)
         return
 
+    if keep_threshold is not None and get_score(node) >= keep_threshold:
+        # high score, so keep
+        return
+
     gdparent = parent.getparent()
 
     # remove shitty tags

@@ -275,18 +291,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
     return nodeA # should always find one tho, at least <html/>, but needed for max_depth
 
 
-def rank_nodes(grades):
+def rank_grades(grades):
+    # largest score to smallest
     return sorted(grades.items(), key=lambda x: x[1], reverse=True)
 
 
-def get_best_node(grades):
+def get_best_node(ranked_grades):
     " To pick the best (raw) node. Another function will clean it "
 
-    if len(grades) == 1:
-        return grades[0]
+    if len(ranked_grades) == 1:
+        return ranked_grades[0]
 
-    top = rank_nodes(grades)
-    lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
+    lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
 
     return lowest
 
@@ -295,12 +311,17 @@ def get_article(data, url=None, encoding=None):
     " Input a raw html string, returns a raw html string of the article "
 
     html = parse(data, encoding)
-    scores = score_all(html)
+    score_all(html)
+    scores = rank_grades(get_all_scores(html))
 
     if not len(scores):
         return None
 
     best = get_best_node(scores)
 
+    keep_threshold = percentile([x[1] for x in scores], 0.1)
+    clean_root(best, keep_threshold)
+
     wc = count_words(best.text_content())
     wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
 
@@ -310,6 +331,4 @@ def get_article(data, url=None, encoding=None):
     if url:
         best.make_links_absolute(url)
 
-    clean_root(best)
-
     return lxml.etree.tostring(best, pretty_print=True)
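Putting the readabilite changes together, a hedged end-to-end sketch (not part of the diff) of the new extraction pipeline, roughly as `ItemFill()` now drives it; the import path and URL are assumptions:

```python
from morss import crawler, readabilite

data, con, contenttype, encoding = crawler.adv_get('https://example.com/article')

# inside get_article(): parse() -> score_all() -> rank_grades(get_all_scores())
# -> get_best_node() -> clean_root(best, keep_threshold=percentile(scores, 0.1))
article = readabilite.get_article(data, url=con.geturl(), encoding=encoding)

if article is not None:
    print(article.decode('utf-8', 'replace')[:200])  # raw html of the extracted article
```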