Compare commits


11 Commits

Author SHA1 Message Date
pictuga e5a82ff1f4 crawler: drop auto-referer 2020-04-07 10:39:21 +02:00
    Was solving some issues. But creating even more issues.
pictuga f3d1f92b39 Detect encoding everytime 2020-04-07 10:38:36 +02:00
pictuga 7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
pictuga 0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
pictuga f1d0431e68 morss: drop :html, replaced with :reader 2020-04-07 09:23:29 +02:00
    README updated accordingly
pictuga a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
pictuga bfad6b7a4a readabilite: clean before counting 2020-04-06 16:55:39 +02:00
    To remove links which are not kept anyway
pictuga 6b8c3e51e7 readabilite: fix threshold feature 2020-04-06 16:52:06 +02:00
    Awkward typo...
pictuga dc9e425247 readabilite: don't clean-out the top 10% nodes 2020-04-06 14:26:28 +02:00
    Loosen up the code once again to limit over-kill
pictuga 2f48e18bb1 readabilite: put scores directly in html node 2020-04-06 14:21:41 +02:00
    Probably slower but makes code somewhat cleaner...
pictuga 31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
5 changed files with 97 additions and 72 deletions

README.md

@@ -24,15 +24,13 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
 Please mind that feeds based on html files may stop working unexpectedly, due to
 html structure changes on the target website.

-Additionally morss can grab the source xml feed of iTunes podcast, and detect
-rss feeds in html pages' `<meta>`.
+Additionally morss can detect rss feeds in html pages' `<meta>`.

 You can use this program online for free at **[morss.it](https://morss.it/)**.

 Some features of morss:
 - Read RSS/Atom feeds
 - Create RSS feeds from json/html pages
-- Convert iTunes podcast links into xml links
 - Export feeds as RSS/JSON/CSV/HTML
 - Fetch full-text content of feed items
 - Follow 301/meta redirects
@@ -75,6 +73,8 @@ The arguments are:
 - Change what morss does
 - `json`: output as JSON
+- `html`: outpout as HTML
+- `csv`: outpout as CSV
 - `proxy`: doesn't fill the articles
 - `clip`: stick the full article content under the original feed content (useful for twitter)
 - `search=STRING`: does a basic case-sensitive search in the feed
@@ -88,11 +88,9 @@ The arguments are:
 - `mono`: disable multithreading while fetching, makes debugging easier
 - `theforce`: force download the rss feed and ignore cached http errros
 - `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
-- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
 - http server only
 - `callback=NAME`: for JSONP calls
 - `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
-- `html`: changes the http content-type to html, so that python cgi erros (written in html) are readable in a web browser
 - `txt`: changes the http content-type to txt (for faster "`view-source:`")
 - Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
 - `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
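For context, these output switches are combined with a feed URL when calling morss; a hypothetical webserver call (the `:option` path syntax and the host are assumptions, not shown in this excerpt) could look like:

    https://morss.example/:csv/https://feeds.bbci.co.uk/news/rss.xml

which would return the processed feed as CSV; `:json` and the new `:html` work the same way.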

crawler.py

@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


+def get(*args, **kwargs):
+    return adv_get(*args, **kwargs)[0]
+
+
+def adv_get(url, timeout=None, *args, **kwargs):
+    if timeout is None:
+        con = custom_handler(*args, **kwargs).open(url)
+
+    else:
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
+
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    encoding= detect_encoding(data, con)
+
+    return data, con, contenttype, encoding
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
@@ -199,7 +218,6 @@ class BrowserlyHeaderHandler(BaseHandler):
     """ Add more headers to look less suspicious """

     def http_request(self, req):
-        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
        return req
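The new wrapper bundles the open/read/content-type/encoding steps that every caller used to repeat. A rough usage sketch, mirroring the call sites in the morss.py hunks further down (the URL and the package-level import are assumptions, not part of this diff):

    from morss import crawler

    # adv_get() returns everything the callers previously assembled by hand
    data, con, contenttype, encoding = crawler.adv_get(url='https://example.com/feed.xml', timeout=4)

    print(contenttype)   # e.g. 'application/rss+xml'
    print(encoding)      # whatever detect_encoding() reported, e.g. 'utf-8'

    # plain get() just returns the body
    body = crawler.get('https://example.com/feed.xml')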

feeds.py

@@ -15,7 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 import lxml.html
-from bs4 import BeautifulSoup
+from .readabilite import parse as html_parse

 json.encoder.c_make_encoder = None
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
     return rules


-def parse(data, url=None, mimetype=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "

     rulesets = parse_rules()
@@ -67,14 +67,14 @@ def parse(data, url=None, mimetype=None):
            for path in ruleset['path']:
                if fnmatch(url, path):
                    parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                    return parser(data, ruleset)
+                    return parser(data, ruleset, encoding=encoding)

    # 2) Look for a parser based on mimetype
    if mimetype is not None:
        parser_candidates = [x for x in parsers if mimetype in x.mimetype]

-    if mimetype is None or parser_candidates is None:
+    if mimetype is None or len(parser_candidates) == 0:
        parser_candidates = parsers

    # 3) Look for working ruleset for given parser
@@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
        # 'path' as they should have been caught beforehands
        try:
-            feed = parser(data)
+            feed = parser(data, encoding=encoding)

        except (ValueError):
            # parsing did not work
@@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None):
+    def __init__(self, data=None, rules=None, parent=None, encoding=None):
         if rules is None:
             rules = parse_rules()[self.default_ruleset]
@ -122,9 +122,10 @@ class ParserBase(object):
if data is None: if data is None:
data = rules['base'] data = rules['base']
self.root = self.parse(data)
self.parent = parent self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw): def parse(self, raw):
pass pass
@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
     mimetype = ['text/html', 'application/xhtml+xml']

     def parse(self, raw):
-        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
+        return html_parse(raw, encoding=self.encoding)

     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)

morss.py

@@ -10,7 +10,6 @@ import re
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup

 from . import feeds
 from . import crawler
@@ -251,19 +250,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2

     try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)

     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode

-    contenttype = con.info().get('Content-Type', '').split(';')[0]
     if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True

-    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)

     if out is not None:
         item.content = out
@@ -324,18 +321,14 @@ def FeedFetch(url, options):
         delay = 0

     try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-                     .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)

     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')

-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml)
+        rss = feeds.FeedHTML(xml, encoding=encoding)

         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -355,7 +348,7 @@ def FeedFetch(url, options):
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype)
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
             # contains all fields, otherwise much-needed data can be lost
@@ -469,7 +462,7 @@ def FeedFormat(rss, options, encoding='utf-8'):
     elif options.csv:
         return rss.tocsv(encoding=encoding)

-    elif options.reader:
+    elif options.html:
         if options.indent:
             return rss.tohtml(encoding=encoding, pretty_print=True)
@@ -547,7 +540,7 @@ def cgi_app(environ, start_response):
     if options.cors:
         headers['access-control-allow-origin'] = '*'

-    if options.html or options.reader:
+    if options.html:
         headers['content-type'] = 'text/html'

     elif options.txt or options.silent:
         headers['content-type'] = 'text/plain'
@@ -652,13 +645,10 @@ def cgi_page(environ, start_response):
     if urlparse(url).scheme not in ['http', 'https']:
         url = 'http://' + url

-    con = crawler.custom_handler().open(url)
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    data, con, contenttype, encoding = crawler.adv_get(url=url)

     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html = readabilite.parse(data, encoding=encoding)
         html.make_links_absolute(con.geturl())

         kill_tags = ['script', 'iframe', 'noscript']

readabilite.py

@@ -6,11 +6,14 @@ import re

 def parse(data, encoding=None):
     if encoding:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
+        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')

     else:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+        data = BeautifulSoup(data, 'lxml').prettify('utf-8')

-    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
+    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+
+    return lxml.html.fromstring(data, parser=parser)


 def count_words(string):
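In plain terms, parse() now lets BeautifulSoup do the decoding (honouring a caller-supplied encoding when there is one) and always hands lxml clean UTF-8 bytes. A minimal standalone sketch of that flow (illustrative only, not code from the repo):

    import lxml.html
    from bs4 import BeautifulSoup

    raw = '<p>café</p>'.encode('latin-1')   # page whose charset lxml alone could misread

    soup = BeautifulSoup(raw, 'lxml', from_encoding='latin-1')
    data = soup.prettify('utf-8')           # re-serialised as UTF-8 bytes

    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
    doc = lxml.html.fromstring(data, parser=parser)

    print(doc.text_content().strip())       # café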
@@ -44,6 +47,12 @@ def count_content(node):
     return count_words(node.text_content()) + len(node.findall('.//img'))


+def percentile(N, P):
+    # https://stackoverflow.com/a/7464107
+    n = max(int(round(P * len(N) + 0.5)), 2)
+    return N[n-2]
+
+
 class_bad = ['comment', 'community', 'extra', 'foot',
     'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
     'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
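Since get_article() (see the last hunk) calls this with the node scores already sorted from highest to lowest, percentile(scores, 0.1) picks a value near the top-10% boundary of the ranking, which then becomes the keep_threshold used during cleaning. A worked example with made-up numbers:

    def percentile(N, P):  # copied from the hunk above
        n = max(int(round(P * len(N) + 0.5)), 2)
        return N[n-2]

    scores = [120, 80, 64, 33, 20] + [1] * 30   # 35 ranked node scores, descending
    print(percentile(scores, 0.1))              # 0.1*35 + 0.5 = 4.0 -> n = 4 -> scores[2] == 64

Nodes scoring at least 64 would then survive clean_node() untouched.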
@@ -123,33 +132,42 @@ def score_node(node):
     return score


-def score_all(node, grades=None):
+def score_all(node):
     " Fairly dumb loop to score all worthwhile nodes. Tries to be fast "

-    if grades is None:
-        grades = {}
-
     for child in node:
         score = score_node(child)
         child.attrib['seen'] = 'yes, ' + str(int(score))

-        if score > 0 or not len(grades):
-            spread_score(child, score, grades)
-            score_all(child, grades)
-
-    return grades
+        if score > 0 or len(list(child.iterancestors())) <= 2:
+            spread_score(child, score)
+            score_all(child)


-def spread_score(node, score, grades):
+def set_score(node, value):
+    node.attrib['morss_score'] = str(float(value))
+
+
+def get_score(node):
+    return float(node.attrib.get('morss_score', 0))
+
+
+def incr_score(node, delta):
+    set_score(node, get_score(node) + delta)
+
+
+def get_all_scores(node):
+    return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
+
+
+def spread_score(node, score):
     " Spread the node's score to its parents, on a linear way "

     delta = score / 2

     for ancestor in [node,] + list(node.iterancestors()):
         if score >= 1 or ancestor is node:
-            try:
-                grades[ancestor] += score
-            except KeyError:
-                grades[ancestor] = score
+            incr_score(ancestor, score)

         score -= delta
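The grades dict that used to be threaded through score_all()/spread_score() is gone; scores now live on the nodes themselves as a morss_score attribute (per the commit message, probably a bit slower but cleaner). A tiny self-contained demo of the idea (illustrative only; helpers copied from the hunk above):

    import lxml.html

    def set_score(node, value):
        node.attrib['morss_score'] = str(float(value))

    def get_score(node):
        return float(node.attrib.get('morss_score', 0))

    def incr_score(node, delta):
        set_score(node, get_score(node) + delta)

    root = lxml.html.fromstring('<div><p>some text</p></div>')
    p = root.find('.//p')

    incr_score(p, 3)     # the score travels with the node...
    incr_score(p, 1.5)   # ...no external dict to pass around

    print(get_score(p))              # 4.5
    print(lxml.html.tostring(root))  # b'<div><p morss_score="4.5">some text</p></div>'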
@@ -157,26 +175,24 @@ def spread_score(node, score, grades):
             break


-def write_score_all(root, grades):
-    " Useful for debugging "
-
-    for node in root.iter():
-        node.attrib['score'] = str(int(grades.get(node, 0)))
-
-
-def clean_root(root):
+def clean_root(root, keep_threshold=None):
     for node in list(root):
-        clean_root(node)
-        clean_node(node)
+        # bottom-up approach, i.e. starting with children before cleaning current node
+        clean_root(node, keep_threshold)
+        clean_node(node, keep_threshold)


-def clean_node(node):
+def clean_node(node, keep_threshold=None):
     parent = node.getparent()

     if parent is None:
         # this is <html/> (or a removed element waiting for GC)
         return

+    if keep_threshold is not None and get_score(node) >= keep_threshold:
+        # high score, so keep
+        return
+
     gdparent = parent.getparent()

     # remove shitty tags
@@ -275,18 +291,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
     return nodeA # should always find one tho, at least <html/>, but needed for max_depth


-def rank_nodes(grades):
+def rank_grades(grades):
+    # largest score to smallest
     return sorted(grades.items(), key=lambda x: x[1], reverse=True)


-def get_best_node(grades):
+def get_best_node(ranked_grades):
     " To pick the best (raw) node. Another function will clean it "

-    if len(grades) == 1:
-        return grades[0]
+    if len(ranked_grades) == 1:
+        return ranked_grades[0]

-    top = rank_nodes(grades)
-    lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
+    lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)

     return lowest
@@ -295,12 +311,17 @@ def get_article(data, url=None, encoding=None):
     " Input a raw html string, returns a raw html string of the article "

     html = parse(data, encoding)
-    scores = score_all(html)
+    score_all(html)
+
+    scores = rank_grades(get_all_scores(html))

     if not len(scores):
         return None

     best = get_best_node(scores)
+
+    keep_threshold = percentile([x[1] for x in scores], 0.1)
+    clean_root(best, keep_threshold)

     wc = count_words(best.text_content())
     wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
@@ -310,6 +331,4 @@ def get_article(data, url=None, encoding=None):
     if url:
         best.make_links_absolute(url)

-    clean_root(best)
-
     return lxml.etree.tostring(best, pretty_print=True)
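Taken together, these commits replace the per-call-site open/read/detect dance with one flow: fetch through crawler.adv_get(), then hand the bytes plus the detected encoding to readabilite. A rough end-to-end sketch (the URL and the package-level imports are assumptions, not part of this diff):

    from morss import crawler, readabilite

    data, con, contenttype, encoding = crawler.adv_get(url='https://example.com/some-article')

    article = readabilite.get_article(data, url=con.geturl(), encoding=encoding)

    if article is not None:
        print(article.decode('utf-8'))   # scored, pruned article markup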