Compare commits

...

33 Commits

Author SHA1 Message Date
a82ec96eb7 Delete feedify.py leftover code
iTunes integration untested, unreliable and not working...
2020-04-05 22:16:52 +02:00
aad2398e69 feeds: turns out lxml.etree doesn't have drop_tag 2020-04-05 21:50:38 +02:00
eeac630855 crawler: add more "realistic" headers 2020-04-05 21:11:57 +02:00
e136b0feb2 readabilite: loosen the slayer
Previous impl. lead to too many empty results
2020-04-05 20:47:30 +02:00
6cf32af6c0 readabilite: also use BS 2020-04-05 20:46:42 +02:00
568e7d7dd2 feeds: make BS's output bytes for lxml's sake 2020-04-05 20:46:04 +02:00
3617f86e9d morss: make cgi_encore more robust 2020-04-05 16:43:11 +02:00
d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
bf86c1e962 crawler: make AutoUA match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic. Sends the same headers no matter what. Make requests more "replicable".
Also, drop "text/xml" from RSS contenttype, too broad, matches garbage
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to find how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping off ftp support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return sth
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer who-is-what
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to file handled by http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
8 changed files with 231 additions and 126 deletions

View File

@@ -48,6 +48,7 @@ You do need:
 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
@@ -76,7 +77,6 @@ The arguments are:
 - `json`: output as JSON
 - `proxy`: doesn't fill the articles
 - `clip`: stick the full article content under the original feed content (useful for twitter)
-- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doens't work), but with this argument, the description is kept
 - `search=STRING`: does a basic case-sensitive search in the feed
 - Advanced
 - `csv`: export to csv
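A minimal sketch of driving these options from Python rather than from the URL, assuming process() keeps the process(url, cache=None, options=None) signature visible further down in this diff and accepts the README options as plain dict keys (the feed URL is a placeholder, and the internal wrapping of the dict in Options is an assumption):

# Sketch only -- assumes the morss package is importable and that process()
# turns the given dict into an Options object internally.
from morss.morss import process

# Fetch the feed, fill in the full articles, and serialise the result as JSON.
output = process('http://example.com/feed.xml',
                 options={'json': True, 'clip': True})
print(output)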

View File

@@ -27,13 +27,14 @@ except NameError:
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
+    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
     'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
     # as per urllib2 source code, these Handelers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     handlers.append(HTTPEquivHandler())
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(DEFAULT_UA))
+    handlers.append(BrowserlyHeaderHandler())
-    if not basic:
-        handlers.append(AutoRefererHandler())
     handlers.append(EncodingFixHandler(encoding))
-    if accept:
-        handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+    if follow:
+        handlers.append(AlternateHandler(MIMETYPE[follow]))
     handlers.append(CacheHandler(force_min=delay))
@@ -196,45 +195,34 @@ class UAHandler(BaseHandler):
     https_request = http_request
-class AutoRefererHandler(BaseHandler):
+class BrowserlyHeaderHandler(BaseHandler):
+    """ Add more headers to look less suspicious """
     def http_request(self, req):
-        req.add_unredirected_header('Referer', 'http://%s' % req.host)
+        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
+        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
         return req
     https_request = http_request
-class ContentNegociationHandler(BaseHandler):
-    " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
+class AlternateHandler(BaseHandler):
+    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
-    def __init__(self, accept=None, strict=False):
-        self.accept = accept
-        self.strict = strict
+    def __init__(self, follow=None):
+        self.follow = follow or []
-    def http_request(self, req):
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-            string = ','.join(self.accept)
-            if self.strict:
-                string += ',*/*;q=0.9'
-            req.add_unredirected_header('Accept', string)
-        return req
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
             data = resp.read()
             links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
             for link in links:
-                if link.get('type', '') in self.accept:
+                if link.get('type', '') in self.follow:
                     resp.code = 302
                     resp.msg = 'Moved Temporarily'
                     resp.headers['location'] = link.get('href')
@@ -246,7 +234,6 @@ class ContentNegociationHandler(BaseHandler):
         return resp
-    https_request = http_request
     https_response = http_response
@@ -384,7 +371,7 @@ class CacheHandler(BaseHandler):
         elif self.force_min is None and ('no-cache' in cc_list
                                             or 'no-store' in cc_list
-                                            or ('private' in cc_list and not self.private)):
+                                            or ('private' in cc_list and not self.private_cache)):
             # kindly follow web servers indications, refresh
             return None
@@ -419,7 +406,7 @@ class CacheHandler(BaseHandler):
         cc_list = [x for x in cache_control if '=' not in x]
-        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
+        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
            # kindly follow web servers indications
            return resp
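A rough usage sketch for the reworked handler chain, based on the custom_handler(follow=None, delay=None, encoding=None) signature above (the URL is a placeholder; delay is simply forwarded to CacheHandler as force_min):

# Sketch only: relies on custom_handler() and AlternateHandler as shown above.
from morss import crawler

# Ask for a feed. If the server answers with an HTML page instead,
# AlternateHandler looks for <link rel="alternate" type="application/rss+xml">
# and rewrites the response into a 302 redirect towards it.
con = crawler.custom_handler(follow='rss', delay=60) \
             .open('http://example.com/', timeout=10)

print(con.geturl())   # final URL, after any alternate-link redirect
data = con.read()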

View File

@@ -1,28 +0,0 @@
-import re
-import json
-from . import crawler
-try:
-    basestring
-except NameError:
-    basestring = str
-def pre_worker(url):
-    if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
-        match = re.search('/id([0-9]+)(\?.*)?$', url)
-        if match:
-            iid = match.groups()[0]
-            redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
-            try:
-                con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
-                data = con.read()
-            except (IOError, HTTPException):
-                raise
-            return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
-    return None

View File

@@ -15,6 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 import lxml.html
+from bs4 import BeautifulSoup
 json.encoder.c_make_encoder = None
@@ -401,13 +402,14 @@ class ParserXML(ParserBase):
         else:
             if html_rich:
-                # atom stuff
-                if 'atom' in rule:
-                    match.attrib['type'] = 'xhtml'
                 self._clean_node(match)
                 match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
-                match.find('div').drop_tag()
+                if self.rules['mode'] == 'html':
+                    match.find('div').drop_tag() # not supported by lxml.etree
+                else: # i.e. if atom
+                    match.attrib['type'] = 'xhtml'
             else:
                 if match is not None and len(match):
@@ -441,7 +443,7 @@ class ParserHTML(ParserXML):
     def parse(self, raw):
         parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(raw, parser)
+        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)
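The BeautifulSoup round-trip is the key trick here: bs4 re-serialises even badly broken markup into something lxml parses without choking. A standalone illustration of the same idea (not morss code; the input snippet is made up):

# Standalone illustration of the BeautifulSoup -> lxml round-trip used above.
from bs4 import BeautifulSoup
import lxml.etree as etree

broken = '<html><body><p>unclosed <b>tag<p>another paragraph</body>'

# prettify('utf-8') returns bytes, which is what lxml's parser expects here.
cleaned = BeautifulSoup(broken, 'lxml').prettify('utf-8')

parser = etree.HTMLParser(remove_blank_text=True)
root = etree.fromstring(cleaned, parser)
print(etree.tostring(root, pretty_print=True).decode())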

View File

@@ -10,27 +10,28 @@ import re
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 from . import feeds
-from . import feedify
 from . import crawler
 from . import readabilite
 import wsgiref.simple_server
 import wsgiref.handlers
+import cgitb
 try:
     # python 2
     from Queue import Queue
     from httplib import HTTPException
-    from urllib import quote_plus
+    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from queue import Queue
     from http.client import HTTPException
-    from urllib.parse import quote_plus
+    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs
 LIM_ITEM = 100 # deletes what's beyond
@@ -44,7 +45,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080
-PROTOCOL = ['http', 'https', 'ftp']
+PROTOCOL = ['http', 'https']
 def filterOptions(options):
@@ -52,7 +53,7 @@ def filterOptions(options):
     # example of filtering code below
-    #allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
+    #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
     #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
     #return filtered
@@ -66,6 +67,7 @@ def log(txt, force=False):
     if DEBUG or force:
         if 'REQUEST_URI' in os.environ:
             open('morss.log', 'a').write("%s\n" % repr(txt))
         else:
             print(repr(txt))
@@ -73,6 +75,7 @@ def log(txt, force=False):
 def len_html(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content())
     else:
         return 0
@@ -80,6 +83,7 @@ def len_html(txt):
 def count_words(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content().split())
     return 0
@@ -88,12 +92,14 @@ class Options:
         if len(args):
             self.options = args
             self.options.update(options or {})
         else:
             self.options = options or {}
     def __getattr__(self, key):
         if key in self.options:
             return self.options[key]
         else:
             return False
@@ -107,17 +113,23 @@ class Options:
 def parseOptions(options):
     """ Turns ['md=True'] into {'md':True} """
     out = {}
     for option in options:
         split = option.split('=', 1)
         if len(split) > 1:
             if split[0].lower() == 'true':
                 out[split[0]] = True
             elif split[0].lower() == 'false':
                 out[split[0]] = False
             else:
                 out[split[0]] = split[1]
         else:
             out[split[0]] = True
     return out
@@ -158,6 +170,11 @@ def ItemFix(item, feedurl='/'):
         item.link = parse_qs(urlparse(item.link).query)['url'][0]
         log(item.link)
+    # pocket
+    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
+        item.link = parse_qs(urlparse(item.link).query)['url'][0]
+        log(item.link)
     # facebook
     if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
         item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -208,6 +225,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match):
             link = match[0]
             log(link)
         else:
             link = None
@@ -217,6 +235,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
             link = match[0]
             log(link)
         else:
             link = None
@@ -232,7 +251,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
     try:
-        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()
     except (IOError, HTTPException) as e:
@@ -268,9 +287,6 @@ def ItemAfter(item, options):
             item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
             del item.desc
-    if not options.keep and not options.proxy:
-        del item.desc
     if options.nolink and item.content:
         content = lxml.html.fromstring(item.content)
         for link in content.xpath('//a'):
@@ -284,27 +300,23 @@ def ItemAfter(item, options):
     return item
-def FeedFetch(url, options):
+def UrlFix(url):
+    # basic url clean-up
     if url is None:
         raise MorssException('No url provided')
+    if isinstance(url, bytes):
+        url = url.decode()
     if urlparse(url).scheme not in PROTOCOL:
         url = 'http://' + url
         log(url)
     url = url.replace(' ', '%20')
-    if isinstance(url, bytes):
-        url = url.decode()
-    # allow for code execution for feedify
-    pre = feedify.pre_worker(url)
-    if pre:
-        url = pre
-        log('url redirect')
-        log(url)
+    return url
+def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -312,8 +324,7 @@ def FeedFetch(url, options):
         delay = 0
     try:
-        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
-            encoding=options.encoding, basic=not options.items) \
+        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
                 .open(url, timeout=TIMEOUT * 2)
         xml = con.read()
@@ -324,20 +335,24 @@ def FeedFetch(url, options):
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, url, contenttype)
-        feed.rule
+        rss = feeds.FeedHTML(xml)
+        rss.rules['title'] = options.title if options.title else '//head/title'
+        rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
         rss.rules['items'] = options.items
-        if options.item_title:
-            rss.rules['item_title'] = options.item_title
-        if options.item_link:
-            rss.rules['item_link'] = options.item_link
+        rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
+        rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
         if options.item_content:
             rss.rules['item_content'] = options.item_content
         if options.item_time:
             rss.rules['item_time'] = options.item_time
+        rss = rss.convert(feeds.FeedXML)
     else:
         try:
             rss = feeds.parse(xml, url, contenttype)
@@ -375,6 +390,7 @@ def FeedGather(rss, url, options):
             value = queue.get()
             try:
                 worker(*value)
             except Exception as e:
                 log('Thread Error: %s' % e.message)
             queue.task_done()
@@ -414,6 +430,7 @@ def FeedGather(rss, url, options):
     for i, item in enumerate(list(rss.items)):
         if threads == 1:
             worker(*[i, item])
         else:
             queue.put([i, item])
@@ -433,37 +450,38 @@ def FeedGather(rss, url, options):
     return rss
-def FeedFormat(rss, options):
+def FeedFormat(rss, options, encoding='utf-8'):
     if options.callback:
         if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
-            return '%s(%s)' % (options.callback, rss.tojson())
+            out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
+            return out if encoding == 'unicode' else out.encode(encoding)
         else:
             raise MorssException('Invalid callback var name')
     elif options.json:
         if options.indent:
-            return rss.tojson(encoding='UTF-8', indent=4)
+            return rss.tojson(encoding=encoding, indent=4)
         else:
-            return rss.tojson(encoding='UTF-8')
     elif options.csv:
-        return rss.tocsv(encoding='UTF-8')
+        return rss.tocsv(encoding=encoding)
     elif options.reader:
         if options.indent:
-            return rss.tohtml(encoding='UTF-8', pretty_print=True)
+            return rss.tohtml(encoding=encoding, pretty_print=True)
         else:
-            return rss.tohtml(encoding='UTF-8')
+            return rss.tohtml(encoding=encoding)
     else:
         if options.indent:
-            return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
+            return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)
         else:
-            return rss.torss(xml_declaration=True, encoding='UTF-8')
+            return rss.torss(xml_declaration=True, encoding=encoding)
 def process(url, cache=None, options=None):
@@ -475,14 +493,16 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     return FeedFormat(rss, options)
-def cgi_app(environ, start_response):
+def cgi_parse_environ(environ):
     # get options
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
@@ -496,7 +516,7 @@ def cgi_app(environ, start_response):
     if url.startswith(':'):
         split = url.split('/', 1)
-        options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
         if len(split) > 1:
             url = split[1]
@@ -504,15 +524,22 @@ def cgi_app(environ, start_response):
             url = ''
     else:
-        options = []
+        raw_options = []
     # init
-    options = Options(filterOptions(parseOptions(options)))
-    headers = {}
+    options = Options(filterOptions(parseOptions(raw_options)))
     global DEBUG
     DEBUG = options.debug
+    return (url, options)
+def cgi_app(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+    headers = {}
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
@@ -537,6 +564,7 @@ def cgi_app(environ, start_response):
         crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
     # get the work done
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     if headers['content-type'] == 'text/xml':
@@ -547,18 +575,42 @@ def cgi_app(environ, start_response):
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)
-    if not options.silent:
-        return out
+    if options.silent:
+        return ['']
+    else:
+        return [out]
-def cgi_wrapper(environ, start_response):
-    # simple http server for html and css
+def middleware(func):
+    " Decorator to turn a function into a wsgi middleware "
+    # This is called when parsing the code
+    def app_builder(app):
+        # This is called when doing app = cgi_wrapper(app)
+        def app_wrap(environ, start_response):
+            # This is called when a http request is being processed
+            return func(environ, start_response, app)
+        return app_wrap
+    return app_builder
+@middleware
+def cgi_file_handler(environ, start_response, app):
+    " Simple HTTP server to serve static files (.html, .css, etc.) "
     files = {
         '': 'text/html',
-        'index.html': 'text/html'}
+        'index.html': 'text/html',
+        'sheet.xsl': 'text/xsl'}
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
         url = environ['PATH_INFO'][1:]
@@ -587,16 +639,80 @@ def cgi_wrapper(environ, start_response):
         start_response(headers['status'], list(headers.items()))
         return ['Error %s' % headers['status']]
-    # actual morss use
+    else:
+        return app(environ, start_response)
+def cgi_page(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+    # get page
+    PROTOCOL = ['http', 'https']
+    if urlparse(url).scheme not in ['http', 'https']:
+        url = 'http://' + url
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html.make_links_absolute(con.geturl())
+        kill_tags = ['script', 'iframe', 'noscript']
+        for tag in kill_tags:
+            for elem in html.xpath('//'+tag):
+                elem.getparent().remove(elem)
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+    else:
+        output = None
+    # return html page
+    headers = {'status': '200 OK', 'content-type': 'text/html'}
+    start_response(headers['status'], list(headers.items()))
+    return [output]
+dispatch_table = {
+    'getpage': cgi_page
+    }
+@middleware
+def cgi_dispatcher(environ, start_response, app):
+    url, options = cgi_parse_environ(environ)
+    for key in dispatch_table.keys():
+        if key in options:
+            return dispatch_table[key](environ, start_response)
+    return app(environ, start_response)
+@middleware
+def cgi_error_handler(environ, start_response, app):
     try:
-        return [cgi_app(environ, start_response) or '(empty)']
+        return app(environ, start_response)
     except (KeyboardInterrupt, SystemExit):
         raise
     except Exception as e:
-        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
+        headers = {'status': '500 Oops', 'content-type': 'text/html'}
         start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR <%s>: %s' % (url, e.message), force=True)
-        return ['An error happened:\n%s' % e.message]
+        log('ERROR: %s' % repr(e), force=True)
+        return [cgitb.html(sys.exc_info())]
+@middleware
+def cgi_encode(environ, start_response, app):
+    out = app(environ, start_response)
+    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
 def cli_app():
@@ -608,6 +724,7 @@ def cli_app():
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)
@@ -622,6 +739,7 @@ def isInt(string):
     try:
         int(string)
         return True
     except ValueError:
         return False
@@ -629,7 +747,13 @@ def isInt(string):
 def main():
     if 'REQUEST_URI' in os.environ:
         # mod_cgi
-        wsgiref.handlers.CGIHandler().run(cgi_wrapper)
+        app = cgi_app
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+        wsgiref.handlers.CGIHandler().run(app)
     elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
         # start internal (basic) http server
@@ -638,22 +762,31 @@ def main():
             argPort = int(sys.argv[1])
             if argPort > 0:
                 port = argPort
             else:
                 raise MorssException('Port must be positive integer')
         else:
             port = PORT
-        print('Serving http://localhost:%s/'%port)
-        httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
+        app = cgi_app
+        app = cgi_file_handler(app)
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+        print('Serving http://localhost:%s/' % port)
+        httpd = wsgiref.simple_server.make_server('', port, app)
         httpd.serve_forever()
     else:
         # as a CLI app
        try:
            cli_app()
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            print('ERROR: %s' % e.message)
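The middleware decorator added above is just a closure factory: decorating func(environ, start_response, app) yields a builder that takes the next app and returns a plain WSGI callable, which is why main() can stack cgi_dispatcher, cgi_error_handler and cgi_encode with simple reassignments. A minimal self-contained sketch of the same pattern (hello_app and lowercase_middleware are made-up names for illustration, not morss code):

# Minimal illustration of the @middleware pattern introduced in this diff.
def middleware(func):
    def app_builder(app):
        def app_wrap(environ, start_response):
            return func(environ, start_response, app)
        return app_wrap
    return app_builder

@middleware
def lowercase_middleware(environ, start_response, app):
    # post-process the wrapped app's output
    return [chunk.lower() for chunk in app(environ, start_response)]

def hello_app(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [b'Hello WSGI']

# Same composition style as morss's main(): wrap, then serve.
app = lowercase_middleware(hello_app)

if __name__ == '__main__':
    import wsgiref.simple_server
    wsgiref.simple_server.make_server('', 8000, app).serve_forever()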

View File

@@ -1,5 +1,6 @@
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 import re
@@ -9,7 +10,7 @@ def parse(data, encoding=None):
     else:
         parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
-    return lxml.html.fromstring(data, parser=parser)
+    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
 def count_words(string):
@@ -62,7 +63,7 @@ regex_good = re.compile('|'.join(class_good), re.I)
 tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
     'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
-    'button', 'footer']
+    'button', 'footer', 'link', 'meta']
 tags_bad = tags_junk + ['a', 'aside']
@@ -93,10 +94,18 @@ def score_node(node):
     class_id = node.get('class', '') + node.get('id', '')
     if (isinstance(node, lxml.html.HtmlComment)
-            or node.tag in tags_bad
-            or regex_bad.search(class_id)):
+            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
         return 0
+    if node.tag in tags_junk:
+        score += -1 # actuall -2 as tags_junk is included tags_bad
+    if node.tag in tags_bad:
+        score += -1
+    if regex_bad.search(class_id):
+        score += -1
     if node.tag in tags_good:
         score += 4
@@ -124,7 +133,7 @@ def score_all(node, grades=None):
         score = score_node(child)
         child.attrib['seen'] = 'yes, ' + str(int(score))
-        if score > 0:
+        if score > 0 or not len(grades):
             spread_score(child, score, grades)
             score_all(child, grades)
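The new HtmlProcessingInstruction check matters because lxml's HTML parser keeps leftovers such as <?php ... ?> blocks in the tree as element-like nodes, so without the guard they would reach the scoring code. A quick standalone check of that behaviour (not morss code; the snippet is made up):

# Standalone check: processing instructions survive lxml.html parsing as
# element-like nodes, which is why score_node() now filters them out.
import lxml.html

doc = lxml.html.fromstring('<div><p>text</p><?php echo "leftover"; ?></div>')

for node in doc.iter():
    print(type(node).__name__, getattr(node, 'tag', None))
    if isinstance(node, lxml.html.HtmlProcessingInstruction):
        print('-> would be skipped by the new check')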

View File

@@ -1,4 +1,5 @@
 lxml
+bs4
 python-dateutil <= 1.5
 chardet
 pymysql

View File

@@ -13,6 +13,7 @@
 body {
     overflow-wrap: anywhere;
     word-wrap: anywhere;
+    font-family: sans;
 }
 #url {