Compare commits


23 Commits

SHA1 Message Date
bf86c1e962 crawler: make AutoReferer match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic: sends the same headers no matter what, making requests more "replicable".
Also drop "text/xml" from the RSS content types; it is too broad and matches garbage.
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to figure out how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping FTP support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return something
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer which is which
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to files handled by the http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
6 changed files with 204 additions and 73 deletions

README.md

@@ -48,6 +48,7 @@ You do need:
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet

morss/crawler.py

@@ -27,13 +27,14 @@ except NameError:
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
# as per urllib2 source code, these Handlers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA))
if not basic:
handlers.append(AutoRefererHandler())
handlers.append(EncodingFixHandler(encoding))
if accept:
handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
if follow:
handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(force_min=delay))
@@ -198,43 +197,28 @@ class UAHandler(BaseHandler):
class AutoRefererHandler(BaseHandler):
def http_request(self, req):
req.add_unredirected_header('Referer', 'http://%s' % req.host)
req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
return req
https_request = http_request
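
The practical effect (illustrative): a request to https://example.com/page used to be sent with the hard-coded "Referer: http://example.com"; since req.type carries the request's own scheme, it now gets "Referer: https://example.com" instead.
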
class ContentNegociationHandler(BaseHandler):
" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
class AlternateHandler(BaseHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, accept=None, strict=False):
self.accept = accept
self.strict = strict
def http_request(self, req):
if self.accept is not None:
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
string = ','.join(self.accept)
if self.strict:
string += ',*/*;q=0.9'
req.add_unredirected_header('Accept', string)
return req
def __init__(self, follow=None):
self.follow = follow or []
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# oops, not what we were looking for; let's see if the html page suggests an alternative page of the right type
data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.accept:
if link.get('type', '') in self.follow:
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
@@ -246,7 +230,6 @@ class ContentNegociationHandler(BaseHandler):
return resp
https_request = http_request
https_response = http_response
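
In practice the new handler rewrites a qualifying response into a plain 302, so the opener's ordinary redirect machinery does the rest. A minimal usage sketch (assuming the package is importable as morss and that the target page advertises a feed):

    from morss import crawler

    # follow='rss' stacks AlternateHandler(MIMETYPE['rss']) into the opener,
    # so fetching an html page redirects to its advertised rss/atom feed, if any
    con = crawler.custom_handler(follow='rss').open('https://example.com/blog')
    print(con.geturl())                    # the feed url, after the synthetic 302
    print(con.info().get('Content-Type'))  # e.g. application/rss+xml
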

morss/morss.py

@@ -10,6 +10,7 @@ import re
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
from . import feeds
from . import feedify
@@ -18,19 +19,20 @@ from . import readabilite
import wsgiref.simple_server
import wsgiref.handlers
import cgitb
try:
# python 2
from Queue import Queue
from httplib import HTTPException
from urllib import quote_plus
from urllib import unquote
from urlparse import urlparse, urljoin, parse_qs
except ImportError:
# python 3
from queue import Queue
from http.client import HTTPException
from urllib.parse import quote_plus
from urllib.parse import unquote
from urllib.parse import urlparse, urljoin, parse_qs
LIM_ITEM = 100 # deletes what's beyond
@@ -44,7 +46,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
PROTOCOL = ['http', 'https', 'ftp']
PROTOCOL = ['http', 'https']
def filterOptions(options):
@@ -66,6 +68,7 @@ def log(txt, force=False):
if DEBUG or force:
if 'REQUEST_URI' in os.environ:
open('morss.log', 'a').write("%s\n" % repr(txt))
else:
print(repr(txt))
@@ -73,6 +76,7 @@ def log(txt, force=False):
def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
@@ -80,6 +84,7 @@ def len_html(txt):
def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
return 0
@@ -88,12 +93,14 @@ class Options:
if len(args):
self.options = args
self.options.update(options or {})
else:
self.options = options or {}
def __getattr__(self, key):
if key in self.options:
return self.options[key]
else:
return False
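
Since __getattr__ falls back to False, any option that was never passed simply reads as False: code can test options.csv, options.indent or options.items directly, with no guard for missing keys.
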
@@ -107,17 +114,23 @@ class Options:
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
if split[1].lower() == 'true':
out[split[0]] = True
elif split[1].lower() == 'false':
out[split[0]] = False
else:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
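
For instance (an illustrative interactive session, using the function as defined above):

    >>> parseOptions(['json', 'indent', 'items=//article'])
    {'json': True, 'indent': True, 'items': '//article'}
    >>> parseOptions(['md=true'])
    {'md': True}
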
@@ -158,6 +171,11 @@ def ItemFix(item, feedurl='/'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# pocket
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -208,6 +226,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
if len(match):
link = match[0]
log(link)
else:
link = None
@@ -217,6 +236,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
@@ -232,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = -2
try:
con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
except (IOError, HTTPException) as e:
@@ -284,24 +304,27 @@ def ItemAfter(item, options):
return item
def FeedFetch(url, options):
# basic url clean-up
def UrlFix(url):
if url is None:
raise MorssException('No url provided')
if isinstance(url, bytes):
url = url.decode()
if urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
if isinstance(url, bytes):
url = url.decode()
return url
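
A quick sketch of what the helper normalizes (illustrative values):

    >>> UrlFix(b'example.com/my feed')     # bytes decoded, scheme added, spaces escaped
    'http://example.com/my%20feed'
    >>> UrlFix('https://example.com/rss')  # already clean, returned untouched
    'https://example.com/rss'
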
def FeedFetch(url, options):
# allow for code execution for feedify
pre = feedify.pre_worker(url)
if pre:
url = pre
url = UrlFix(pre)
log('url redirect')
log(url)
@@ -312,8 +335,7 @@ def FeedFetch(url, options):
delay = 0
try:
con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
encoding=options.encoding, basic=not options.items) \
con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
@@ -324,20 +346,24 @@ def FeedFetch(url, options):
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml, url, contenttype)
feed.rule
rss = feeds.FeedHTML(xml)
rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
rss.rules['items'] = options.items
if options.item_title:
rss.rules['item_title'] = options.item_title
if options.item_link:
rss.rules['item_link'] = options.item_link
rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
if options.item_content:
rss.rules['item_content'] = options.item_content
if options.item_time:
rss.rules['item_time'] = options.item_time
rss = rss.convert(feeds.FeedXML)
else:
try:
rss = feeds.parse(xml, url, contenttype)
@@ -375,6 +401,7 @@ def FeedGather(rss, url, options):
value = queue.get()
try:
worker(*value)
except Exception as e:
log('Thread Error: %s' % e.message)
queue.task_done()
@@ -414,6 +441,7 @@ def FeedGather(rss, url, options):
for i, item in enumerate(list(rss.items)):
if threads == 1:
worker(*[i, item])
else:
queue.put([i, item])
@@ -433,37 +461,38 @@ def FeedGather(rss, url, options):
return rss
def FeedFormat(rss, options):
def FeedFormat(rss, options, encoding='utf-8'):
if options.callback:
if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
return '%s(%s)' % (options.callback, rss.tojson())
out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
return out if encoding == 'unicode' else out.encode(encoding)
else:
raise MorssException('Invalid callback var name')
elif options.json:
if options.indent:
return rss.tojson(encoding='UTF-8', indent=4)
return rss.tojson(encoding=encoding, indent=4)
else:
return rss.tojson(encoding='UTF-8')
return rss.tojson(encoding=encoding)
elif options.csv:
return rss.tocsv(encoding='UTF-8')
return rss.tocsv(encoding=encoding)
elif options.reader:
if options.indent:
return rss.tohtml(encoding='UTF-8', pretty_print=True)
return rss.tohtml(encoding=encoding, pretty_print=True)
else:
return rss.tohtml(encoding='UTF-8')
return rss.tohtml(encoding=encoding)
else:
if options.indent:
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)
else:
return rss.torss(xml_declaration=True, encoding='UTF-8')
return rss.torss(xml_declaration=True, encoding=encoding)
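
The effect of the new parameter, sketched (the 'unicode' value only appears on the callback path in this diff, so treating it as valid for the other branches is an assumption about the underlying lxml-style serializers):

    out = FeedFormat(rss, options)                      # utf-8 bytes, the old behavior
    out = FeedFormat(rss, options, encoding='unicode')  # native str, left for cgi_encode to encode
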
def process(url, cache=None, options=None):
@@ -475,14 +504,16 @@ def process(url, cache=None, options=None):
if cache:
crawler.default_cache = crawler.SQLiteCache(cache)
url = UrlFix(url)
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
return FeedFormat(rss, options)
def cgi_app(environ, start_response):
def cgi_parse_environ(environ):
# get options
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
@@ -496,7 +527,7 @@ def cgi_app(environ, start_response):
if url.startswith(':'):
split = url.split('/', 1)
options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
@@ -504,15 +535,22 @@ def cgi_app(environ, start_response):
url = ''
else:
options = []
raw_options = []
# init
options = Options(filterOptions(parseOptions(options)))
headers = {}
options = Options(filterOptions(parseOptions(raw_options)))
global DEBUG
DEBUG = options.debug
return (url, options)
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
@@ -537,6 +575,7 @@ def cgi_app(environ, start_response):
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done
url = UrlFix(url)
rss = FeedFetch(url, options)
if headers['content-type'] == 'text/xml':
@@ -547,18 +586,42 @@ def cgi_app(environ, start_response):
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
return out
if options.silent:
return ['']
else:
return [out]
def cgi_wrapper(environ, start_response):
# simple http server for html and css
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
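
Spelled out with the pieces defined later in this changeset (a restatement of what main() builds, not new behavior): each call to a decorated function wraps the previous app in one more wsgi callable, so the composition reads inside-out.

    app = cgi_app                  # innermost: the actual feed pipeline
    app = cgi_dispatcher(app)      # may short-circuit to cgi_page
    app = cgi_error_handler(app)   # catches exceptions from everything it wraps
    app = cgi_encode(app)          # outermost: guarantees the body is bytes
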
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
files = {
'': 'text/html',
'index.html': 'text/html'}
'index.html': 'text/html',
'sheet.xsl': 'text/xsl'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
@@ -587,16 +650,80 @@ def cgi_wrapper(environ, start_response):
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
# actual morss use
else:
return app(environ, start_response)
def cgi_page(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
PROTOCOL = ['http', 'https']
if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url
con = crawler.custom_handler().open(url)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
html.make_links_absolute(con.geturl())
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
else:
output = None
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'getpage': cgi_page
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
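
With the dispatcher mounted, the new proxy endpoint is reached through the same option syntax that cgi_parse_environ strips from the path, presumably along the lines of:

    http://localhost:8080/:getpage/http://example.com/some/article

cgi_parse_environ yields 'getpage' among the options, cgi_dispatcher finds it in dispatch_table and hands the request to cgi_page instead of the feed pipeline.
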
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return [cgi_app(environ, start_response) or '(empty)']
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR <%s>: %s' % (url, e.message), force=True)
return ['An error happened:\n%s' % e.message]
log('ERROR: %s' % repr(e), force=True)
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else x.encode('utf-8') for x in out]
def cli_app():
@@ -608,6 +735,7 @@ def cli_app():
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
url = UrlFix(url)
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
@@ -622,6 +750,7 @@ def isInt(string):
try:
int(string)
return True
except ValueError:
return False
@@ -629,7 +758,13 @@ def isInt(string):
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
# start internal (basic) http server
@@ -638,22 +773,31 @@ def main():
argPort = int(sys.argv[1])
if argPort > 0:
port = argPort
else:
raise MorssException('Port must be positive integer')
else:
port = PORT
app = cgi_app
app = cgi_file_handler(app)
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
print('Serving http://localhost:%s/' % port)
httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
httpd = wsgiref.simple_server.make_server('', port, app)
httpd.serve_forever()
else:
# as a CLI app
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)

morss/readabilite.py

@@ -93,6 +93,7 @@ def score_node(node):
class_id = node.get('class', '') + node.get('id', '')
if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)
or node.tag in tags_bad
or regex_bad.search(class_id)):
return 0

requirements.txt

@@ -1,4 +1,5 @@
lxml
bs4
python-dateutil <= 1.5
chardet
pymysql

sheet.xsl

@@ -13,6 +13,7 @@
body {
overflow-wrap: anywhere;
word-wrap: anywhere;
font-family: sans;
}
#url {