feeds: properly use html template

feeds: try all parsers regardless of contenttype
It turns out that some websites send the wrong content type (JSON for HTML, HTML for XML, etc.).
2020-04-09 20:00:51 +02:00 · 2020-04-09 19:17:51 +02:00 · 2020-04-09 19:10:45 +02:00 · 2020-04-09 19:09:10 +02:00 · 2020-04-09 19:08:13 +02:00 · 2020-04-09 19:06:51 +02:00
5 changed files with 66 additions and 62 deletions
--- a/morss/feedify.ini
+++ b/morss/feedify.ini
@@ -99,7 +99,7 @@ item_link = ./a/@href
 item_desc = ./div[class=desc]
 item_content = ./div[class=content]

-base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
+base = file:reader.html.template

 [twitter]
 mode = html
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -47,7 +47,11 @@ def parse_rules(filename=None):

    for section in rules.keys():
        for arg in rules[section].keys():
-            if '\n' in rules[section][arg]:
+            if rules[section][arg].startswith('file:'):
+                import_file = os.path.join(os.path.dirname(__file__), rules[section][arg][5:])
+                rules[section][arg] = open(import_file).read()
+
+            elif '\n' in rules[section][arg]:
                rules[section][arg] = rules[section][arg].split('\n')[1:]

    return rules
@@ -69,19 +73,13 @@ def parse(data, url=None, mimetype=None, encoding=None):
                        parser = [x for x in parsers if x.mode == ruleset['mode']][0]
                        return parser(data, ruleset, encoding=encoding) 

-    # 2) Look for a parser based on mimetype
-
-    if mimetype is not None:
-        parser_candidates = [x for x in parsers if mimetype in x.mimetype]
-
-    if mimetype is None or len(parser_candidates) == 0:
-        parser_candidates = parsers
+    # 2) Try each and every parser

    # 3) Look for working ruleset for given parser
        # 3a) See if parsing works
        # 3b) See if .items matches anything

-    for parser in parser_candidates:
+    for parser in parsers:
        ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
            # 'path' as they should have been caught beforehands

@@ -150,15 +148,15 @@ class ParserBase(object):
        c = csv.writer(out, dialect=csv.excel)

        for item in self.items:
-            row = [getattr(item, x) for x in item.dic]
-
-            if encoding != 'unicode':
-                row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
-
-            c.writerow(row)
+            c.writerow([getattr(item, x) for x in item.dic])

        out.seek(0)
-        return out.read()
+        out = out.read()
+
+        if encoding != 'unicode':
+            out = out.encode(encoding)
+
+        return out

    def tohtml(self, **k):
        return self.convert(FeedHTML).tostring(**k)
@@ -269,7 +267,14 @@ class ParserBase(object):

        except AttributeError:
            # does not exist, have to create it
+            try:
                self.rule_create(self.rules[rule_name])
+
+            except AttributeError:
+                # no way to create it, give up
+                pass
+
+            else:
                self.rule_set(self.rules[rule_name], value)

    def rmv(self, rule_name):
@@ -469,6 +474,9 @@ class ParserHTML(ParserXML):
            element = deepcopy(match)
            match.getparent().append(element)

+        else:
+            raise AttributeError('no way to create item')
+

 def parse_time(value):
    if value is None or value == 0:
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -471,10 +471,10 @@ def FeedFormat(rss, options, encoding='utf-8'):

    else:
        if options.indent:
-            return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)
+            return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)

        else:
-            return rss.torss(xml_declaration=True, encoding=encoding)
+            return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)


 def process(url, cache=None, options=None):
@@ -554,6 +554,8 @@ def cgi_app(environ, start_response):
    else:
        headers['content-type'] = 'text/xml'

+    headers['content-type'] += '; charset=utf-8'
+
    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

    # get the work done
@@ -636,7 +638,7 @@ def cgi_file_handler(environ, start_response, app):
        return app(environ, start_response)


-def cgi_page(environ, start_response):
+def cgi_get(environ, start_response):
    url, options = cgi_parse_environ(environ)

    # get page
@@ -648,6 +650,7 @@ def cgi_page(environ, start_response):
    data, con, contenttype, encoding = crawler.adv_get(url=url)

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        if options.get == 'page':
            html = readabilite.parse(data, encoding=encoding)
            html.make_links_absolute(con.geturl())

@@ -659,17 +662,23 @@ def cgi_page(environ, start_response):

            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

+        elif options.get == 'article':
+            output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
+
        else:
-        output = None
+            raise MorssException('no :get option passed')
+
+    else:
+        output = data

    # return html page
-    headers = {'status': '200 OK', 'content-type': 'text/html'}
+    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
    start_response(headers['status'], list(headers.items()))
    return [output]


 dispatch_table = {
-    'getpage': cgi_page
+    'get': cgi_get,
    }


@@ -717,10 +726,10 @@ def cli_app():
    url = UrlFix(url)
    rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
-    out = FeedFormat(rss, options)
+    out = FeedFormat(rss, options, 'unicode')

    if not options.silent:
-        print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
+        print(out)

    log('done')

--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -307,7 +307,7 @@ def get_best_node(ranked_grades):
    return lowest


-def get_article(data, url=None, encoding=None):
+def get_article(data, url=None, encoding=None, debug=False):
    " Input a raw html string, returns a raw html string of the article "

    html = parse(data, encoding)
@@ -319,16 +319,17 @@ def get_article(data, url=None, encoding=None):

    best = get_best_node(scores)

+    if not debug:
        keep_threshold = percentile([x[1] for x in scores], 0.1)
        clean_root(best, keep_threshold)

    wc = count_words(best.text_content())
    wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))

-    if wc - wca < 50 or float(wca) / wc > 0.3:
+    if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
        return None

    if url:
        best.make_links_absolute(url)

-    return lxml.etree.tostring(best, pretty_print=True)
+    return lxml.etree.tostring(best if not debug else html, pretty_print=True)
--- a/morss/reader.html.template
+++ b/morss/reader.html.template
@@ -1,11 +1,9 @@
-@require(feed)
 <!DOCTYPE html>
 <html>
 	<head>
-		<title>@feed.title &#8211; via morss</title>
-		<meta charset="UTF-8" />
-		<meta name="description" content="@feed.desc (via morss)" />
+		<title>Feed reader by morss</title>
 		<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
+		<meta name="robots" content="noindex" />

 		<style type="text/css">
 			/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
@@ -32,7 +30,7 @@
 				padding-right: 20px; /* column-space */
 			}

-			@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
+			@media handheld, only screen and (max-width: 767px) {
 				#content {
 					width: 100%;
 					min-width: 0;
@@ -82,6 +80,7 @@

 			#content {
 				text-align: justify;
+				line-height: 1.5em;
 			}

 				.item .title {
@@ -171,31 +170,18 @@

 	<body>
 		<div id="header">
-			<h1>@feed.title</h1>
-			@if feed.desc:
-				<h2>@feed.desc</h2>
-			@end
+			<h1>RSS feed</h1>
+			<h2>with full text articles</h2>
 			<p>- via morss</p>
 		</div>

 		<div id="content">
-			@for item in feed.items:
 				<div class="item">
-					@if item.link:
-						<a class="title link" href="@item.link" target="_blank">@item.title</a>
-					@else:
-						<span class="title">@item.title</span>
-					@end
-					<div class="article">
-						@if item.content:
-							@item.content
-						@else:
-							@item.desc
-						@end
+					<a class="title link" href="@item.link" target="_blank"></a>
+					<div class="desc"></div>
+					<div class="content"></div>
 				</div>
 		</div>
-			@end
-		</div>

 	<script>
 		var items = document.getElementsByClassName('item')
Author	SHA1	Message	Date
pictuga	9e7b9d95ee	feeds: properly use html template	2020-04-09 20:00:51 +02:00
pictuga	987a719c4e	feeds: try all parsers regardless of contenttype. Turns out some websites send the wrong contenttype (json for html, html for xml, etc.)	2020-04-09 19:17:51 +02:00
pictuga	47b33f4baa	morss: specify server output encoding	2020-04-09 19:10:45 +02:00
pictuga	3c7f512583	feeds: handle several errors	2020-04-09 19:09:10 +02:00
pictuga	a32f5a8536	readabilite: add debug option (also used by :get)	2020-04-09 19:08:13 +02:00
pictuga	63a06524b7	morss: various encoding fixes	2020-04-09 19:06:51 +02:00
pictuga	b0f80c6d3c	morss: fix csv output encoding	2020-04-09 19:05:50 +02:00
pictuga	78cea10ead	morss: replace :getpage with :get. Also provides readabilite debugging	2020-04-09 18:43:20 +02:00