Compare commits
bf86c1e962 ... d90756b337 (3 commits)

Author | SHA1 | Date
---|---|---
pictuga | d90756b337 |
pictuga | 40c69f17d2 |
pictuga | 99461ea185 |
@@ -77,7 +77,6 @@ The arguments are:
 - `json`: output as JSON
 - `proxy`: doesn't fill the articles
 - `clip`: stick the full article content under the original feed content (useful for twitter)
-- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doens't work), but with this argument, the description is kept
 - `search=STRING`: does a basic case-sensitive search in the feed
 - Advanced
 - `csv`: export to csv

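For context, these flag-style arguments surface as boolean options inside the pipeline. The snippet below is only a hedged stand-in (morss's real options object is not part of this diff); it illustrates the intended effect of `clip`, reusing the separator string that appears verbatim in the `ItemAfter` hunk further down.

```python
# Hypothetical illustration only -- not morss's actual Options implementation.
options = {'json': False, 'proxy': False, 'clip': True}

desc = 'Original feed description.'
content = '<p>Full article content fetched by morss.</p>'

if options['clip'] and desc and content:
    # `clip`: stick the full article under the original feed description
    content = desc + "<br/><br/><center>* * *</center><br/><br/>" + content

print(content)
```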
@@ -367,7 +367,7 @@ class CacheHandler(BaseHandler):

         elif self.force_min is None and ('no-cache' in cc_list
                 or 'no-store' in cc_list
-                or ('private' in cc_list and not self.private)):
+                or ('private' in cc_list and not self.private_cache)):
             # kindly follow web servers indications, refresh
             return None

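Both CacheHandler hunks rename `self.private` to `self.private_cache`; the condition itself is unchanged: with no forced minimum age, a `no-cache`, `no-store`, or (for a non-private cache) `private` directive means the cached entry must not be reused. Below is a minimal restatement with plain values, assuming `private_cache` means "this cache is per-user, so `private` responses may still be served from it" (an inference from the name, not something the diff states).

```python
# Sketch of the refresh decision above, with simplified inputs.
def should_refresh(cc_list, force_min=None, private_cache=False):
    """Return True when the cached entry must be ignored and fetched again."""
    if force_min is None and ('no-cache' in cc_list
            or 'no-store' in cc_list
            or ('private' in cc_list and not private_cache)):
        # kindly follow the web server's indications, refresh
        return True

    return False

print(should_refresh(['no-cache']))                     # True
print(should_refresh(['private']))                      # True  (shared cache)
print(should_refresh(['private'], private_cache=True))  # False (per-user cache)
print(should_refresh(['no-store'], force_min=3600))     # False (forced min age wins)
```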
@@ -402,7 +402,7 @@ class CacheHandler(BaseHandler):

         cc_list = [x for x in cache_control if '=' not in x]

-        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
+        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
             # kindly follow web servers indications
             return resp

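Same rename on the response side. Here `cc_list` keeps only the value-less Cache-Control directives (flags such as `no-cache`, `no-store`, `private`) and discards pairs like `max-age=300`. The sketch below restates that check, taking a raw header string as input; the real handler presumably receives the directives already split.

```python
# Decide whether a response may be stored, mirroring the directive filtering above.
def storable(cache_control_header, private_cache=False):
    directives = [d.strip().lower() for d in cache_control_header.split(',')]
    cc_list = [x for x in directives if '=' not in x]  # keep only value-less flags

    if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not private_cache):
        # kindly follow the web server's indications: do not cache
        return False

    return True

print(storable('max-age=300, public'))          # True
print(storable('no-store'))                     # False
print(storable('private, max-age=60'))          # False for a shared cache
print(storable('private', private_cache=True))  # True  for a per-user cache
```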
@@ -15,6 +15,7 @@ import dateutil.parser
 from copy import deepcopy

 import lxml.html
+from bs4 import BeautifulSoup

 json.encoder.c_make_encoder = None

@@ -441,7 +442,7 @@ class ParserHTML(ParserXML):

     def parse(self, raw):
         parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(raw, parser)
+        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser)

     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)

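The `parse()` change routes the raw document through BeautifulSoup's lxml tree builder and re-serialises it with `prettify()` before handing it to lxml's own HTML parser, presumably so that badly broken markup gets repaired first. A standalone round-trip with a made-up snippet of messy HTML:

```python
# Illustrates the BeautifulSoup -> prettify() -> lxml round-trip added above.
from bs4 import BeautifulSoup
from lxml import etree

raw = '<html><body><p>Unclosed paragraph<div>misnested <b>tag</div></b>'

parser = etree.HTMLParser(remove_blank_text=True)  # remove_blank_text needed for pretty_print
cleaned = BeautifulSoup(raw, 'lxml').prettify()    # repaired, re-indented HTML string
root = etree.fromstring(cleaned, parser)

print(etree.tostring(root, pretty_print=True, encoding='unicode'))
```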
@@ -54,7 +54,7 @@ def filterOptions(options):

     # example of filtering code below

-    #allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
+    #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
     #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])

     #return filtered

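Only the commented-out example changes here: `'keep'` is dropped from the whitelist. Uncommented and run over a plain dict (the real function receives morss's options object, and the filtering is, as the comment says, just an example), it behaves like this:

```python
# Runnable version of the commented-out whitelist example, with 'keep' removed.
def filterOptions(options):
    allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
    filtered = dict([(key, value) for (key, value) in options.items() if key in allowed])
    return filtered

print(filterOptions({'clip': True, 'keep': True, 'debug': True}))
# {'clip': True, 'debug': True} -- 'keep' no longer passes the filter
```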
@@ -288,9 +288,6 @@ def ItemAfter(item, options):
         item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
         del item.desc

-    if not options.keep and not options.proxy:
-        del item.desc
-
     if options.nolink and item.content:
         content = lxml.html.fromstring(item.content)
         for link in content.xpath('//a'):
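This hunk removes the `keep`/`proxy` special-casing around `del item.desc` (consistent with dropping `keep` from the README and from the whitelist above) and then cuts off right at the `nolink` loop header, so the loop body is not shown. One common way to get a link-free rendering with lxml.html is to unwrap each `<a>` element while keeping its text; the snippet below is such a sketch, not necessarily what `ItemAfter` actually does.

```python
# Sketch of stripping links from item.content with lxml.html (assumed behaviour).
import lxml.html

content = lxml.html.fromstring('<p>Read <a href="http://example.com/">this article</a> now.</p>')

for link in content.xpath('//a'):
    link.drop_tag()  # remove the <a> element itself but keep its text and children

print(lxml.html.tostring(content, encoding='unicode'))
# <p>Read this article now.</p>
```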