5 changed files with 72 additions and 97 deletions
--- a/README.md
+++ b/README.md
@ -24,13 +24,15 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
 Please mind that feeds based on html files may stop working unexpectedly, due to
 html structure changes on the target website.

-Additionally morss can detect rss feeds in html pages' `<meta>`.
+Additionally morss can grab the source xml feed of an iTunes podcast, and detect
+rss feeds in html pages' `<meta>`.

 You can use this program online for free at **[morss.it](https://morss.it/)**.

 Some features of morss:
 - Read RSS/Atom feeds
 - Create RSS feeds from json/html pages
+- Convert iTunes podcast links into xml links
 - Export feeds as RSS/JSON/CSV/HTML
 - Fetch full-text content of feed items
 - Follow 301/meta redirects
@ -73,8 +75,6 @@ The arguments are:

 - Change what morss does
 	- `json`: output as JSON
-	- `html`: outpout as HTML
-	- `csv`: outpout as CSV
 	- `proxy`: doesn't fill the articles
 	- `clip`: stick the full article content under the original feed content (useful for twitter)
 	- `search=STRING`: does a basic case-sensitive search in the feed
@ -88,9 +88,11 @@ The arguments are:
 	- `mono`: disable multithreading while fetching, makes debugging easier
 	- `theforce`: force download the rss feed and ignore cached http errros
 	- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
+	- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
 - http server only
 	- `callback=NAME`: for JSONP calls
 	- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
+	- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
 	- `txt`: changes the http content-type to txt (for faster "`view-source:`")
 - Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
 	- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
--- a/morss/crawler.py
+++ b/morss/crawler.py
@ -34,25 +34,6 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


-def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)[0]
-
-
-def adv_get(url, timeout=None, *args, **kwargs):
-    if timeout is None:
-        con = custom_handler(*args, **kwargs).open(url)
-
-    else:
-        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
-
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-    encoding= detect_encoding(data, con)
-
-    return data, con, contenttype, encoding
-
-
 def custom_handler(follow=None, delay=None, encoding=None):
    handlers = []

@ -218,6 +199,7 @@ class BrowserlyHeaderHandler(BaseHandler):
    """ Add more headers to look less suspicious """

    def http_request(self, req):
+        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
        return req
--- a/morss/feeds.py
+++ b/morss/feeds.py
@ -15,7 +15,7 @@ import dateutil.parser
 from copy import deepcopy

 import lxml.html
-from .readabilite import parse as html_parse
+from bs4 import BeautifulSoup

 json.encoder.c_make_encoder = None

@ -53,7 +53,7 @@ def parse_rules(filename=None):
    return rules


-def parse(data, url=None, mimetype=None, encoding=None):
+def parse(data, url=None, mimetype=None):
    " Determine which ruleset to use "

    rulesets = parse_rules()
@ -67,14 +67,14 @@ def parse(data, url=None, mimetype=None, encoding=None):
                for path in ruleset['path']:
                    if fnmatch(url, path):
                        parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                        return parser(data, ruleset, encoding=encoding) 
+                        return parser(data, ruleset) 

    # 2) Look for a parser based on mimetype

    if mimetype is not None:
        parser_candidates = [x for x in parsers if mimetype in x.mimetype]

-    if mimetype is None or len(parser_candidates) == 0:
+    if mimetype is None or parser_candidates is None:
        parser_candidates = parsers

    # 3) Look for working ruleset for given parser
@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None, encoding=None):
            # 'path' as they should have been caught beforehands

        try:
-            feed = parser(data, encoding=encoding)
+            feed = parser(data)

        except (ValueError):
            # parsing did not work
@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None, encoding=None):


 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None, encoding=None):
+    def __init__(self, data=None, rules=None, parent=None):
        if rules is None:
            rules = parse_rules()[self.default_ruleset]

@ -122,10 +122,9 @@ class ParserBase(object):
        if data is None:
            data = rules['base']

-        self.parent = parent
-        self.encoding = encoding
-
        self.root = self.parse(data)
+        self.parent = parent
+

    def parse(self, raw):
        pass
@ -443,7 +442,8 @@ class ParserHTML(ParserXML):
    mimetype = ['text/html', 'application/xhtml+xml']

    def parse(self, raw):
-        return html_parse(raw, encoding=self.encoding)
+        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
+        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)

    def tostring(self, encoding='unicode', **k):
        return lxml.html.tostring(self.root, encoding=encoding, **k)
--- a/morss/morss.py
+++ b/morss/morss.py
@ -10,6 +10,7 @@ import re

 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup

 from . import feeds
 from . import crawler
@ -250,17 +251,19 @@ def ItemFill(item, options, feedurl='/', fast=False):
        delay = -2

    try:
-        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
+        data = con.read()

    except (IOError, HTTPException) as e:
        log('http error')
        return False # let's just delete errors stuff when in cache mode

+    contenttype = con.info().get('Content-Type', '').split(';')[0]
    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
        log('non-text page')
        return True

-    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
+    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))

    if out is not None:
        item.content = out
@ -321,14 +324,18 @@ def FeedFetch(url, options):
        delay = 0

    try:
-        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
+            .open(url, timeout=TIMEOUT * 2)
+        xml = con.read()

    except (IOError, HTTPException):
        raise MorssException('Error downloading feed')

+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+
    if options.items:
        # using custom rules
-        rss = feeds.FeedHTML(xml, encoding=encoding)
+        rss = feeds.FeedHTML(xml)

        rss.rules['title'] = options.title              if options.title        else '//head/title'
        rss.rules['desc'] = options.desc                if options.desc         else '//head/meta[@name="description"]/@content'
@ -348,7 +355,7 @@ def FeedFetch(url, options):

    else:
        try:
-            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
+            rss = feeds.parse(xml, url, contenttype)
            rss = rss.convert(feeds.FeedXML)
                # contains all fields, otherwise much-needed data can be lost

@ -462,7 +469,7 @@ def FeedFormat(rss, options, encoding='utf-8'):
    elif options.csv:
        return rss.tocsv(encoding=encoding)

-    elif options.html:
+    elif options.reader:
        if options.indent:
            return rss.tohtml(encoding=encoding, pretty_print=True)

@ -540,7 +547,7 @@ def cgi_app(environ, start_response):
    if options.cors:
        headers['access-control-allow-origin'] = '*'

-    if options.html:
+    if options.html or options.reader:
        headers['content-type'] = 'text/html'
    elif options.txt or options.silent:
        headers['content-type'] = 'text/plain'
@ -645,10 +652,13 @@ def cgi_page(environ, start_response):
    if urlparse(url).scheme not in ['http', 'https']:
        url = 'http://' + url

-    data, con, contenttype, encoding = crawler.adv_get(url=url)
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        html = readabilite.parse(data, encoding=encoding)
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(con.geturl())

        kill_tags = ['script', 'iframe', 'noscript']
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@ -6,14 +6,11 @@ import re

 def parse(data, encoding=None):
    if encoding:
-        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
-
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
    else:
-        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)

-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
-
-    return lxml.html.fromstring(data, parser=parser)
+    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)


 def count_words(string):
@ -47,12 +44,6 @@ def count_content(node):
    return count_words(node.text_content()) + len(node.findall('.//img'))


-def percentile(N, P):
-    # https://stackoverflow.com/a/7464107
-    n = max(int(round(P * len(N) + 0.5)), 2)
-    return N[n-2]
-
-
 class_bad = ['comment', 'community', 'extra', 'foot',
    'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
    'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
@ -132,42 +123,33 @@ def score_node(node):
    return score


-def score_all(node):
+def score_all(node, grades=None):
    " Fairly dumb loop to score all worthwhile nodes. Tries to be fast "

+    if grades is None:
+        grades = {}
+
    for child in node:
        score = score_node(child)
        child.attrib['seen'] = 'yes, ' + str(int(score))

-        if score > 0 or len(list(child.iterancestors())) <= 2:
-            spread_score(child, score)
-            score_all(child)
+        if score > 0 or not len(grades):
+            spread_score(child, score, grades)
+            score_all(child, grades)
+
+    return grades


-def set_score(node, value):
-    node.attrib['morss_score'] = str(float(value))
-
-
-def get_score(node):
-    return float(node.attrib.get('morss_score', 0))
-
-
-def incr_score(node, delta):
-    set_score(node, get_score(node) + delta)
-
-
-def get_all_scores(node):
-    return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
-
-
-def spread_score(node, score):
+def spread_score(node, score, grades):
    " Spread the node's score to its parents, on a linear way "

    delta = score / 2
-
    for ancestor in [node,] + list(node.iterancestors()):
        if score >= 1 or ancestor is node:
-            incr_score(ancestor, score)
+            try:
+                grades[ancestor] += score
+            except KeyError:
+                grades[ancestor] = score

            score -= delta

@ -175,24 +157,26 @@ def spread_score(node, score):
            break


-def clean_root(root, keep_threshold=None):
+def write_score_all(root, grades):
+    " Useful for debugging "
+
+    for node in root.iter():
+        node.attrib['score'] = str(int(grades.get(node, 0)))
+
+
+def clean_root(root):
    for node in list(root):
-        # bottom-up approach, i.e. starting with children before cleaning current node
-        clean_root(node, keep_threshold)
-        clean_node(node, keep_threshold)
+        clean_root(node)
+        clean_node(node)


-def clean_node(node, keep_threshold=None):
+def clean_node(node):
    parent = node.getparent()

    if parent is None:
        # this is <html/> (or a removed element waiting for GC)
        return

-    if keep_threshold is not None and get_score(node) >= keep_threshold:
-        # high score, so keep
-        return
-
    gdparent = parent.getparent()

    # remove shitty tags
@ -291,18 +275,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
    return nodeA # should always find one tho, at least <html/>, but needed for max_depth


-def rank_grades(grades):
-    # largest score to smallest
+def rank_nodes(grades):
    return sorted(grades.items(), key=lambda x: x[1], reverse=True)


-def get_best_node(ranked_grades):
+def get_best_node(grades):
    " To pick the best (raw) node. Another function will clean it "

-    if len(ranked_grades) == 1:
-        return ranked_grades[0]
+    if len(grades) == 1:
+        return grades[0]

-    lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
+    top = rank_nodes(grades)
+    lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)

    return lowest

@ -311,17 +295,12 @@ def get_article(data, url=None, encoding=None):
    " Input a raw html string, returns a raw html string of the article "

    html = parse(data, encoding)
-    score_all(html)
-    scores = rank_grades(get_all_scores(html))
+    scores = score_all(html)

    if not len(scores):
        return None

    best = get_best_node(scores)
-
-    keep_threshold = percentile([x[1] for x in scores], 0.1)
-    clean_root(best, keep_threshold)
-
    wc = count_words(best.text_content())
    wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))

@ -331,4 +310,6 @@ def get_article(data, url=None, encoding=None):
    if url:
        best.make_links_absolute(url)

+    clean_root(best)
+
    return lxml.etree.tostring(best, pretty_print=True)