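# Article extraction ("readability"-style) helpers: parse raw HTML with lxml,
# score nodes using tag and class/id heuristics, pick the best-scoring
# subtree, then clean it up for output.
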
import lxml.etree
import lxml.html
import re


def parse(data, encoding=None):
    if encoding:
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)

    else:
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)

    return lxml.html.fromstring(data, parser=parser)


def count_words(string):
    """ Quick, rough word count.

    Simply assumes that all words are 5 letters long (plus one
    separator), which roughly holds in most languages (sorry, Chinese).
    Basically skips spaces in the count. """

    if string is None:
        return 0

    i = 0
    count = 0

    try:
        while True:
            if string[i] not in "\r\n\t ":
                # hit a word: count it, then jump one "average" word ahead
                count += 1
                i += 6
            else:
                i += 1
    except IndexError:
        # reached the end of the string
        pass

    return count
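
# For instance, count_words('hello world') hits 'h' (count 1, jump to index
# 6), then 'w' (count 2, jump to index 12) and falls off the string: 2 words.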


def count_content(node):
    # count words and imgs
    return count_words(node.text_content()) + len(node.findall('.//img'))


class_bad = ['comment', 'community', 'extra', 'foot',
    'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
    'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
    'head', 'robots-nocontent', 'combx', 'disqus', 'menu', 'remark', 'rss',
    'shoutbox', 'sidebar', 'ad-', 'agegate', 'popup', 'sharing', 'share',
    'social', 'contact', 'footnote', 'outbrain', 'promo', 'scroll', 'hidden',
    'widget', 'hide']

regex_bad = re.compile('|'.join(class_bad), re.I)

class_good = ['and', 'article', 'body', 'column', 'main',
    'shadow', 'content', 'entry', 'hentry', 'page', 'pagination',
    'post', 'text', 'blog', 'story', 'par', 'editorial']

regex_good = re.compile('|'.join(class_good), re.I)


tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
    'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
    'button', 'footer']

tags_bad = tags_junk + ['a', 'aside']

tags_good = ['h1', 'h2', 'h3', 'article', 'p', 'cite', 'section', 'figcaption',
    'figure', 'em', 'strong', 'pre', 'br', 'hr', 'headline']

tags_meaning = ['a', 'abbr', 'address', 'acronym', 'audio', 'article', 'aside',
    'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'center',
    'code', 'col', 'colgroup', 'data', 'dd', 'del', 'details', 'description',
    'dfn', 'dl', 'font', 'dt', 'em', 'figure', 'figcaption', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'main', 'mark',
    'nav', 'ol', 'p', 'pre', 'q', 'ruby', 'rp', 'rt', 's', 'samp', 'small',
    'source', 'strike', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody',
    'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'track', 'tt', 'u', 'ul', 'var',
    'wbr', 'video']
# adapted from tt-rss source code, to keep even as shells

tags_void = ['img', 'hr', 'br'] # to keep even if empty


attributes_fine = ['title', 'src', 'href', 'type', 'value']
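
# note: a class like "article-footer" matches both regex_good ('article')
# and regex_bad ('foot'); score_node() tests regex_bad first, so such a
# node scores 0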


def score_node(node):
    " Score individual node "

    score = 0
    class_id = node.get('class', '') + node.get('id', '')

    if (isinstance(node, lxml.html.HtmlComment)
            or isinstance(node, lxml.html.HtmlProcessingInstruction)
            or node.tag in tags_bad
            or regex_bad.search(class_id)):
        return 0

    if node.tag in tags_good:
        score += 4

    if regex_good.search(class_id):
        score += 3

    wc = count_words(node.text_content())

    score += min(int(wc/10), 3) # give 1pt bonus for every 10 words, max of 3

    if wc != 0:
        wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
        score = score * (1 - float(wca)/wc) # scale the score down by link density

    return score
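
# e.g. a <p class="entry-content"> holding 25 words and no links scores
# 4 (good tag) + 3 (good class/id) + 2 (25 words // 10) = 9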


def score_all(node, grades=None):
    " Fairly dumb loop to score all worthwhile nodes. Tries to be fast "

    if grades is None:
        grades = {}

    for child in node:
        score = score_node(child)
        child.attrib['seen'] = 'yes, ' + str(int(score)) # mark visited nodes, with their rounded score

        if score > 0:
            spread_score(child, score, grades)
            score_all(child, grades)

    return grades


def spread_score(node, score, grades):
    " Spread the node's score to its parents, decreasing it linearly "

    delta = score / 2

    for ancestor in [node,] + list(node.iterancestors()):
        if score >= 1 or ancestor is node:
            try:
                grades[ancestor] += score
            except KeyError:
                grades[ancestor] = score

            score -= delta

        else:
            break
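
# e.g. a node scoring 8 adds 8 to itself and 4 (8 - delta) to its parent;
# the remaining score then drops to 0, so the walk up the tree stops there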


def write_score_all(root, grades):
    " Useful for debugging "

    for node in root.iter():
        node.attrib['score'] = str(int(grades.get(node, 0)))


def clean_root(root):
    for node in list(root):
        # bottom-up: clean the children before their parent
        clean_root(node)
        clean_node(node)


def clean_node(node):
    parent = node.getparent()

    if parent is None:
        # this is <html/> (or a removed element waiting for GC)
        return

    gdparent = parent.getparent()

    # remove shitty tags
    if node.tag in tags_junk:
        parent.remove(node)
        return

    # remove shitty class/id FIXME TODO too aggressive, might want to add a toggle
    class_id = node.get('class', '') + node.get('id', '')
    if len(regex_bad.findall(class_id)) >= 2:
        parent.remove(node)
        return

    # remove shitty links (i.e. those wrapping too much markup)
    if node.tag == 'a' and len(list(node.iter())) > 3:
        parent.remove(node)
        return

    # remove comments
    if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
        parent.remove(node)
        return

    # remove if too many kids & too high link density
    wc = count_words(node.text_content())
    if wc != 0 and len(list(node.iter())) > 3:
        wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
        if float(wca)/wc > 0.8:
            parent.remove(node)
            return

    # squash text-less element shells
    if node.tag in tags_void:
        # keep 'em
        pass
    elif node.tag in tags_meaning:
        # remove if content-less
        if not count_content(node):
            parent.remove(node)
            return
    else:
        # squash non-meaningful tags that hold no direct text
        content = (node.text or '') + ' '.join([child.tail or '' for child in node])
        if not count_words(content):
            node.drop_tag()
            return

    # for http://vice.com/fr/
    if node.tag == 'img' and 'data-src' in node.attrib:
        node.attrib['src'] = node.attrib['data-src']

    # clean the node's attributes (list() snapshots the keys first)
    for attrib in list(node.attrib):
        if attrib not in attributes_fine:
            del node.attrib[attrib]

    # br2p: turn a mid-text <br/> into a proper paragraph break
    if node.tag == 'br':
        if gdparent is None:
            return

        if not count_words(node.tail):
            # if <br/> is at the end of a div (to avoid having <p/>)
            return

        else:
            # set up new node, carrying the text that followed the <br/>
            new_node = lxml.html.Element(parent.tag)
            new_node.text = node.tail

            # move everything that followed the <br/> into the new node
            for child in node.itersiblings():
                new_node.append(child)

            # delete br
            node.tail = None
            parent.remove(node)

            gdparent.insert(gdparent.index(parent)+1, new_node)
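
# e.g. <div>foo<br/>bar <b>baz</b></div> becomes
# <div>foo</div><div>bar <b>baz</b></div>, the new <div> being inserted
# right after the original one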


def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
    ancestorsA = list(nodeA.iterancestors())
    ancestorsB = list(nodeB.iterancestors())

    if max_depth is not None:
        ancestorsA = ancestorsA[:max_depth]
        ancestorsB = ancestorsB[:max_depth]

    ancestorsA.insert(0, nodeA)
    ancestorsB.insert(0, nodeB)

    for ancestorA in ancestorsA:
        if ancestorA in ancestorsB:
            return ancestorA

    return nodeA # should always find one, at least <html/>; the fallback matters when max_depth cuts the lists short


def rank_nodes(grades):
    return sorted(grades.items(), key=lambda x: x[1], reverse=True)


def get_best_node(grades):
    " To pick the best (raw) node. Another function will clean it "

    if len(grades) == 1:
        # grades is a dict keyed by nodes: return its only key
        return list(grades)[0]

    top = rank_nodes(grades)
    lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)

    return lowest


def get_article(data, url=None, encoding=None):
    " Input a raw html string, returns a raw html string of the article "

    html = parse(data, encoding)
    scores = score_all(html)

    if not scores:
        return None

    best = get_best_node(scores)
    wc = count_words(best.text_content())
    wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))

    # give up if the best candidate is too short or too link-heavy
    if wc - wca < 50 or float(wca) / wc > 0.3:
        return None

    if url:
        best.make_links_absolute(url)

    clean_root(best)

    return lxml.etree.tostring(best, pretty_print=True)
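

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: read the HTML
    # file given on the command line and print the extracted article (or a
    # notice when extraction fails).
    import sys

    with open(sys.argv[1], 'rb') as fp:
        article = get_article(fp.read())

    if article is None:
        print('no article found')

    else:
        sys.stdout.write(article.decode('utf-8'))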