From 3bfad54add267fb681d6daafc74f7bb40c3afde4 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Mon, 17 Jul 2017 00:27:41 +0200
Subject: [PATCH] readabilite: change cleaning & code structure
Kinda struggled to make some "nice" code
---
morss/readabilite.py | 110 ++++++++++++++++++++++++++++---------------
1 file changed, 71 insertions(+), 39 deletions(-)
diff --git a/morss/readabilite.py b/morss/readabilite.py
index 36c9f5f..a451a3d 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -98,60 +98,95 @@ def score_node(node):
def score_all(root):
grades = {}
- for item in root.iter():
- score = score_node(item)
+ for node in list(root.iter()):  # list(): snapshot the traversal, since clean_node() below may detach nodes while we loop
+ score = score_node(node)
- grades[item] = score
+ parent = node.getparent()  # captured before cleaning, so we can tell afterwards whether the node was detached
+ clean_node(node)
+
+ if parent is not None and node.getparent() is None:
+ # the node had a parent before cleaning and has none now: it got deleted/dropped
+ # clean the parent again: with one child less it may itself qualify for removal or flattening
+
+ gdparent = parent.getparent()
+ clean_node(parent)
+
+ if gdparent is not None and parent.getparent() is None:
+ # the parent got deleted/dropped too: credit the grandparent with this score plus what the parent had accumulated
+ spread_score(gdparent, score + grades[parent], grades)
- factor = 2
- for ancestor in item.iterancestors():
- if score / factor > 1:
- grades[ancestor] += score / factor
- factor *= 2
else:
- break
+ # the parent was kept: the detached node's score is credited to the parent instead
+ spread_score(parent, score, grades)
+
+ else:
+ # the node is still attached (or never had a parent): credit the node itself
+ spread_score(node, score, grades)
return grades
+def spread_score(node, score, grades):  # add `score` to grades[node], then pass halved shares up to its ancestors
+ for ancestor in [node,] + list(node.iterancestors()):  # the node itself first, then its ancestors
+ if score >= 1 or ancestor is node:  # the node always gets its score; ancestors only while the share is still >= 1
+ try:
+ grades[ancestor] += score
+ except KeyError:
+ grades[ancestor] = score  # first contribution recorded for this element
+
+ score /= 2  # each further step receives half of the previous share
+
+ else:
+ break  # share dropped below 1: stop propagating
+
+
def write_score_all(root, grades):
for node in root.iter():
node.attrib['score'] = str(int(grades[node]))
-def clean_html(root):
- for item in list(root.iter()): # list() needed to be able to remove elements while iterating
- # Step 1. Do we keep the node?
+def clean_node(node):  # clean one element in place: remove it, flatten it, or strip its attributes
+ # Step 1. Do we keep the node? (every branch below that removes/flattens the node returns early)
- if item.tag in tags_junk:
- # remove shitty tags
- item.getparent().remove(item)
- continue
+ if node.getparent() is None:
+ # this node has no parent (the root, or an element that was already detached): nothing to do
+ return
- if item.tag in ['div'] \
- and len(list(item.iterchildren())) <= 1 \
- and not (item.text or '').strip() \
- and not (item.tail or '').strip():
- # remove div with only one item inside
- item.drop_tag()
- continue
+ if node.tag in tags_junk:
+ # tag listed in tags_junk: remove the element from its parent
+ node.getparent().remove(node)
+ return
- class_id = item.get('class', '') + item.get('id', '')
- if regex_bad.match(class_id) is not None:
- # remove shitty class/id
- item.getparent().remove(item)
- continue
+ # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
- if isinstance(item, lxml.html.HtmlComment):
- # remove comments
- item.getparent().remove(item)
- continue
+ if node.tag in ['div'] \
+ and len(list(node.iterchildren())) <= 1 \
+ and not (node.text or '').strip() \
+ and not (node.tail or '').strip():
+ node.drop_tag()  # div with at most one child and no text/tail of its own: drop the tag, keep the content
+ return
- # Step 2. Clean the node's attributes
+ class_id = node.get('class', '') + node.get('id', '')  # NOTE(review): lxml comment nodes appear to return None from .get() whatever the default, which would raise TypeError here before the HtmlComment check below is reached - confirm
+ if len(regex_junk.findall(class_id)) >= 2:
+ # class/id matches regex_junk at least twice: remove the element
+ node.getparent().remove(node)
+ return
- for attrib in item.attrib:
- if attrib not in attributes_fine:
- del item.attrib[attrib]
+ if node.tag == 'a' and len(list(node.iter())) > 3:
+ # link whose subtree holds more than 3 nodes (iter() counts the link itself): remove it
+ node.getparent().remove(node)
+ return
+
+ if isinstance(node, lxml.html.HtmlComment):
+ # remove comments
+ node.getparent().remove(node)
+ return
+
+ # Step 2. Clean the node's attributes (only reached when the node was kept)
+
+ for attrib in node.attrib:
+ if attrib not in attributes_fine:
+ del node.attrib[attrib]
def br2p(root):
@@ -219,10 +254,7 @@ def get_best_node(grades, highlight=False):
def get_article(data, url=None, encoding=None):
html = parse(data, encoding)
-
- clean_html(html)
br2p(html)
-
scores = score_all(html)
best = get_best_node(scores)