From f6bc23927fbf6f9b5e39ca78403ce50af87cc7d9 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 25 Apr 2020 12:25:02 +0200 Subject: [PATCH] readabilite: drop dangerous tags (script, style) --- morss/readabilite.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index 0f39144..1cef84f 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -70,9 +70,10 @@ class_good = ['and', 'article', 'body', 'column', 'main', regex_good = re.compile('|'.join(class_good), re.I) -tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', - 'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea', - 'button', 'footer', 'link', 'meta'] +tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta'] + +tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet', + 'form', 'input', 'textarea', 'button', 'footer'] tags_bad = tags_junk + ['a', 'aside'] @@ -106,6 +107,9 @@ def score_node(node): or isinstance(node, lxml.html.HtmlProcessingInstruction)): return 0 + if node.tag in tags_dangerous: + return 0 + if node.tag in tags_junk: score += -1 # actuall -2 as tags_junk is included tags_bad @@ -189,6 +193,11 @@ def clean_node(node, keep_threshold=None): # this is (or a removed element waiting for GC) return + # remove dangerous tags, no matter what + if node.tag in tags_dangerous: + parent.remove(node) + return + if keep_threshold is not None and get_score(node) >= keep_threshold: # high score, so keep return