readabilite: remove duplicated code
parent
fe5dbf1ce0
commit
e81f6b173f
|
@ -207,8 +207,10 @@ def clean_root(root, keep_threshold=None):
|
||||||
def clean_node(node, keep_threshold=None):
|
def clean_node(node, keep_threshold=None):
|
||||||
parent = node.getparent()
|
parent = node.getparent()
|
||||||
|
|
||||||
|
# remove comments
|
||||||
if (isinstance(node, lxml.html.HtmlComment)
|
if (isinstance(node, lxml.html.HtmlComment)
|
||||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||||
|
parent.remove(node)
|
||||||
return
|
return
|
||||||
|
|
||||||
if parent is None:
|
if parent is None:
|
||||||
|
@ -242,11 +244,6 @@ def clean_node(node, keep_threshold=None):
|
||||||
parent.remove(node)
|
parent.remove(node)
|
||||||
return
|
return
|
||||||
|
|
||||||
# remove comments
|
|
||||||
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
|
||||||
parent.remove(node)
|
|
||||||
return
|
|
||||||
|
|
||||||
# remove if too many kids & too high link density
|
# remove if too many kids & too high link density
|
||||||
wc = count_words(node.text_content())
|
wc = count_words(node.text_content())
|
||||||
if wc != 0 and len(list(node.iter())) > 3:
|
if wc != 0 and len(list(node.iter())) > 3:
|
||||||
|
|
Loading…
Reference in New Issue