| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -70,9 +70,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				regex_good = re.compile('|'.join(class_good), re.I)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    'button', 'footer', 'link', 'meta']
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    'form', 'input', 'textarea', 'button', 'footer']
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				tags_bad = tags_junk + ['a', 'aside']
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -106,6 +107,9 @@ def score_node(node):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return 0
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if node.tag in tags_dangerous:
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return 0
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if node.tag in tags_junk:
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        score += -1 # actually -2, as tags_junk is included in tags_bad
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -189,6 +193,11 @@ def clean_node(node, keep_threshold=None):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        # this is <html/> (or a removed element waiting for GC)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    # remove dangerous tags, no matter what
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if node.tag in tags_dangerous:
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        parent.remove(node)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if keep_threshold is not None and get_score(node) >= keep_threshold:
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        # high score, so keep
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				@@ -307,14 +316,14 @@ def get_best_node(ranked_grades):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    return lowest
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				def get_article(data, url=None, encoding=None, debug=False):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				def get_article(data, url=None, encoding=None, debug=False, threshold=5):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    " Input a raw html string, returns a raw html string of the article "
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    html = parse(data, encoding)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    score_all(html)
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    scores = rank_grades(get_all_scores(html))
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if not len(scores):
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    if not len(scores) or scores[0][1] < threshold:
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				        return None
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				
 | 
			
		
		
	
		
			
				 | 
				 | 
			
			 | 
			 | 
			
				    best = get_best_node(scores)
 | 
			
		
		
	
	
		
			
				
					
					| 
						
					 | 
				
			
			 | 
			 | 
			
				 
 |