readabilite: custom xpath for article detection

 README.md | 12

@@ -262,11 +262,12 @@ arguments to morss is explained in Run above.
 The list of arguments can be obtained by running `morss --help`
 
 ```
-usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
-             [--search STRING] [--clip] [--indent] [--cache] [--force]
-             [--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
-             [--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
-             [--item_time XPATH] [--nolink] [--noref] [--silent]
+usage: morss [-h] [--post STRING] [--xpath XPATH]
+             [--format {rss,json,html,csv}] [--search STRING] [--clip]
+             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
+             [--resolve] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
+             [--nolink] [--noref] [--silent]
              url
 
 Get full-text RSS feeds

@@ -277,6 +278,7 @@ positional arguments:
 optional arguments:
   -h, --help            show this help message and exit
   --post STRING         POST request
+  --xpath XPATH         xpath rule to manually detect the article
 
 output:
   --format {rss,json,html,csv}

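With the new flag the article container can be chosen by hand whenever the scoring heuristic picks the wrong node. For example (the feed URL and selector here are made up), `morss --format html --xpath '//div[@class="post-body"]' http://example.com/feed.xml` fills each item from the first node matching the XPath expression, and quietly falls back to the heuristic when the expression matches nothing.
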
@@ -32,6 +32,7 @@ def cli_app():
     parser.add_argument('url', help='feed url')
 
     parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
+    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
 
     group = parser.add_argument_group('output')
     group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')

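A standalone sketch of the argparse behaviour the new option relies on (this is not morss's actual parser, just the same add_argument call in isolation): an optional --xpath defined this way defaults to None when the flag is absent, so the value can be handed straight through to readabilite.get_article() in the next hunk, with None meaning "no manual rule".

import argparse

# standalone illustration; mirrors the option added above, not morss's own parser object
parser = argparse.ArgumentParser()
parser.add_argument('url', help='feed url')
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH',
                    help='xpath rule to manually detect the article')

options = parser.parse_args(['http://example.com/feed.xml'])  # hypothetical url, flag omitted
print(options.xpath)  # None -> the scoring heuristic stays in charge

options = parser.parse_args(['http://example.com/feed.xml', '--xpath', '//article'])
print(options.xpath)  # '//article' -> forwarded as-is via xpath=options.xpath
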
@@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('empty page')
         return True
 
-    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
 
     if out is not None:
         item.content = out

@@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None):
         return
 
     # high score, so keep
-    if keep_threshold is not None and get_score(node) >= keep_threshold:
+    if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
         return
 
     gdparent = parent.getparent()

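The widened guard looks tied to the manual path: when the article node comes from an XPath match, score_all() never runs, so get_score(best) presumably comes back as 0 and the derived keep_threshold (get_score(best) * 3/4, see the last hunk) is 0 as well; under the old test every node would then satisfy get_score(node) >= 0 and nothing would be cleaned. A standalone illustration of the two predicates (not morss code):

def kept_old(score, keep_threshold):
    # old test: a zero threshold keeps every node
    return keep_threshold is not None and score >= keep_threshold

def kept_new(score, keep_threshold):
    # new test: a zero threshold no longer short-circuits the clean-up
    return keep_threshold is not None and keep_threshold > 0 and score >= keep_threshold

print(kept_old(0, 0))  # True  -> clean-up effectively disabled
print(kept_new(0, 0))  # False -> low-score nodes are cleaned as usual
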
@@ -312,10 +312,8 @@ def lowest_common_ancestor(node_a, node_b, max_depth=None):
     return node_a # should always find one tho, at least <html/>, but needed for max_depth
 
 
-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
-    " Input a raw html string, returns a raw html string of the article "
-
-    html = parse(data, encoding_in)
+def get_best_node(html, threshold=5):
+    # score all nodes
     score_all(html)
 
     # rank all nodes (largest to smallest)

@@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     else:
         best = ranked_nodes[0]
 
+    return best
+
+
+def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
+    " Input a raw html string, returns a raw html string of the article "
+
+    html = parse(data, encoding_in)
+
+    if xpath is not None:
+        xpath_match = html.xpath(xpath)
+
+        if len(xpath_match):
+            best = xpath_match[0]
+
+        else:
+            best = get_best_node(html, threshold)
+
+    else:
+        best = get_best_node(html, threshold)
+
     # clean up
     if not debug:
-        keep_threshold = get_score(ranked_nodes[0]) * 3/4
+        keep_threshold = get_score(best) * 3/4
         clean_root(best, keep_threshold)
 
     # check for spammy content (links only)

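Taken together, the old scoring body now lives in get_best_node() and get_article() gains an optional xpath escape hatch that falls back to the heuristic when the expression matches nothing. A minimal sketch of calling it as a library (the import path, file name, source URL and selector are assumptions for illustration, not taken from this commit):

# minimal sketch, assuming the get_article() signature from this diff
from morss import readabilite

with open('page.html', 'rb') as f:   # hypothetical local copy of an article page
    raw = f.read()

article = readabilite.get_article(
    raw,
    url='http://example.com/article',         # hypothetical source url
    xpath='//div[@class="post-content"]',     # manual rule; the heuristic is used if it matches nothing
)

if article is not None:
    print(article)  # raw html string of the detected article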