From ef6efd981c3232894c3b53c2ac73eac2ff95c9b0 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 18 Sep 2021 16:16:34 +0200 Subject: [PATCH] readabilite: custom xpath for article detection --- README.md | 12 +++++++----- morss/cli.py | 1 + morss/morss.py | 2 +- morss/readabilite.py | 30 ++++++++++++++++++++++++------ 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4955eb5..6990f57 100644 --- a/README.md +++ b/README.md @@ -262,11 +262,12 @@ arguments to morss is explained in Run above. The list of arguments can be obtained by running `morss --help` ``` -usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}] - [--search STRING] [--clip] [--indent] [--cache] [--force] - [--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH] - [--item_link XPATH] [--item_title XPATH] [--item_content XPATH] - [--item_time XPATH] [--nolink] [--noref] [--silent] +usage: morss [-h] [--post STRING] [--xpath XPATH] + [--format {rss,json,html,csv}] [--search STRING] [--clip] + [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink] + [--resolve] [--items XPATH] [--item_link XPATH] + [--item_title XPATH] [--item_content XPATH] [--item_time XPATH] + [--nolink] [--noref] [--silent] url Get full-text RSS feeds @@ -277,6 +278,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit --post STRING POST request + --xpath XPATH xpath rule to manually detect the article output: --format {rss,json,html,csv} diff --git a/morss/cli.py b/morss/cli.py index 94c22fd..f43c5dd 100644 --- a/morss/cli.py +++ b/morss/cli.py @@ -32,6 +32,7 @@ def cli_app(): parser.add_argument('url', help='feed url') parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request') + parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article') group = parser.add_argument_group('output') group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format') diff --git a/morss/morss.py b/morss/morss.py index b24b600..b34dabf 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -222,7 +222,7 @@ def ItemFill(item, options, feedurl='/', fast=False): log('empty page') return True - out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode') + out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath) if out is not None: item.content = out diff --git a/morss/readabilite.py b/morss/readabilite.py index 709a033..4ea17cb 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -211,7 +211,7 @@ def clean_node(node, keep_threshold=None): return # high score, so keep - if keep_threshold is not None and get_score(node) >= keep_threshold: + if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold: return gdparent = parent.getparent() @@ -312,10 +312,8 @@ def lowest_common_ancestor(node_a, node_b, max_depth=None): return node_a # should always find one tho, at least , but needed for max_depth -def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5): - " Input a raw html string, returns a raw html string of the article " - - html = parse(data, encoding_in) +def get_best_node(html, threshold=5): + # score all nodes score_all(html) # rank all nodes (largest to smallest) @@ -332,9 +330,29 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug= else: best = ranked_nodes[0] + return best + + +def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None): + " Input a raw html string, returns a raw html string of the article " + + html = parse(data, encoding_in) + + if xpath is not None: + xpath_match = html.xpath(xpath) + + if len(xpath_match): + best = xpath_match[0] + + else: + best = get_best_node(html, threshold) + + else: + best = get_best_node(html, threshold) + # clean up if not debug: - keep_threshold = get_score(ranked_nodes[0]) * 3/4 + keep_threshold = get_score(best) * 3/4 clean_root(best, keep_threshold) # check for spammy content (links only)