From 4a70aa9dfaf689adc40a863c9ccabb12fe7ef407 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 18 Mar 2020 16:34:40 +0100 Subject: [PATCH] feeds: auto-parse() --- morss/feeds.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/morss/feeds.py b/morss/feeds.py index 4ce0929..0144a86 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -9,6 +9,8 @@ import re import json import csv +from fnmatch import fnmatch + from lxml import etree from dateutil import tz import dateutil.parser @@ -51,6 +53,65 @@ def parse_rules(filename=None): return rules +def parse(data, url=None, mimetype=None): + " Determine which ruleset to use " + + rulesets = parse_rules() + parsers = [FeedXML, FeedHTML, FeedJSON] + + # 1) Look for a ruleset based on path + + if url is not None: + for ruleset in rulesets.values(): + if 'path' in ruleset: + for path in ruleset['path']: + if fnmatch(url, path): + parser = [x for x in parsers if x.mode == ruleset['mode']][0] + return parser(data, ruleset) + + # 2) Look for a parser based on mimetype + + if mimetype is not None: + parser_candidates = [x for x in parsers if mimetype in x.mimetype] + + if mimetype is None or parser_candidates is None: + parser_candidates = parsers + + # 3) Look for working ruleset for given parser + # 3a) See if parsing works + # 3b) See if .items matches anything + + for parser in parser_candidates: + ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x] + # 'path' as they should have been caught beforehands + + try: + feed = parser(data) + + except (ValueError): + # parsing did not work + pass + + else: + # parsing worked, now we try the rulesets + + for ruleset in ruleset_candidates: + feed.rules = ruleset + + try: + feed.items[0] + + except (AttributeError, IndexError): + # parsing and or item picking did not work out + pass + + else: + # it worked! + return feed + + raise Exception('no way to handle this feed') + + class ParserBase(object): def __init__(self, data=None, rules=None, parent=None): if rules is None: