From da0a8feaddd9ad1a071cba513c6fbf11d881d774 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 21 Jun 2014 18:35:59 +0200 Subject: [PATCH] Replace TABS with FOUR SPACES in .py (you might want to use: git diff -w) --- morss/feedify.py | 274 +++++------ morss/feeds.py | 1074 ++++++++++++++++++++-------------------- morss/morss.py | 1220 +++++++++++++++++++++++----------------------- setup.py | 20 +- 4 files changed, 1294 insertions(+), 1294 deletions(-) diff --git a/morss/feedify.py b/morss/feedify.py index 804fcd7..6271817 100644 --- a/morss/feedify.py +++ b/morss/feedify.py @@ -13,170 +13,170 @@ import urlparse def toclass(query): - pattern = r'\[class=([^\]]+)\]' - repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]' - return re.sub(pattern, repl, query) + pattern = r'\[class=([^\]]+)\]' + repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]' + return re.sub(pattern, repl, query) def getRule(link): - config = ConfigParser() - config.read('feedify.ini') + config = ConfigParser() + config.read('feedify.ini') - for section in config.sections(): - values = dict(config.items(section)) - values['path'] = values['path'].split('\n')[1:] - for path in values['path']: - if fnmatch(link, path): - return values - return False + for section in config.sections(): + values = dict(config.items(section)) + values['path'] = values['path'].split('\n')[1:] + for path in values['path']: + if fnmatch(link, path): + return values + return False def supported(link): - return getRule(link) is not False + return getRule(link) is not False def formatString(string, getter, error=False): - out = "" - char = string[0] + out = "" + char = string[0] - follow = string[1:] + follow = string[1:] - if char == '"': - match = follow.partition('"') - out = match[0] - if len(match) >= 2: - next = match[2] - else: - next = None - elif char == '{': - match = follow.partition('}') - try: - test = formatString(match[0], getter, True) - except ValueError, KeyError: - pass - else: - out = test + if char == '"': + match = follow.partition('"') + out = match[0] + if len(match) >= 2: + next = match[2] + else: + next = None + elif char == '{': + match = follow.partition('}') + try: + test = formatString(match[0], getter, True) + except ValueError, KeyError: + pass + else: + out = test - next = match[2] - elif char == ' ': - next = follow - elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string): - match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups() - rawValue = getter(match[0]) - if not isinstance(rawValue, basestring): - if match[1] is not None: - out = match[1].join(rawValue) - else: - out = ''.join(rawValue) - if not out and error: - raise ValueError - next = match[2] - else: - raise ValueError('bogus string') + next = match[2] + elif char == ' ': + next = follow + elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string): + match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups() + rawValue = getter(match[0]) + if not isinstance(rawValue, basestring): + if match[1] is not None: + out = match[1].join(rawValue) + else: + out = ''.join(rawValue) + if not out and error: + raise ValueError + next = match[2] + else: + raise ValueError('bogus string') - if next is not None and len(next): - return out + formatString(next, getter, error) - else: - return out + if next is not None and len(next): + return out + formatString(next, getter, error) + else: + return out def PreWorker(url, cache): - if urlparse.urlparse(url).netloc == 
'itunes.apple.com': - match = re.search('/id([0-9]+)(\?.*)?$', url) - if match: - iid = match.groups()[0] - redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid) - cache.set('redirect', redirect) + if urlparse.urlparse(url).netloc == 'itunes.apple.com': + match = re.search('/id([0-9]+)(\?.*)?$', url) + if match: + iid = match.groups()[0] + redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid) + cache.set('redirect', redirect) class Builder(object): - def __init__(self, link, data=None, cache=False): - self.link = link - self.cache = cache + def __init__(self, link, data=None, cache=False): + self.link = link + self.cache = cache - if data is None: - data = urllib2.urlopen(link).read() - self.data = data + if data is None: + data = urllib2.urlopen(link).read() + self.data = data - self.rule = getRule(link) + self.rule = getRule(link) - if self.rule['mode'] == 'xpath': - if not isinstance(self.data, unicode): - self.data = self.data.decode(morss.detEncoding(self.data), 'replace') - self.doc = lxml.html.fromstring(self.data) - elif self.rule['mode'] == 'json': - self.doc = json.loads(data) + if self.rule['mode'] == 'xpath': + if not isinstance(self.data, unicode): + self.data = self.data.decode(morss.detEncoding(self.data), 'replace') + self.doc = lxml.html.fromstring(self.data) + elif self.rule['mode'] == 'json': + self.doc = json.loads(data) - self.feed = feeds.FeedParserAtom() + self.feed = feeds.FeedParserAtom() - def raw(self, html, expr): - if self.rule['mode'] == 'xpath': - return html.xpath(toclass(expr)) + def raw(self, html, expr): + if self.rule['mode'] == 'xpath': + return html.xpath(toclass(expr)) - elif self.rule['mode'] == 'json': - a = [html] - b = [] - for x in expr.strip(".").split("."): - match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups() - for elem in a: - if isinstance(elem, dict): - kids = elem.get(match[0]) - if kids is None: - pass - elif isinstance(kids, list): - [b.append(i) for i in kids] - elif isinstance(kids, basestring): - b.append(kids.replace('\n', '
')) - else: - b.append(kids) + elif self.rule['mode'] == 'json': + a = [html] + b = [] + for x in expr.strip(".").split("."): + match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups() + for elem in a: + if isinstance(elem, dict): + kids = elem.get(match[0]) + if kids is None: + pass + elif isinstance(kids, list): + [b.append(i) for i in kids] + elif isinstance(kids, basestring): + b.append(kids.replace('\n', '
')) + else: + b.append(kids) - if match[1] is None: - a = b - else: - if len(b)-1 >= int(match[1]): - a = [b[int(match[1])]] - else: - a = [] - b = [] - return a + if match[1] is None: + a = b + else: + if len(b)-1 >= int(match[1]): + a = [b[int(match[1])]] + else: + a = [] + b = [] + return a - def strings(self, html, expr): - if self.rule['mode'] == 'xpath': - out = [] - for match in self.raw(html, expr): - if isinstance(match, basestring): - out.append(match) - elif isinstance(match, lxml.html.HtmlElement): - out.append(lxml.html.tostring(match)) - return out + def strings(self, html, expr): + if self.rule['mode'] == 'xpath': + out = [] + for match in self.raw(html, expr): + if isinstance(match, basestring): + out.append(match) + elif isinstance(match, lxml.html.HtmlElement): + out.append(lxml.html.tostring(match)) + return out - elif self.rule['mode'] == 'json': - return self.raw(html, expr) + elif self.rule['mode'] == 'json': + return self.raw(html, expr) - def string(self, html, expr): - getter = lambda x: self.strings(html, x) - return formatString(self.rule[expr], getter) + def string(self, html, expr): + getter = lambda x: self.strings(html, x) + return formatString(self.rule[expr], getter) - def build(self): - if 'title' in self.rule: - self.feed.title = self.string(self.doc, 'title') + def build(self): + if 'title' in self.rule: + self.feed.title = self.string(self.doc, 'title') - if 'items' in self.rule: - matches = self.raw(self.doc, self.rule['items']) - if matches and len(matches): - for item in matches: - feedItem = {} + if 'items' in self.rule: + matches = self.raw(self.doc, self.rule['items']) + if matches and len(matches): + for item in matches: + feedItem = {} - if 'item_title' in self.rule: - feedItem['title'] = self.string(item, 'item_title') - if 'item_link' in self.rule: - url = self.string(item, 'item_link') - url = urlparse.urljoin(self.link, url) - feedItem['link'] = url - if 'item_desc' in self.rule: - feedItem['desc'] = self.string(item, 'item_desc') - if 'item_content' in self.rule: - feedItem['content'] = self.string(item, 'item_content') - if 'item_time' in self.rule: - feedItem['updated'] = self.string(item, 'item_time') - if 'item_id' in self.rule: - feedItem['id'] = self.string(item, 'item_id') - feedItem['isPermaLink'] = False + if 'item_title' in self.rule: + feedItem['title'] = self.string(item, 'item_title') + if 'item_link' in self.rule: + url = self.string(item, 'item_link') + url = urlparse.urljoin(self.link, url) + feedItem['link'] = url + if 'item_desc' in self.rule: + feedItem['desc'] = self.string(item, 'item_desc') + if 'item_content' in self.rule: + feedItem['content'] = self.string(item, 'item_content') + if 'item_time' in self.rule: + feedItem['updated'] = self.string(item, 'item_time') + if 'item_id' in self.rule: + feedItem['id'] = self.string(item, 'item_id') + feedItem['isPermaLink'] = False - self.feed.items.append(feedItem) + self.feed.items.append(feedItem) diff --git a/morss/feeds.py b/morss/feeds.py index 7985637..f18232a 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -11,740 +11,740 @@ import json import csv try: - from wheezy.template.engine import Engine - from wheezy.template.loader import DictLoader - from wheezy.template.ext.core import CoreExtension + from wheezy.template.engine import Engine + from wheezy.template.loader import DictLoader + from wheezy.template.ext.core import CoreExtension except ImportError: - Engine = DictLoader = CoreExtension = None + Engine = DictLoader = CoreExtension = None 
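# Note on the context line below (an aside, assuming CPython 2.x — this is not something
# the patch itself changes): `json.encoder.c_make_encoder = None` makes json.dumps() skip
# the C-accelerated encoder and always take the pure-Python iterencode path. A minimal
# sketch of the effect:
#
#   import json, json.encoder
#   json.encoder.c_make_encoder = None            # disable the _json C encoder
#   print json.dumps({'key': 'value'}, indent=4)  # serialized by the pure-Python encoder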
json.encoder.c_make_encoder = None try: - from collections import OrderedDict + from collections import OrderedDict except ImportError: - from ordereddict import OrderedDict + from ordereddict import OrderedDict Element = etree.Element -NSMAP = {'atom': 'http://www.w3.org/2005/Atom', - 'atom03': 'http://purl.org/atom/ns#', - 'media': 'http://search.yahoo.com/mrss/', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'slash': 'http://purl.org/rss/1.0/modules/slash/', - 'dc': 'http://purl.org/dc/elements/1.1/', - 'content': 'http://purl.org/rss/1.0/modules/content/', - 'rssfake': 'http://purl.org/rss/1.0/'} +NSMAP = {'atom': 'http://www.w3.org/2005/Atom', + 'atom03': 'http://purl.org/atom/ns#', + 'media': 'http://search.yahoo.com/mrss/', + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'slash': 'http://purl.org/rss/1.0/modules/slash/', + 'dc': 'http://purl.org/dc/elements/1.1/', + 'content': 'http://purl.org/rss/1.0/modules/content/', + 'rssfake': 'http://purl.org/rss/1.0/'} def load(url): - import urllib2 - d = urllib2.urlopen(url).read() - return parse(d) + import urllib2 + d = urllib2.urlopen(url).read() + return parse(d) def tagNS(tag, nsmap=NSMAP): - match = re.search(r'^\{([^\}]+)\}(.*)$', tag) - if match: - match = match.groups() - for (key, url) in nsmap.iteritems(): - if url == match[0]: - return "%s:%s" % (key, match[1].lower()) - else: - match = re.search(r'^([^:]+):([^:]+)$', tag) - if match: - match = match.groups() - if match[0] in nsmap: - return "{%s}%s" % (nsmap[match[0]], match[1].lower()) - return tag + match = re.search(r'^\{([^\}]+)\}(.*)$', tag) + if match: + match = match.groups() + for (key, url) in nsmap.iteritems(): + if url == match[0]: + return "%s:%s" % (key, match[1].lower()) + else: + match = re.search(r'^([^:]+):([^:]+)$', tag) + if match: + match = match.groups() + if match[0] in nsmap: + return "{%s}%s" % (nsmap[match[0]], match[1].lower()) + return tag def innerHTML(xml): - return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()]) + return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()]) def cleanNode(xml): - [xml.remove(child) for child in xml.iterchildren()] + [xml.remove(child) for child in xml.iterchildren()] class FeedException(Exception): - pass + pass def parse(data): - # encoding - match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) - if match: - enc = match.groups()[0].lower() - if not isinstance(data, unicode): - data = data.decode(enc, 'ignore') - data = data.encode(enc) + # encoding + match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) + if match: + enc = match.groups()[0].lower() + if not isinstance(data, unicode): + data = data.decode(enc, 'ignore') + data = data.encode(enc) - # parse - parser = etree.XMLParser(recover=True) - doc = etree.fromstring(data, parser) + # parse + parser = etree.XMLParser(recover=True) + doc = etree.fromstring(data, parser) - # rss - match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP) - if len(match): - mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS, - 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom } - match = match[0] - tag = tagNS(match.tag) - if tag in mtable: - return mtable[tag](doc, tag) + # rss + match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP) + if len(match): + mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS, + 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom } + 
match = match[0] + tag = tagNS(match.tag) + if tag in mtable: + return mtable[tag](doc, tag) - raise FeedException('unknown feed type') + raise FeedException('unknown feed type') class FeedBase(object): - """ - Base for xml-related classes, which provides simple wrappers around xpath - selection and item creation - """ + """ + Base for xml-related classes, which provides simple wrappers around xpath + selection and item creation + """ - def __getitem__(self, item): - return getattr(self, item) + def __getitem__(self, item): + return getattr(self, item) - def __setitem__(self, item, value): - setattr(self, item, value) + def __setitem__(self, item, value): + setattr(self, item, value) - def __delitem__(self, item): - delattr(self, item) + def __delitem__(self, item): + delattr(self, item) - def __iter__(self): - for element in self.dic: - value = self[element] + def __iter__(self): + for element in self.dic: + value = self[element] - if isinstance(value, FeedList): - value = [OrderedDict(x) for x in value] - elif isinstance(value, datetime): - value = value.isoformat() + if isinstance(value, FeedList): + value = [OrderedDict(x) for x in value] + elif isinstance(value, datetime): + value = value.isoformat() - yield element, value + yield element, value - def xpath(self, path): - """ Test xpath rule on xml tree """ - return self.root.xpath(path, namespaces=NSMAP) + def xpath(self, path): + """ Test xpath rule on xml tree """ + return self.root.xpath(path, namespaces=NSMAP) - def xget(self, path): - """ Returns the 1st xpath match """ - match = self.xpath(path) - if len(match): - return match[0] - else: - return None + def xget(self, path): + """ Returns the 1st xpath match """ + match = self.xpath(path) + if len(match): + return match[0] + else: + return None - def xval(self, path): - """ Returns the .text of the 1st match """ - match = self.xget(path) - if match is not None: - return match.text or "" - else: - return "" + def xval(self, path): + """ Returns the .text of the 1st match """ + match = self.xget(path) + if match is not None: + return match.text or "" + else: + return "" - def xgetCreate(self, table): - """ Returns an element, and creates it when not present """ - value = table[self.tag] - if not isinstance(value, tuple): - value = (value, value) - new, xpath = value - match = self.xget(xpath) - if match is not None: - return match - else: - element = etree.Element(tagNS(new)) - self.root.append(element) - return element + def xgetCreate(self, table): + """ Returns an element, and creates it when not present """ + value = table[self.tag] + if not isinstance(value, tuple): + value = (value, value) + new, xpath = value + match = self.xget(xpath) + if match is not None: + return match + else: + element = etree.Element(tagNS(new)) + self.root.append(element) + return element - def xdel(self, path): - match = self.xget(path) - if match is not None: - return match.getparent().remove(match) + def xdel(self, path): + match = self.xget(path) + if match is not None: + return match.getparent().remove(match) - def tostring(self, **k): - """ Returns string using lxml. Arguments passed to tostring """ - return etree.tostring(self.xml, pretty_print=True, **k) + def tostring(self, **k): + """ Returns string using lxml. Arguments passed to tostring """ + return etree.tostring(self.xml, pretty_print=True, **k) class FeedDescriptor(object): - """ - Descriptor which gives off elements based on "self.getName" and - "self.setName" as getter/setters. 
Looks far better, and avoids duplicates - """ - def __init__(self, name): - self.name = name - self.nname = name[0].upper() + name[1:] + """ + Descriptor which gives off elements based on "self.getName" and + "self.setName" as getter/setters. Looks far better, and avoids duplicates + """ + def __init__(self, name): + self.name = name + self.nname = name[0].upper() + name[1:] - def __get__(self, instance, owner): - getter = getattr(instance, 'get%s' % self.nname) - return getter() + def __get__(self, instance, owner): + getter = getattr(instance, 'get%s' % self.nname) + return getter() - def __set__(self, instance, value): - setter = getattr(instance, 'set%s' % self.nname) - return setter(value) + def __set__(self, instance, value): + setter = getattr(instance, 'set%s' % self.nname) + return setter(value) - def __delete__(self, instance): - deleter = getattr(instance, 'del%s' % self.nname) - return deleter() + def __delete__(self, instance): + deleter = getattr(instance, 'del%s' % self.nname) + return deleter() class FeedTime(FeedDescriptor): - def __get__(self, instance, owner): - getter = getattr(instance, 'get%s' % self.nname) - raw = getter() - try: - time = parseTime(raw) - return time - except ValueError: - return None + def __get__(self, instance, owner): + getter = getattr(instance, 'get%s' % self.nname) + raw = getter() + try: + time = parseTime(raw) + return time + except ValueError: + return None - def __set__(self, instance, value): - try: - time = parseTime(value) - raw = time.strftime(instance.timeFormat) - setter = getattr(instance, 'set%s' % self.nname) - return setter(raw) - except ValueError: - pass + def __set__(self, instance, value): + try: + time = parseTime(value) + raw = time.strftime(instance.timeFormat) + setter = getattr(instance, 'set%s' % self.nname) + return setter(raw) + except ValueError: + pass class FeedBool(FeedDescriptor): - def __get__(self, instance, owner): - getter = getattr(instance, 'get%s' % self.nname) - raw = getter() - return (raw or '').lower() != 'false' + def __get__(self, instance, owner): + getter = getattr(instance, 'get%s' % self.nname) + raw = getter() + return (raw or '').lower() != 'false' - def __set__(self, instance, value): - raw = 'true' if value else 'false' - setter = getattr(instance, 'set%s' % self.nname) - return setter(raw) + def __set__(self, instance, value): + raw = 'true' if value else 'false' + setter = getattr(instance, 'set%s' % self.nname) + return setter(raw) def parseTime(value): - if isinstance(value, basestring): - if re.match(r'^[0-9]+$', value): - return datetime.fromtimestamp(int(value), tz.tzutc()) - else: - return dateutil.parser.parse(value, tzinfos=tz.tzutc) - elif isinstance(value, int): - return datetime.fromtimestamp(value, tz.tzutc()) - elif isinstance(value, datetime): - return value - else: - return False + if isinstance(value, basestring): + if re.match(r'^[0-9]+$', value): + return datetime.fromtimestamp(int(value), tz.tzutc()) + else: + return dateutil.parser.parse(value, tzinfos=tz.tzutc) + elif isinstance(value, int): + return datetime.fromtimestamp(value, tz.tzutc()) + elif isinstance(value, datetime): + return value + else: + return False class FeedList(object): - """ - Class to map a list of xml elements against a list of matching objects, - while avoiding to recreate the same matching object over and over again. So - as to avoid extra confusion, list's elements are called "children" here, so - as not to use "items", which is already in use in RSS/Atom related code. 
+ """ + Class to map a list of xml elements against a list of matching objects, + while avoiding to recreate the same matching object over and over again. So + as to avoid extra confusion, list's elements are called "children" here, so + as not to use "items", which is already in use in RSS/Atom related code. - Comes with its very own descriptor. - """ - def __init__(self, parent, getter, tag, childClass): - self.parent = parent - self.getter = getter - self.childClass = childClass - self.tag = tag - self._children = {} # id(xml) => FeedItem + Comes with its very own descriptor. + """ + def __init__(self, parent, getter, tag, childClass): + self.parent = parent + self.getter = getter + self.childClass = childClass + self.tag = tag + self._children = {} # id(xml) => FeedItem - def getChildren(self): - children = self.getter() - out = [] - for child in children: - if id(child) in self._children: - out.append(self._children[id(child)]) - else: - new = self.childClass(child, self.tag) - self._children[id(child)] = new - out.append(new) - return out + def getChildren(self): + children = self.getter() + out = [] + for child in children: + if id(child) in self._children: + out.append(self._children[id(child)]) + else: + new = self.childClass(child, self.tag) + self._children[id(child)] = new + out.append(new) + return out - def append(self, cousin=None): - new = self.childClass(tag=self.tag) - self.parent.root.append(new.xml) - self._children[id(new.xml)] = new + def append(self, cousin=None): + new = self.childClass(tag=self.tag) + self.parent.root.append(new.xml) + self._children[id(new.xml)] = new - if cousin is None: - return new + if cousin is None: + return new - for key in self.childClass.__dict__: - if key[:3] == 'set': - attr = key[3:].lower() - if hasattr(cousin, attr): - setattr(new, attr, getattr(cousin, attr)) - elif attr in cousin: - setattr(new, attr, cousin[attr]) + for key in self.childClass.__dict__: + if key[:3] == 'set': + attr = key[3:].lower() + if hasattr(cousin, attr): + setattr(new, attr, getattr(cousin, attr)) + elif attr in cousin: + setattr(new, attr, cousin[attr]) - return new + return new - def __getitem__(self, key): - return self.getChildren()[key] + def __getitem__(self, key): + return self.getChildren()[key] - def __delitem__(self, key): - child = self.getter()[key] - if id(child) in self._children: - self._children[id(child)].remove() - del self._children[id(child)] - else: - child.getparent().remove(child) + def __delitem__(self, key): + child = self.getter()[key] + if id(child) in self._children: + self._children[id(child)].remove() + del self._children[id(child)] + else: + child.getparent().remove(child) - def __len__(self): - return len(self.getter()) + def __len__(self): + return len(self.getter()) class FeedListDescriptor(object): - """ - Descriptor for FeedList - """ - def __init__(self, name): - self.name = name - self.items = {} # id(instance) => FeedList + """ + Descriptor for FeedList + """ + def __init__(self, name): + self.name = name + self.items = {} # id(instance) => FeedList - def __get__(self, instance, owner=None): - key = id(instance) - if key in self.items: - return self.items[key] - else: - getter = getattr(instance, 'get%s' % self.name.title()) - className = globals()[getattr(instance, '%sClass' % self.name)] - self.items[key] = FeedList(instance, getter, instance.tag, className) - return self.items[key] + def __get__(self, instance, owner=None): + key = id(instance) + if key in self.items: + return self.items[key] + else: + getter = 
getattr(instance, 'get%s' % self.name.title()) + className = globals()[getattr(instance, '%sClass' % self.name)] + self.items[key] = FeedList(instance, getter, instance.tag, className) + return self.items[key] - def __set__(self, instance, value): - feedlist = self.__get__(instance) - [x.remove() for x in [x for x in f.items]] - [feedlist.append(x) for x in value] + def __set__(self, instance, value): + feedlist = self.__get__(instance) + [x.remove() for x in [x for x in f.items]] + [feedlist.append(x) for x in value] class FeedParser(FeedBase): - itemsClass = 'FeedItem' - mimetype = 'application/xml' - base = '' - dic = ('title', 'desc', 'items') + itemsClass = 'FeedItem' + mimetype = 'application/xml' + base = '' + dic = ('title', 'desc', 'items') - def __init__(self, xml=None, tag='atom:feed'): - if xml is None: - xml = etree.fromstring(self.base[tag]) - self.xml = xml - self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0] - self.tag = tag + def __init__(self, xml=None, tag='atom:feed'): + if xml is None: + xml = etree.fromstring(self.base[tag]) + self.xml = xml + self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0] + self.tag = tag - def getTitle(self): - return "" + def getTitle(self): + return "" - def setTitle(self, value): - pass + def setTitle(self, value): + pass - def delTitle(self): - self.title = "" + def delTitle(self): + self.title = "" - def getDesc(self): - pass + def getDesc(self): + pass - def setDesc(self, value): - pass + def setDesc(self, value): + pass - def delDesc(self): - self.desc = "" + def delDesc(self): + self.desc = "" - def getItems(self): - return [] + def getItems(self): + return [] - title = FeedDescriptor('title') - description = desc = FeedDescriptor('desc') - items = FeedListDescriptor('items') + title = FeedDescriptor('title') + description = desc = FeedDescriptor('desc') + items = FeedListDescriptor('items') - def tostring(self, **k): - return etree.tostring(self.xml.getroottree(), pretty_print=True, **k) + def tostring(self, **k): + return etree.tostring(self.xml.getroottree(), pretty_print=True, **k) - def tojson(self, indent=None): - return json.dumps(OrderedDict(self), indent=indent) + def tojson(self, indent=None): + return json.dumps(OrderedDict(self), indent=indent) - def tocsv(self): - out = StringIO() - c = csv.writer(out, dialect=csv.excel) - for item in self.items: - row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)] - c.writerow(row) - out.seek(0) - return out.read() + def tocsv(self): + out = StringIO() + c = csv.writer(out, dialect=csv.excel) + for item in self.items: + row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)] + c.writerow(row) + out.seek(0) + return out.read() - def tohtml(self): - if DictLoader is None: - log('dep wheezy.template needed') + def tohtml(self): + if DictLoader is None: + log('dep wheezy.template needed') - loader = DictLoader({'reader': open('reader.html.template').read()}) - engine = Engine(loader=loader, extensions=[CoreExtension()]) - template = engine.get_template('reader') - return template.render({'feed':self}).encode('utf-8') + loader = DictLoader({'reader': open('reader.html.template').read()}) + engine = Engine(loader=loader, extensions=[CoreExtension()]) + template = engine.get_template('reader') + return template.render({'feed':self}).encode('utf-8') class 
FeedParserRSS(FeedParser): - """ - RSS Parser - """ - itemsClass = 'FeedItemRSS' - mimetype = 'application/rss+xml' - base = { 'rdf:rdf': '', - 'channel': ''} + """ + RSS Parser + """ + itemsClass = 'FeedItemRSS' + mimetype = 'application/rss+xml' + base = { 'rdf:rdf': '', + 'channel': ''} - def getTitle(self): - return self.xval('rssfake:title|title') + def getTitle(self): + return self.xval('rssfake:title|title') - def setTitle(self, value): - if not value: - return self.xdel('rssfake:title|title') + def setTitle(self, value): + if not value: + return self.xdel('rssfake:title|title') - table = { 'rdf:rdf': 'rssfake:title', - 'channel': 'title'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:title', + 'channel': 'title'} + element = self.xgetCreate(table) + element.text = value - def getDesc(self): - return self.xval('rssfake:description|description') + def getDesc(self): + return self.xval('rssfake:description|description') - def setDesc(self, value): - if not value: - return self.xdel('rssfake:description|description') + def setDesc(self, value): + if not value: + return self.xdel('rssfake:description|description') - table = { 'rdf:rdf': 'rssfake:description', - 'channel': 'description'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:description', + 'channel': 'description'} + element = self.xgetCreate(table) + element.text = value - def getItems(self): - return self.xpath('rssfake:item|item') + def getItems(self): + return self.xpath('rssfake:item|item') class FeedParserAtom(FeedParser): - """ - Atom Parser - """ - itemsClass = 'FeedItemAtom' - mimetype = 'application/atom+xml' - base = { 'atom:feed': '', - 'atom03:feed': ''} + """ + Atom Parser + """ + itemsClass = 'FeedItemAtom' + mimetype = 'application/atom+xml' + base = { 'atom:feed': '', + 'atom03:feed': ''} - def getTitle(self): - return self.xval('atom:title|atom03:title') + def getTitle(self): + return self.xval('atom:title|atom03:title') - def setTitle(self, value): - if not value: - return self.xval('atom:title|atom03:title') + def setTitle(self, value): + if not value: + return self.xval('atom:title|atom03:title') - table = { 'atom:feed': 'atom:title', - 'atom03:feed': 'atom03:title'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:title', + 'atom03:feed': 'atom03:title'} + element = self.xgetCreate(table) + element.text = value - def getDesc(self): - return self.xval('atom:subtitle|atom03:subtitle') + def getDesc(self): + return self.xval('atom:subtitle|atom03:subtitle') - def setDesc(self, value): - if not value: - return self.xdel('atom:subtitle|atom03:subtitle') + def setDesc(self, value): + if not value: + return self.xdel('atom:subtitle|atom03:subtitle') - table = { 'atom:feed': 'atom:subtitle', - 'atom03:feed': 'atom03:subtitle'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:subtitle', + 'atom03:feed': 'atom03:subtitle'} + element = self.xgetCreate(table) + element.text = value - def getItems(self): - return self.xpath('atom:entry|atom03:entry') + def getItems(self): + return self.xpath('atom:entry|atom03:entry') class FeedItem(FeedBase): - timeFormat = '' - dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated') + timeFormat = '' + dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated') - def __init__(self, xml=None, tag='atom:feed'): - if xml is None: - xml = Element(tagNS(self.base[tag])) + def 
__init__(self, xml=None, tag='atom:feed'): + if xml is None: + xml = Element(tagNS(self.base[tag])) - self.root = self.xml = xml - self.tag = tag + self.root = self.xml = xml + self.tag = tag - def getTitle(self): - return "" + def getTitle(self): + return "" - def setTitle(self): - pass + def setTitle(self): + pass - def delTitle(self): - self.title = "" + def delTitle(self): + self.title = "" - def getLink(self): - return "" + def getLink(self): + return "" - def setLink(self, value): - pass + def setLink(self, value): + pass - def delLink(self): - self.link = "" + def delLink(self): + self.link = "" - def getIsPermaLink(self): - return "" + def getIsPermaLink(self): + return "" - def setIsPermaLink(self, value): - pass + def setIsPermaLink(self, value): + pass - def getDesc(self): - return "" + def getDesc(self): + return "" - def setDesc(self, value): - pass + def setDesc(self, value): + pass - def delDesc(self): - self.desc = "" + def delDesc(self): + self.desc = "" - def getContent(self): - return "" + def getContent(self): + return "" - def setContent(self, value): - pass + def setContent(self, value): + pass - def delContent(self): - self.content = "" + def delContent(self): + self.content = "" - def getId(self): - return "" + def getId(self): + return "" - def setId(self, value): - pass + def setId(self, value): + pass - def delId(self): - self.id = "" + def delId(self): + self.id = "" - def getTime(self): - return None + def getTime(self): + return None - def setTime(self, value): - pass + def setTime(self, value): + pass - def delTime(self): - self.time = None + def delTime(self): + self.time = None - def getUpdated(self): - return None + def getUpdated(self): + return None - def setUpdated(self, value): - pass + def setUpdated(self, value): + pass - def delUpdated(self): - self.updated = None + def delUpdated(self): + self.updated = None - title = FeedDescriptor('title') - link = FeedDescriptor('link') - description = desc = FeedDescriptor('desc') - content = FeedDescriptor('content') - id = FeedDescriptor('id') - isPermaLink = FeedBool('isPermaLink') - time = FeedTime('time') - updated = FeedTime('updated') + title = FeedDescriptor('title') + link = FeedDescriptor('link') + description = desc = FeedDescriptor('desc') + content = FeedDescriptor('content') + id = FeedDescriptor('id') + isPermaLink = FeedBool('isPermaLink') + time = FeedTime('time') + updated = FeedTime('updated') - def pushContent(self, value): - if not self.desc and self.content: - self.desc = self.content + def pushContent(self, value): + if not self.desc and self.content: + self.desc = self.content - self.content = value + self.content = value - def remove(self): - self.xml.getparent().remove(self.xml) + def remove(self): + self.xml.getparent().remove(self.xml) class FeedItemRSS(FeedItem): - timeFormat = '%a, %d %b %Y %H:%M:%S %Z' - base = { 'rdf:rdf': 'rssfake:item', - 'channel': 'item'} + timeFormat = '%a, %d %b %Y %H:%M:%S %Z' + base = { 'rdf:rdf': 'rssfake:item', + 'channel': 'item'} - def getTitle(self): - return self.xval('rssfake:title|title') + def getTitle(self): + return self.xval('rssfake:title|title') - def setTitle(self, value): - if not value: - return self.xdel('rssfake:title|title') + def setTitle(self, value): + if not value: + return self.xdel('rssfake:title|title') - table = { 'rdf:rdf': 'rssfake:title', - 'channel': 'title'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:title', + 'channel': 'title'} + element = self.xgetCreate(table) + 
element.text = value - def getLink(self): - return self.xval('rssfake:link|link') + def getLink(self): + return self.xval('rssfake:link|link') - def setLink(self, value): - if self.isPermaLink and self.id == self.link != value: - self.isPermaLink = False + def setLink(self, value): + if self.isPermaLink and self.id == self.link != value: + self.isPermaLink = False - table = { 'rdf:rdf': 'rssfake:link', - 'channel': 'link'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:link', + 'channel': 'link'} + element = self.xgetCreate(table) + element.text = value - def getDesc(self): - return self.xval('rssfake:description|description') + def getDesc(self): + return self.xval('rssfake:description|description') - def setDesc(self, value): - if not value: - return self.xdel('rssfake:description|description') + def setDesc(self, value): + if not value: + return self.xdel('rssfake:description|description') - table = { 'rdf:rdf': 'rssfake:description', - 'channel': 'description'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:description', + 'channel': 'description'} + element = self.xgetCreate(table) + element.text = value - def getContent(self): - return self.xval('content:encoded') + def getContent(self): + return self.xval('content:encoded') - def setContent(self, value): - if not value: - return self.xdel('content:encoded') + def setContent(self, value): + if not value: + return self.xdel('content:encoded') - table = { 'rdf:rdf': 'content:encoded', - 'channel': 'content:encoded'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'content:encoded', + 'channel': 'content:encoded'} + element = self.xgetCreate(table) + element.text = value - def getId(self): - return self.xval('rssfake:guid|guid') + def getId(self): + return self.xval('rssfake:guid|guid') - def setId(self, value): - if not value: - return self.xdel('rssfake:guid|guid') + def setId(self, value): + if not value: + return self.xdel('rssfake:guid|guid') - table = { 'rdf:rdf': 'rssfake:guid', - 'channel': 'guid'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:guid', + 'channel': 'guid'} + element = self.xgetCreate(table) + element.text = value - def getIsPermaLink(self): - return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink') + def getIsPermaLink(self): + return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink') - def setIsPermaLink(self, value): - table = { 'rdf:rdf': 'rssfake:guid', - 'channel': 'guid'} - element = self.xgetCreate(table) - element.attrib['isPermaLink'] = value + def setIsPermaLink(self, value): + table = { 'rdf:rdf': 'rssfake:guid', + 'channel': 'guid'} + element = self.xgetCreate(table) + element.attrib['isPermaLink'] = value - def getTime(self): - return self.xval('rssfake:pubDate|pubDate') + def getTime(self): + return self.xval('rssfake:pubDate|pubDate') - def setTime(self, value): - if not value: - return self.xdel('rssfake:pubDate|pubDate') + def setTime(self, value): + if not value: + return self.xdel('rssfake:pubDate|pubDate') - table = { 'rdf:rdf': 'rssfake:pubDate', - 'channel': 'pubDate'} - element = self.xgetCreate(table) - element.text = value + table = { 'rdf:rdf': 'rssfake:pubDate', + 'channel': 'pubDate'} + element = self.xgetCreate(table) + element.text = value class FeedItemAtom(FeedItem): - timeFormat = '%Y-%m-%dT%H:%M:%SZ' - base = { 'atom:feed': 'atom:entry', - 'atom03:feed': 'atom03:entry'} + timeFormat = '%Y-%m-%dT%H:%M:%SZ' + 
base = { 'atom:feed': 'atom:entry', + 'atom03:feed': 'atom03:entry'} - def getTitle(self): - return self.xval('atom:title|atom03:title') + def getTitle(self): + return self.xval('atom:title|atom03:title') - def setTitle(self, value): - if not value: - return self.xdel('atom:title|atom03:title') + def setTitle(self, value): + if not value: + return self.xdel('atom:title|atom03:title') - table = { 'atom:feed': 'atom:title', - 'atom03:feed': 'atom03:title'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:title', + 'atom03:feed': 'atom03:title'} + element = self.xgetCreate(table) + element.text = value - def getLink(self): - return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href') + def getLink(self): + return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href') - def setLink(self, value): - table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'), - 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')} - element = self.xgetCreate(table) - element.attrib['href'] = value + def setLink(self, value): + table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'), + 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')} + element = self.xgetCreate(table) + element.attrib['href'] = value - def getDesc(self): - # default "type" is "text" - element = self.xget('atom:summary|atom03:summary') - if element is not None: - return innerHTML(element) - else: - return "" + def getDesc(self): + # default "type" is "text" + element = self.xget('atom:summary|atom03:summary') + if element is not None: + return innerHTML(element) + else: + return "" - def setDesc(self, value): - if not value: - return self.xdel('atom:summary|atom03:summary') + def setDesc(self, value): + if not value: + return self.xdel('atom:summary|atom03:summary') - table = { 'atom:feed': 'atom:summary', - 'atom03:feed': 'atom03:summary'} - element = self.xgetCreate(table) - if element.attrib.get('type', '') == 'xhtml': - cleanNode(element) - element.attrib['type'] = 'html' - element.text = value + table = { 'atom:feed': 'atom:summary', + 'atom03:feed': 'atom03:summary'} + element = self.xgetCreate(table) + if element.attrib.get('type', '') == 'xhtml': + cleanNode(element) + element.attrib['type'] = 'html' + element.text = value - def getContent(self): - element = self.xget('atom:content|atom03:content') - if element is not None: - return innerHTML(element) - else: - return "" + def getContent(self): + element = self.xget('atom:content|atom03:content') + if element is not None: + return innerHTML(element) + else: + return "" - def setContent(self, value): - if not value: - return self.xdel('atom:content|atom03:content') + def setContent(self, value): + if not value: + return self.xdel('atom:content|atom03:content') - table = { 'atom:feed': 'atom:content', - 'atom03:feed': 'atom03:content'} - element = self.xgetCreate(table) - if element.attrib.get('type', '') == 'xhtml': - cleanNode(element) - element.attrib['type'] = 'html' - element.text = value + table = { 'atom:feed': 'atom:content', + 'atom03:feed': 'atom03:content'} + element = self.xgetCreate(table) + if element.attrib.get('type', '') == 'xhtml': + cleanNode(element) + element.attrib['type'] = 'html' + element.text = value - def getId(self): - return self.xval('atom:id|atom03:id') + def getId(self): + return self.xval('atom:id|atom03:id') - def setId(self, value): - if not value: - return 
self.xdel('atom:id|atom03:id') + def setId(self, value): + if not value: + return self.xdel('atom:id|atom03:id') - table = { 'atom:feed': 'atom:id', - 'atom03:feed': 'atom03:id'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:id', + 'atom03:feed': 'atom03:id'} + element = self.xgetCreate(table) + element.text = value - def getTime(self): - return self.xval('atom:published|atom03:published') + def getTime(self): + return self.xval('atom:published|atom03:published') - def setTime(self, value): - if not value: - return self.xdel('atom:published|atom03:published') + def setTime(self, value): + if not value: + return self.xdel('atom:published|atom03:published') - table = { 'atom:feed': 'atom:published', - 'atom03:feed': 'atom03:published'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:published', + 'atom03:feed': 'atom03:published'} + element = self.xgetCreate(table) + element.text = value - def getUpdated(self): - return self.xval('atom:updated|atom03:updated') + def getUpdated(self): + return self.xval('atom:updated|atom03:updated') - def setUpdated(self, value): - if not value: - return self.xdel('atom:updated|atom03:updated') + def setUpdated(self, value): + if not value: + return self.xdel('atom:updated|atom03:updated') - table = { 'atom:feed': 'atom:updated', - 'atom03:feed': 'atom03:updated'} - element = self.xgetCreate(table) - element.text = value + table = { 'atom:feed': 'atom:updated', + 'atom03:feed': 'atom03:updated'} + element = self.xgetCreate(table) + element.text = value diff --git a/morss/morss.py b/morss/morss.py index 611cb0f..2b9884c 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -31,21 +31,21 @@ from StringIO import StringIO from readability import readability from html2text import HTML2Text -LIM_ITEM = 100 # deletes what's beyond -LIM_TIME = 7 # deletes what's after -MAX_ITEM = 50 # cache-only beyond -MAX_TIME = 7 # cache-only after (in sec) -DELAY = 10*60 # xml cache & ETag cache (in sec) -TIMEOUT = 2 # http timeout (in sec) -THREADS = 10 # number of threads (1 for single-threaded) +LIM_ITEM = 100 # deletes what's beyond +LIM_TIME = 7 # deletes what's after +MAX_ITEM = 50 # cache-only beyond +MAX_TIME = 7 # cache-only after (in sec) +DELAY = 10*60 # xml cache & ETag cache (in sec) +TIMEOUT = 2 # http timeout (in sec) +THREADS = 10 # number of threads (1 for single-threaded) DEBUG = False UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)' UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0' -MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], - 'html': ['text/html', 'application/xhtml+xml', 'application/xml']} +MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], + 'html': ['text/html', 'application/xhtml+xml', 'application/xml']} FBAPPID = "" FBSECRET = "" @@ -54,791 +54,791 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET PROTOCOL = ['http', 'https', 'ftp'] if 'SCRIPT_NAME' in os.environ: - httplib.HTTPConnection.debuglevel = 1 + httplib.HTTPConnection.debuglevel = 1 - import cgitb - cgitb.enable() + import cgitb + cgitb.enable() class MorssException(Exception): - pass + pass def log(txt, force=False): - if DEBUG or force: - if 'REQUEST_URI' in os.environ: - open('morss.log', 'a').write("%s\n" % repr(txt)) - else: - print repr(txt) + if DEBUG or force: + if 'REQUEST_URI' in os.environ: + open('morss.log', 
'a').write("%s\n" % repr(txt)) + else: + print repr(txt) def lenHTML(txt): - if len(txt): - return len(lxml.html.fromstring(txt).text_content()) - else: - return 0 + if len(txt): + return len(lxml.html.fromstring(txt).text_content()) + else: + return 0 def countWord(txt): - if len(txt): - return len(lxml.html.fromstring(txt).text_content().split()) - else: - return 0 + if len(txt): + return len(lxml.html.fromstring(txt).text_content().split()) + else: + return 0 class Options: - def __init__(self, options=None): - self.options = options or [] + def __init__(self, options=None): + self.options = options or [] - def __getattr__(self, key): - return key in self.options + def __getattr__(self, key): + return key in self.options - def __setitem__(self, key, value): - self.options[key] = value + def __setitem__(self, key, value): + self.options[key] = value - def __contains__(self, key): - return key in self.options + def __contains__(self, key): + return key in self.options class Cache: - """ Light, error-prone caching system. """ - def __init__(self, folder=None, key='cache', lifespan=10*24*3600): - self._key = key - self._dir = folder - self._lifespan = lifespan + """ Light, error-prone caching system. """ + def __init__(self, folder=None, key='cache', lifespan=10*24*3600): + self._key = key + self._dir = folder + self._lifespan = lifespan - self._cache = {} + self._cache = {} - if self._dir is None: - self._hash = "NO CACHE" - return + if self._dir is None: + self._hash = "NO CACHE" + return - maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp" - self._hash = urllib.quote_plus(self._key)[:maxsize] + maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp" + self._hash = urllib.quote_plus(self._key)[:maxsize] - self._file = self._dir + '/' + self._hash - self._file_tmp = self._file + '.tmp' + self._file = self._dir + '/' + self._hash + self._file_tmp = self._file + '.tmp' - if os.path.isfile(self._file): - data = open(self._file).read() - if data: - self._cache = json.loads(data) + if os.path.isfile(self._file): + data = open(self._file).read() + if data: + self._cache = json.loads(data) - def __del__(self): - self.save() + def __del__(self): + self.save() - def __contains__(self, key): - return key in self._cache + def __contains__(self, key): + return key in self._cache - def get(self, key): - if key in self._cache: - self._cache[key]['last'] = time.time() - return self._cache[key]['value'] - else: - return None + def get(self, key): + if key in self._cache: + self._cache[key]['last'] = time.time() + return self._cache[key]['value'] + else: + return None - def set(self, key, content): - self._cache[key] = {'last': time.time(), 'value': content} + def set(self, key, content): + self._cache[key] = {'last': time.time(), 'value': content} - __getitem__ = get - __setitem__ = set + __getitem__ = get + __setitem__ = set - def save(self): - if len(self._cache) == 0 or self._dir is None: - return + def save(self): + if len(self._cache) == 0 or self._dir is None: + return - if not os.path.exists(self._dir): - os.makedirs(self._dir) + if not os.path.exists(self._dir): + os.makedirs(self._dir) - for i in self._cache.keys(): - if time.time() - self._cache[i]['last'] > self._lifespan > -1: - del self._cache[i] + for i in self._cache.keys(): + if time.time() - self._cache[i]['last'] > self._lifespan > -1: + del self._cache[i] - out = json.dumps(self._cache, indent=4) + out = json.dumps(self._cache, indent=4) - try: - open(self._file_tmp, 'w+').write(out) - 
os.rename(self._file_tmp, self._file) - except IOError: - log('failed to write cache to tmp file') - except OSError: - log('failed to move cache to file') + try: + open(self._file_tmp, 'w+').write(out) + os.rename(self._file_tmp, self._file) + except IOError: + log('failed to write cache to tmp file') + except OSError: + log('failed to move cache to file') - def last(self, key): - if key not in self._cache: - return -1 + def last(self, key): + if key not in self._cache: + return -1 - return self._cache[key]['last'] + return self._cache[key]['last'] - def age(self, key): - if key not in self._cache: - return -1 + def age(self, key): + if key not in self._cache: + return -1 - return time.time() - self.last(key) + return time.time() - self.last(key) - def new(self, *arg, **karg): - """ Returns a Cache object in the same directory """ - if arg[0] != self._key: - return Cache(self._dir, *arg, **karg) - else: - return self + def new(self, *arg, **karg): + """ Returns a Cache object in the same directory """ + if arg[0] != self._key: + return Cache(self._dir, *arg, **karg) + else: + return self class SimpleDownload(urllib2.HTTPCookieProcessor): - """ - Custom urllib2 handler to download a page, using etag/last-modified headers, - to save bandwidth. The given headers are added back into the header on error - 304 for easier use. - """ - def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False): - urllib2.HTTPCookieProcessor.__init__(self, cookiejar) - self.cache = cache - self.etag = etag - self.lastmodified = lastmodified - self.useragent = useragent - self.decode = decode - self.accept = accept - self.strict = strict + """ + Custom urllib2 handler to download a page, using etag/last-modified headers, + to save bandwidth. The given headers are added back into the header on error + 304 for easier use. 
+ """ + def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False): + urllib2.HTTPCookieProcessor.__init__(self, cookiejar) + self.cache = cache + self.etag = etag + self.lastmodified = lastmodified + self.useragent = useragent + self.decode = decode + self.accept = accept + self.strict = strict - def http_request(self, req): - urllib2.HTTPCookieProcessor.http_request(self, req) - req.add_unredirected_header('Accept-Encoding', 'gzip') - req.add_unredirected_header('User-Agent', self.useragent) - if req.get_host() != 'feeds.feedburner.com': - req.add_unredirected_header('Referer', 'http://%s' % req.get_host()) + def http_request(self, req): + urllib2.HTTPCookieProcessor.http_request(self, req) + req.add_unredirected_header('Accept-Encoding', 'gzip') + req.add_unredirected_header('User-Agent', self.useragent) + if req.get_host() != 'feeds.feedburner.com': + req.add_unredirected_header('Referer', 'http://%s' % req.get_host()) - if self.cache: - if self.etag: - req.add_unredirected_header('If-None-Match', self.etag) - if self.lastmodified: - req.add_unredirected_header('If-Modified-Since', self.lastmodified) + if self.cache: + if self.etag: + req.add_unredirected_header('If-None-Match', self.etag) + if self.lastmodified: + req.add_unredirected_header('If-Modified-Since', self.lastmodified) - if self.accept is not None: - if isinstance(self.accept, basestring): - self.accept = (self.accept,) + if self.accept is not None: + if isinstance(self.accept, basestring): + self.accept = (self.accept,) - out = {} - rank = 1.1 - for group in self.accept: - rank = rank - 0.1 + out = {} + rank = 1.1 + for group in self.accept: + rank = rank - 0.1 - if isinstance(group, basestring): - if group in MIMETYPE: - group = MIMETYPE[group] - else: - out[group] = rank - continue + if isinstance(group, basestring): + if group in MIMETYPE: + group = MIMETYPE[group] + else: + out[group] = rank + continue - for mime in group: - if mime not in out: - out[mime] = rank + for mime in group: + if mime not in out: + out[mime] = rank - if not self.strict: - out['*/*'] = rank-0.1 + if not self.strict: + out['*/*'] = rank-0.1 - string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out]) - req.add_unredirected_header('Accept', string) + string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out]) + req.add_unredirected_header('Accept', string) - return req + return req - def http_error_304(self, req, fp, code, msg, headers): - log('http cached') - if self.etag: - headers.addheader('etag', self.etag) - if self.lastmodified: - headers.addheader('last-modified', self.lastmodified) - resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200) - return resp + def http_error_304(self, req, fp, code, msg, headers): + log('http cached') + if self.etag: + headers.addheader('etag', self.etag) + if self.lastmodified: + headers.addheader('last-modified', self.lastmodified) + resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200) + return resp - def http_response(self, req, resp): - urllib2.HTTPCookieProcessor.http_response(self, req, resp) - data = resp.read() + def http_response(self, req, resp): + urllib2.HTTPCookieProcessor.http_response(self, req, resp) + data = resp.read() - if 200 <= resp.code < 300: - # gzip - if resp.headers.get('Content-Encoding') == 'gzip': - log('un-gzip') - data = GzipFile(fileobj=StringIO(data), mode='r').read() + if 200 <= resp.code < 300: + 
# gzip + if resp.headers.get('Content-Encoding') == 'gzip': + log('un-gzip') + data = GzipFile(fileobj=StringIO(data), mode='r').read() - if 200 <= resp.code < 300 and resp.info().maintype == 'text': - # redirect - if resp.info().type in MIMETYPE['html']: - match = re.search(r'(?i)]*?url=(http.*?)["\']', data) - if match: - newurl = match.groups()[0] - log('redirect: %s' % newurl) + if 200 <= resp.code < 300 and resp.info().maintype == 'text': + # redirect + if resp.info().type in MIMETYPE['html']: + match = re.search(r'(?i)]*?url=(http.*?)["\']', data) + if match: + newurl = match.groups()[0] + log('redirect: %s' % newurl) - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ('content-length', 'content-type')) - new = urllib2.Request(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) + newheaders = dict((k,v) for k,v in req.headers.items() + if k.lower() not in ('content-length', 'content-type')) + new = urllib2.Request(newurl, + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) - return self.parent.open(new, timeout=req.timeout) + return self.parent.open(new, timeout=req.timeout) - # encoding - enc = detEncoding(data, resp) + # encoding + enc = detEncoding(data, resp) - if enc: - data = data.decode(enc, 'replace') + if enc: + data = data.decode(enc, 'replace') - if not self.decode: - data = data.encode(enc) + if not self.decode: + data = data.encode(enc) - fp = StringIO(data) - old_resp = resp - resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg + fp = StringIO(data) + old_resp = resp + resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg - return resp + return resp - https_response = http_response - https_request = http_request + https_response = http_response + https_request = http_request def detEncoding(data, con=None): - if con is not None and con.headers.getparam('charset'): - log('header') - return con.headers.getparam('charset') + if con is not None and con.headers.getparam('charset'): + log('header') + return con.headers.getparam('charset') - match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000]) - if match: - log('meta.re') - return match.groups()[0] + match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000]) + if match: + log('meta.re') + return match.groups()[0] - match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) - if match: - return match.groups()[0].lower() + match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) + if match: + return match.groups()[0].lower() - return None + return None def Fix(item, feedurl='/'): - """ Improves feed items (absolute links, resolve feedburner links, etc) """ + """ Improves feed items (absolute links, resolve feedburner links, etc) """ - # check unwanted uppercase title - if len(item.title) > 20 and item.title.isupper(): - item.title = item.title.title() + # check unwanted uppercase title + if len(item.title) > 20 and item.title.isupper(): + item.title = item.title.title() - # check if it includes link - if not item.link: - log('no link') - return item + # check if it includes link + if not item.link: + log('no link') + return item - # wikipedia daily highlight - if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'): - match = lxml.html.fromstring(item.desc).xpath('//b/a/@href') - if len(match): - item.link = match[0] - log(item.link) + # wikipedia daily highlight + if 
fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'): + match = lxml.html.fromstring(item.desc).xpath('//b/a/@href') + if len(match): + item.link = match[0] + log(item.link) - # check relative urls - item.link = urlparse.urljoin(feedurl, item.link) + # check relative urls + item.link = urlparse.urljoin(feedurl, item.link) - # google translate - if fnmatch(item.link, 'http://translate.google.*/translate*u=*'): - item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0] - log(item.link) + # google translate + if fnmatch(item.link, 'http://translate.google.*/translate*u=*'): + item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0] + log(item.link) - # google - if fnmatch(item.link, 'http://www.google.*/url?q=*'): - item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0] - log(item.link) + # google + if fnmatch(item.link, 'http://www.google.*/url?q=*'): + item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0] + log(item.link) - # google news - if fnmatch(item.link, 'http://news.google.com/news/url*url=*'): - item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0] - log(item.link) + # google news + if fnmatch(item.link, 'http://news.google.com/news/url*url=*'): + item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0] + log(item.link) - # facebook - if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'): - item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0] - log(item.link) + # facebook + if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'): + item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0] + log(item.link) - # feedburner - feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0' - match = item.xval('feedburner:origLink') - if match: - item.link = match + # feedburner + feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0' + match = item.xval('feedburner:origLink') + if match: + item.link = match - # feedsportal - match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link) - if match: - url = match.groups()[0].split('0') - t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'} - item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]]) - log(item.link) + # feedsportal + match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link) + if match: + url = match.groups()[0].split('0') + t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'} + item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]]) + log(item.link) - # reddit - if urlparse.urlparse(feedurl).netloc == 'www.reddit.com': - match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href') - if len(match): - item.link = match[0] - log(item.link) + # reddit + if urlparse.urlparse(feedurl).netloc == 'www.reddit.com': + match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href') + if len(match): + item.link = match[0] + log(item.link) - return item + return item def Fill(item, cache, feedurl='/', fast=False): - """ Returns True when it has done its best """ + """ Returns True when it has done its best """ - if not item.link: - log('no link') - return item + if not item.link: + log('no link') + return item - log(item.link) + log(item.link) - # content already provided? 
- count_content = countWord(item.content) - count_desc = countWord(item.desc) + # content already provided? + count_content = countWord(item.content) + count_desc = countWord(item.desc) - if max(count_content, count_desc) > 500: - if count_desc > count_content: - item.content = item.desc - del item.desc - log('reversed sizes') - log('long enough') - return True + if max(count_content, count_desc) > 500: + if count_desc > count_content: + item.content = item.desc + del item.desc + log('reversed sizes') + log('long enough') + return True - if count_content > 5*count_desc > 0 and count_content > 50: - log('content bigger enough') - return True + if count_content > 5*count_desc > 0 and count_content > 50: + log('content bigger enough') + return True - link = item.link + link = item.link - # twitter - if urlparse.urlparse(feedurl).netloc == 'twitter.com': - match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url') - if len(match): - link = match[0] - log(link) - else: - link = None + # twitter + if urlparse.urlparse(feedurl).netloc == 'twitter.com': + match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url') + if len(match): + link = match[0] + log(link) + else: + link = None - # facebook - if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com': - match = lxml.html.fromstring(item.content).xpath('//a/@href') - if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com': - link = match[0] - log(link) - else: - link = None + # facebook + if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com': + match = lxml.html.fromstring(item.content).xpath('//a/@href') + if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com': + link = match[0] + log(link) + else: + link = None - if link is None: - log('no used link') - return True + if link is None: + log('no used link') + return True - # check cache and previous errors - if link in cache: - content = cache.get(link) - match = re.search(r'^error-([a-z]{2,10})$', content) - if match: - if cache.age(link) > DELAY: - log('cached error: %s' % match.groups()[0]) - return True - else: - log('old error') - else: - log('cached') - item.pushContent(cache.get(link)) - return True + # check cache and previous errors + if link in cache: + content = cache.get(link) + match = re.search(r'^error-([a-z]{2,10})$', content) + if match: + if cache.age(link) > DELAY: + log('cached error: %s' % match.groups()[0]) + return True + else: + log('old error') + else: + log('cached') + item.pushContent(cache.get(link)) + return True - # super-fast mode - if fast: - log('skipped') - return False + # super-fast mode + if fast: + log('skipped') + return False - # download - try: - url = link.encode('utf-8') - con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT) - data = con.read() - except (IOError, httplib.HTTPException) as e: - log('http error: %s' % e.message) - cache.set(link, 'error-http') - return True + # download + try: + url = link.encode('utf-8') + con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT) + data = con.read() + except (IOError, httplib.HTTPException) as e: + log('http error: %s' % e.message) + cache.set(link, 'error-http') + return True - if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain': - log('non-text page') - cache.set(link, 'error-type') - return True + if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain': + 
log('non-text page') + cache.set(link, 'error-type') + return True - out = readability.Document(data, url=con.url).summary(True) + out = readability.Document(data, url=con.url).summary(True) - if countWord(out) > max(count_content, count_desc) > 0: - item.pushContent(out) - cache.set(link, out) - else: - log('not bigger enough') - cache.set(link, 'error-length') - return True + if countWord(out) > max(count_content, count_desc) > 0: + item.pushContent(out) + cache.set(link, out) + else: + log('not bigger enough') + cache.set(link, 'error-length') + return True - return True + return True def Init(url, cachePath, options): - # url clean up - log(url) + # url clean up + log(url) - if url is None: - raise MorssException('No url provided') + if url is None: + raise MorssException('No url provided') - if urlparse.urlparse(url).scheme not in PROTOCOL: - url = 'http://' + url - log(url) + if urlparse.urlparse(url).scheme not in PROTOCOL: + url = 'http://' + url + log(url) - url = url.replace(' ', '%20') + url = url.replace(' ', '%20') - # cache - cache = Cache(cachePath, url) - log(cache._hash) + # cache + cache = Cache(cachePath, url) + log(cache._hash) - return (url, cache) + return (url, cache) def Fetch(url, cache, options): - # do some useful facebook work - feedify.PreWorker(url, cache) + # do some useful facebook work + feedify.PreWorker(url, cache) - if 'redirect' in cache: - url = cache.get('redirect') - log('url redirect') - log(url) + if 'redirect' in cache: + url = cache.get('redirect') + log('url redirect') + log(url) - # fetch feed - if not options.theforce and 'xml' in cache and cache.age('xml') < DELAY and 'style' in cache: - log('xml cached') - xml = cache.get('xml') - style = cache.get('style') - else: - try: - opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html')) - con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2) - xml = con.read() - except (IOError, httplib.HTTPException): - raise MorssException('Error downloading feed') + # fetch feed + if not options.theforce and 'xml' in cache and cache.age('xml') < DELAY and 'style' in cache: + log('xml cached') + xml = cache.get('xml') + style = cache.get('style') + else: + try: + opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html')) + con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2) + xml = con.read() + except (IOError, httplib.HTTPException): + raise MorssException('Error downloading feed') - cache.set('xml', xml) - cache.set('etag', con.headers.getheader('etag')) - cache.set('lastmodified', con.headers.getheader('last-modified')) + cache.set('xml', xml) + cache.set('etag', con.headers.getheader('etag')) + cache.set('lastmodified', con.headers.getheader('last-modified')) - if url.startswith('https://itunes.apple.com/lookup?id='): - style = 'itunes' - elif xml.startswith(' lim_time >= 0 or i+1 > lim_item >= 0: - log('dropped') - item.remove() - return + def worker(i, item): + if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0: + log('dropped') + item.remove() + return - item = Fix(item, url) + item = Fix(item, url) - if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0: - if not options.proxy: - if Fill(item, cache, url, True) is False: - item.remove() - return - else: - if not options.proxy: - Fill(item, cache, url) + if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0: + if not options.proxy: + if Fill(item, cache, url, True) is False: + item.remove() + 
return + else: + if not options.proxy: + Fill(item, cache, url) - queue = Queue.Queue() + queue = Queue.Queue() - for i in range(THREADS): - t = threading.Thread(target=runner, args=(queue,)) - t.daemon = True - t.start() + for i in range(THREADS): + t = threading.Thread(target=runner, args=(queue,)) + t.daemon = True + t.start() - for i, item in enumerate(rss.items): - queue.put([i, item]) + for i, item in enumerate(rss.items): + queue.put([i, item]) - queue.join() - cache.save() + queue.join() + cache.save() - if options.ad: - new = rss.items.append() - new.title = "Are you hungry?" - new.desc = "Eat some Galler chocolate :)" - new.link = "http://www.galler.com/" - new.time = "5 Oct 2013 22:42" + if options.ad: + new = rss.items.append() + new.title = "Are you hungry?" + new.desc = "Eat some Galler chocolate :)" + new.link = "http://www.galler.com/" + new.time = "5 Oct 2013 22:42" - log(len(rss.items)) - log(time.time() - startTime) + log(len(rss.items)) + log(time.time() - startTime) - return rss + return rss def After(rss, options): - for i, item in enumerate(rss.items): + for i, item in enumerate(rss.items): - if item.desc and item.content: - if options.clip: - item.content = item.desc + "
<br/><br/><center>* * *</center><br/><br/>
" + item.content - del item.desc - if not options.keep: - del item.desc + if item.desc and item.content: + if options.clip: + item.content = item.desc + "
<br/><br/><center>* * *</center><br/><br/>
" + item.content + del item.desc + if not options.keep: + del item.desc - if options.nolink and item.content: - content = lxml.html.fromstring(item.content) - for link in content.xpath('//a'): - log(link.text_content()) - link.drop_tag() - item.content = lxml.etree.tostring(content) + if options.nolink and item.content: + content = lxml.html.fromstring(item.content) + for link in content.xpath('//a'): + log(link.text_content()) + link.drop_tag() + item.content = lxml.etree.tostring(content) - if options.noref: - item.link = '' + if options.noref: + item.link = '' - if options.md: - conv = HTML2Text(baseurl=item.link) - conv.unicode_snob = True + if options.md: + conv = HTML2Text(baseurl=item.link) + conv.unicode_snob = True - if item.desc: - item.desc = conv.handle(item.desc) - if item.content: - item.content = conv.handle(item.content) + if item.desc: + item.desc = conv.handle(item.desc) + if item.content: + item.content = conv.handle(item.content) - if options.json: - if options.indent: - return rss.tojson(indent=4) - else: - return rss.tojson() - elif options.csv: - return rss.tocsv() - elif options.reader: - return rss.tohtml() - else: - return rss.tostring(xml_declaration=True, encoding='UTF-8') + if options.json: + if options.indent: + return rss.tojson(indent=4) + else: + return rss.tojson() + elif options.csv: + return rss.tocsv() + elif options.reader: + return rss.tohtml() + else: + return rss.tostring(xml_declaration=True, encoding='UTF-8') def process(url, cache=None, options=None): - if options == None: - options = [] + if options == None: + options = [] - options = Options(options) - url, cache = Init(url, cache, options) - rss = Fetch(url, cache, options) - rss = Gather(rss, url, cache, options) + options = Options(options) + url, cache = Init(url, cache, options) + rss = Fetch(url, cache, options) + rss = Gather(rss, url, cache, options) - return After(rss, options) + return After(rss, options) def cgi_app(environ, start_response): - # get options - if 'REQUEST_URI' in environ: - url = environ['REQUEST_URI'][1:] - else: - url = environ['PATH_INFO'][1:] + # get options + if 'REQUEST_URI' in environ: + url = environ['REQUEST_URI'][1:] + else: + url = environ['PATH_INFO'][1:] - url = re.sub(r'^/?morss.py/', '', url) + url = re.sub(r'^/?morss.py/', '', url) - if url.startswith(':'): - options = url.split('/')[0].split(':')[1:] - url = url.split('/', 1)[1] - else: - options = [] + if url.startswith(':'): + options = url.split('/')[0].split(':')[1:] + url = url.split('/', 1)[1] + else: + options = [] - # init - options = Options(options) - headers = {} + # init + options = Options(options) + headers = {} - global DEBUG - DEBUG = options.debug + global DEBUG + DEBUG = options.debug - if 'HTTP_IF_NONE_MATCH' in environ: - if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY: - headers['status'] = '304 Not Modified' - start_response(headers['status'], headers.items()) - log(url) - log('etag good') - return [] + if 'HTTP_IF_NONE_MATCH' in environ: + if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY: + headers['status'] = '304 Not Modified' + start_response(headers['status'], headers.items()) + log(url) + log('etag good') + return [] - # headers - headers['status'] = '200 OK' - headers['etag'] = '"%s"' % int(time.time()) + # headers + headers['status'] = '200 OK' + headers['etag'] = '"%s"' % int(time.time()) - if options.html or options.reader: - 
headers['content-type'] = 'text/html' - elif options.debug or options.txt: - headers['content-type'] = 'text/plain' - elif options.json: - headers['content-type'] = 'application/json' - elif options.csv: - headers['content-type'] = 'text/csv' - headers['content-disposition'] = 'attachment; filename="feed.csv"' - else: - headers['content-type'] = 'text/xml' + if options.html or options.reader: + headers['content-type'] = 'text/html' + elif options.debug or options.txt: + headers['content-type'] = 'text/plain' + elif options.json: + headers['content-type'] = 'application/json' + elif options.csv: + headers['content-type'] = 'text/csv' + headers['content-disposition'] = 'attachment; filename="feed.csv"' + else: + headers['content-type'] = 'text/xml' - url, cache = Init(url, os.getcwd() + '/cache', options) + url, cache = Init(url, os.getcwd() + '/cache', options) - if options.facebook: - doFacebook(url, environ, headers, options, cache) - start_response(headers['status'], headers.items()) - return + if options.facebook: + doFacebook(url, environ, headers, options, cache) + start_response(headers['status'], headers.items()) + return - # get the work done - RSS = Fetch(url, cache, options) + # get the work done + RSS = Fetch(url, cache, options) - if headers['content-type'] == 'text/xml': - headers['content-type'] = RSS.mimetype + if headers['content-type'] == 'text/xml': + headers['content-type'] = RSS.mimetype - start_response(headers['status'], headers.items()) + start_response(headers['status'], headers.items()) - RSS = Gather(RSS, url, cache, options) + RSS = Gather(RSS, url, cache, options) - if not DEBUG and not options.silent: - return After(RSS, options) + if not DEBUG and not options.silent: + return After(RSS, options) - log('done') + log('done') def cgi_wrapper(environ, start_response): - # simple http server for html and css - files = { - '': 'text/html', - 'index.html': 'text/html'} + # simple http server for html and css + files = { + '': 'text/html', + 'index.html': 'text/html'} - if 'REQUEST_URI' in environ: - url = environ['REQUEST_URI'][1:] - else: - url = environ['PATH_INFO'][1:] + if 'REQUEST_URI' in environ: + url = environ['REQUEST_URI'][1:] + else: + url = environ['PATH_INFO'][1:] - if url in files: - headers = {} + if url in files: + headers = {} - if url == '': - url = 'index.html' + if url == '': + url = 'index.html' - if os.path.isfile(url): - headers['status'] = '200 OK' - headers['content-type'] = files[url] - start_response(headers['status'], headers.items()) - return open(url, 'rb').read() - else: - headers['status'] = '404 Not found' - start_response(headers['status'], headers.items()) - return '' + if os.path.isfile(url): + headers['status'] = '200 OK' + headers['content-type'] = files[url] + start_response(headers['status'], headers.items()) + return open(url, 'rb').read() + else: + headers['status'] = '404 Not found' + start_response(headers['status'], headers.items()) + return '' - # actual morss use - try: - return cgi_app(environ, start_response) or [] - except (KeyboardInterrupt, SystemExit): - raise - except Exception as e: - headers = {} - headers['status'] = '500 Oops' - headers['content-type'] = 'text/plain' - start_response(headers['status'], headers.items(), sys.exc_info()) - log('ERROR: %s' % e.message, force=True) - return 'An error happened' + # actual morss use + try: + return cgi_app(environ, start_response) or [] + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + headers = {} + headers['status'] = '500 Oops' + 
headers['content-type'] = 'text/plain' + start_response(headers['status'], headers.items(), sys.exc_info()) + log('ERROR: %s' % e.message, force=True) + return 'An error happened' def cli_app(): - options = Options(sys.argv[1:-1]) - url = sys.argv[-1] + options = Options(sys.argv[1:-1]) + url = sys.argv[-1] - global DEBUG - DEBUG = options.debug + global DEBUG + DEBUG = options.debug - url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options) - RSS = Fetch(url, cache, options) - RSS = Gather(RSS, url, cache, options) + url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options) + RSS = Fetch(url, cache, options) + RSS = Gather(RSS, url, cache, options) - if not DEBUG and not options.silent: - print After(RSS, options) + if not DEBUG and not options.silent: + print After(RSS, options) - log('done') + log('done') def doFacebook(url, environ, headers, options, cache): - log('fb stuff') + log('fb stuff') - query = urlparse.urlparse(url).query + query = urlparse.urlparse(url).query - if 'code' in query: - # get real token from code - code = urlparse.parse_qs(query)['code'][0] - eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI']) - token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0] + if 'code' in query: + # get real token from code + code = urlparse.parse_qs(query)['code'][0] + eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI']) + token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0] - # get long-lived access token - eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token) - values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip()) + # get long-lived access token + eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token) + values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip()) - ltoken = values['access_token'][0] - expires = int(time.time() + int(values['expires'][0])) + ltoken = values['access_token'][0] + expires = int(time.time() + int(values['expires'][0])) - headers['set-cookie'] = 'token={token}; Path=/'.format(token=ltoken) + headers['set-cookie'] = 'token={token}; Path=/'.format(token=ltoken) - # headers - headers['status'] = '303 See Other' - headers['location'] = 'http://{domain}/'.format(domain=environ['SERVER_NAME']) + # headers + headers['status'] = '303 See Other' + headers['location'] = 'http://{domain}/'.format(domain=environ['SERVER_NAME']) - log('fb done') - return + log('fb done') + return def main(): - if 'REQUEST_URI' in os.environ: - wsgiref.handlers.CGIHandler().run(cgi_wrapper) + if 'REQUEST_URI' in os.environ: + wsgiref.handlers.CGIHandler().run(cgi_wrapper) - elif len(sys.argv) <= 1: - httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper) - httpd.serve_forever() + elif len(sys.argv) <= 1: + httpd = 
wsgiref.simple_server.make_server('', 8080, cgi_wrapper) + httpd.serve_forever() - else: - try: - cli_app() - except (KeyboardInterrupt, SystemExit): - raise - except Exception as e: - print 'ERROR: %s' % e.message + else: + try: + cli_app() + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + print 'ERROR: %s' % e.message if __name__ == '__main__': - main() + main() diff --git a/setup.py b/setup.py index ef764d3..7715db9 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,13 @@ from setuptools import setup, find_packages package_name = 'morss' -setup( name=package_name, - description='Get full-text RSS feeds', - author='pictuga', - author_email='contact at author name dot com', - url='http://morss.it/', - license='GPL 3+', - package_dir={package_name: package_name}, - packages=find_packages(), - package_data={package_name: ['feedify.ini']}, - test_suite=package_name + '.tests') +setup( name=package_name, + description='Get full-text RSS feeds', + author='pictuga', + author_email='contact at author name dot com', + url='http://morss.it/', + license='GPL 3+', + package_dir={package_name: package_name}, + packages=find_packages(), + package_data={package_name: ['feedify.ini']}, + test_suite=package_name + '.tests')
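
The Accept header assembled in SimpleDownload.http_request above packs a fair amount of logic into a few lines. The standalone sketch below mirrors that ranking scheme for readers of this patch; build_accept() is a hypothetical helper name and the MIMETYPE entries are assumed for illustration (the real table is defined elsewhere in morss.py, outside these hunks).

# MIMETYPE values below are assumed for illustration only; morss defines the real
# table elsewhere in morss.py, outside the hunks of this patch.
MIMETYPE = {'xml': ['text/xml', 'application/xml', 'application/rss+xml'],
            'html': ['text/html', 'application/xhtml+xml']}

def build_accept(accept, strict=False):
    # Mirror of the header construction in SimpleDownload.http_request.
    if isinstance(accept, str):          # the patch tests basestring (Python 2)
        accept = (accept,)

    out = {}
    rank = 1.1
    for group in accept:
        rank = rank - 0.1                # each later group gets a lower priority

        if isinstance(group, str):
            if group in MIMETYPE:
                group = MIMETYPE[group]  # expand shorthands such as 'xml' or 'html'
            else:
                out[group] = rank        # literal MIME type, e.g. 'text/*'
                continue

        for mime in group:
            if mime not in out:
                out[mime] = rank         # first occurrence (highest rank) wins

    if not strict:
        out['*/*'] = rank - 0.1          # catch-all at the lowest priority

    return ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])

# build_accept(('xml', 'html')) ranks the xml MIME types highest, the html ones
# at q=0.9, and appends */*;q=0.8 as a catch-all (iteration order may vary).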
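
The feedsportal letter-substitution table used in Fix() is easier to follow with a worked example. The sketch below only restates that decoding step; decode_feedsportal() is a hypothetical name and the sample URL is invented to show the mapping, not taken from the patch.

import re

# Substitution table as used in Fix(): after splitting the slug on '0', the first
# letter of each chunk selects a URL fragment; unknown letters fall back to '='.
t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'H': ',', 'I': '_',
     'L': 'http://', 'S': 'www.', 'N': '.com', 'O': '.co.uk'}

def decode_feedsportal(link):
    # feedsportal links end in /<slug>/story01.htm, where the slug encodes the real URL
    match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', link)
    if not match:
        return link
    chunks = match.groups()[0].split('0')
    return ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in chunks[1:]])

# Hypothetical example:
#   decode_feedsportal('http://rss.feedsportal.com/c/1/f/2/0L0Sexample0N0Cnews0C123450Bhtml/story01.htm')
#   returns 'http://www.example.com/news/12345.html'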