diff --git a/morss/feedify.py b/morss/feedify.py
index 804fcd7..6271817 100644
--- a/morss/feedify.py
+++ b/morss/feedify.py
@@ -13,170 +13,170 @@ import urlparse
def toclass(query):
- pattern = r'\[class=([^\]]+)\]'
- repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
- return re.sub(pattern, repl, query)
+ pattern = r'\[class=([^\]]+)\]'
+ repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
+ return re.sub(pattern, repl, query)
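# Illustration (not part of the patch): toclass() rewrites the [class=...] shorthand
# used in feedify.ini rules into a robust XPath class test; the "post" class is made up.
#     toclass('//div[class=post]')
#     # -> '//div[@class and contains(concat(" ", normalize-space(@class), " "), " post ")]'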
def getRule(link):
- config = ConfigParser()
- config.read('feedify.ini')
+ config = ConfigParser()
+ config.read('feedify.ini')
- for section in config.sections():
- values = dict(config.items(section))
- values['path'] = values['path'].split('\n')[1:]
- for path in values['path']:
- if fnmatch(link, path):
- return values
- return False
+ for section in config.sections():
+ values = dict(config.items(section))
+ values['path'] = values['path'].split('\n')[1:]
+ for path in values['path']:
+ if fnmatch(link, path):
+ return values
+ return False
def supported(link):
- return getRule(link) is not False
+ return getRule(link) is not False
def formatString(string, getter, error=False):
- out = ""
- char = string[0]
+ out = ""
+ char = string[0]
- follow = string[1:]
+ follow = string[1:]
- if char == '"':
- match = follow.partition('"')
- out = match[0]
- if len(match) >= 2:
- next = match[2]
- else:
- next = None
- elif char == '{':
- match = follow.partition('}')
- try:
- test = formatString(match[0], getter, True)
- except ValueError, KeyError:
- pass
- else:
- out = test
+ if char == '"':
+ match = follow.partition('"')
+ out = match[0]
+ if len(match) >= 2:
+ next = match[2]
+ else:
+ next = None
+ elif char == '{':
+ match = follow.partition('}')
+ try:
+ test = formatString(match[0], getter, True)
+ except (ValueError, KeyError):
+ pass
+ else:
+ out = test
- next = match[2]
- elif char == ' ':
- next = follow
- elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
- match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
- rawValue = getter(match[0])
- if not isinstance(rawValue, basestring):
- if match[1] is not None:
- out = match[1].join(rawValue)
- else:
- out = ''.join(rawValue)
- if not out and error:
- raise ValueError
- next = match[2]
- else:
- raise ValueError('bogus string')
+ next = match[2]
+ elif char == ' ':
+ next = follow
+ elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
+ match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
+ rawValue = getter(match[0])
+ if not isinstance(rawValue, basestring):
+ if match[1] is not None:
+ out = match[1].join(rawValue)
+ else:
+ out = ''.join(rawValue)
+ if not out and error:
+ raise ValueError
+ next = match[2]
+ else:
+ raise ValueError('bogus string')
- if next is not None and len(next):
- return out + formatString(next, getter, error)
- else:
- return out
+ if next is not None and len(next):
+ return out + formatString(next, getter, error)
+ else:
+ return out
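# Illustration (not part of the patch): how formatString() resolves a rule string.
# Quoted parts are literals, {..} groups collapse when they yield nothing, and bare
# tokens are looked up through the getter; the values below are made up.
#     getter = lambda key: {'title': ['Hello'], 'author': []}[key]
#     formatString('title " - " {author}', getter)
#     # -> 'Hello - '   (the {author} group yields nothing, so it is dropped)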
def PreWorker(url, cache):
- if urlparse.urlparse(url).netloc == 'itunes.apple.com':
- match = re.search('/id([0-9]+)(\?.*)?$', url)
- if match:
- iid = match.groups()[0]
- redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
- cache.set('redirect', redirect)
+ if urlparse.urlparse(url).netloc == 'itunes.apple.com':
+ match = re.search('/id([0-9]+)(\?.*)?$', url)
+ if match:
+ iid = match.groups()[0]
+ redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
+ cache.set('redirect', redirect)
class Builder(object):
- def __init__(self, link, data=None, cache=False):
- self.link = link
- self.cache = cache
+ def __init__(self, link, data=None, cache=False):
+ self.link = link
+ self.cache = cache
- if data is None:
- data = urllib2.urlopen(link).read()
- self.data = data
+ if data is None:
+ data = urllib2.urlopen(link).read()
+ self.data = data
- self.rule = getRule(link)
+ self.rule = getRule(link)
- if self.rule['mode'] == 'xpath':
- if not isinstance(self.data, unicode):
- self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
- self.doc = lxml.html.fromstring(self.data)
- elif self.rule['mode'] == 'json':
- self.doc = json.loads(data)
+ if self.rule['mode'] == 'xpath':
+ if not isinstance(self.data, unicode):
+ self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
+ self.doc = lxml.html.fromstring(self.data)
+ elif self.rule['mode'] == 'json':
+ self.doc = json.loads(data)
- self.feed = feeds.FeedParserAtom()
+ self.feed = feeds.FeedParserAtom()
- def raw(self, html, expr):
- if self.rule['mode'] == 'xpath':
- return html.xpath(toclass(expr))
+ def raw(self, html, expr):
+ if self.rule['mode'] == 'xpath':
+ return html.xpath(toclass(expr))
- elif self.rule['mode'] == 'json':
- a = [html]
- b = []
- for x in expr.strip(".").split("."):
- match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
- for elem in a:
- if isinstance(elem, dict):
- kids = elem.get(match[0])
- if kids is None:
- pass
- elif isinstance(kids, list):
- [b.append(i) for i in kids]
- elif isinstance(kids, basestring):
- b.append(kids.replace('\n', '<br/>'))
- else:
- b.append(kids)
+ elif self.rule['mode'] == 'json':
+ a = [html]
+ b = []
+ for x in expr.strip(".").split("."):
+ match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
+ for elem in a:
+ if isinstance(elem, dict):
+ kids = elem.get(match[0])
+ if kids is None:
+ pass
+ elif isinstance(kids, list):
+ [b.append(i) for i in kids]
+ elif isinstance(kids, basestring):
+ b.append(kids.replace('\n', '<br/>'))
+ else:
+ b.append(kids)
- if match[1] is None:
- a = b
- else:
- if len(b)-1 >= int(match[1]):
- a = [b[int(match[1])]]
- else:
- a = []
- b = []
- return a
+ if match[1] is None:
+ a = b
+ else:
+ if len(b)-1 >= int(match[1]):
+ a = [b[int(match[1])]]
+ else:
+ a = []
+ b = []
+ return a
- def strings(self, html, expr):
- if self.rule['mode'] == 'xpath':
- out = []
- for match in self.raw(html, expr):
- if isinstance(match, basestring):
- out.append(match)
- elif isinstance(match, lxml.html.HtmlElement):
- out.append(lxml.html.tostring(match))
- return out
+ def strings(self, html, expr):
+ if self.rule['mode'] == 'xpath':
+ out = []
+ for match in self.raw(html, expr):
+ if isinstance(match, basestring):
+ out.append(match)
+ elif isinstance(match, lxml.html.HtmlElement):
+ out.append(lxml.html.tostring(match))
+ return out
- elif self.rule['mode'] == 'json':
- return self.raw(html, expr)
+ elif self.rule['mode'] == 'json':
+ return self.raw(html, expr)
- def string(self, html, expr):
- getter = lambda x: self.strings(html, x)
- return formatString(self.rule[expr], getter)
+ def string(self, html, expr):
+ getter = lambda x: self.strings(html, x)
+ return formatString(self.rule[expr], getter)
- def build(self):
- if 'title' in self.rule:
- self.feed.title = self.string(self.doc, 'title')
+ def build(self):
+ if 'title' in self.rule:
+ self.feed.title = self.string(self.doc, 'title')
- if 'items' in self.rule:
- matches = self.raw(self.doc, self.rule['items'])
- if matches and len(matches):
- for item in matches:
- feedItem = {}
+ if 'items' in self.rule:
+ matches = self.raw(self.doc, self.rule['items'])
+ if matches and len(matches):
+ for item in matches:
+ feedItem = {}
- if 'item_title' in self.rule:
- feedItem['title'] = self.string(item, 'item_title')
- if 'item_link' in self.rule:
- url = self.string(item, 'item_link')
- url = urlparse.urljoin(self.link, url)
- feedItem['link'] = url
- if 'item_desc' in self.rule:
- feedItem['desc'] = self.string(item, 'item_desc')
- if 'item_content' in self.rule:
- feedItem['content'] = self.string(item, 'item_content')
- if 'item_time' in self.rule:
- feedItem['updated'] = self.string(item, 'item_time')
- if 'item_id' in self.rule:
- feedItem['id'] = self.string(item, 'item_id')
- feedItem['isPermaLink'] = False
+ if 'item_title' in self.rule:
+ feedItem['title'] = self.string(item, 'item_title')
+ if 'item_link' in self.rule:
+ url = self.string(item, 'item_link')
+ url = urlparse.urljoin(self.link, url)
+ feedItem['link'] = url
+ if 'item_desc' in self.rule:
+ feedItem['desc'] = self.string(item, 'item_desc')
+ if 'item_content' in self.rule:
+ feedItem['content'] = self.string(item, 'item_content')
+ if 'item_time' in self.rule:
+ feedItem['updated'] = self.string(item, 'item_time')
+ if 'item_id' in self.rule:
+ feedItem['id'] = self.string(item, 'item_id')
+ feedItem['isPermaLink'] = False
- self.feed.items.append(feedItem)
+ self.feed.items.append(feedItem)
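# Illustration (not part of the patch): a minimal sketch of how a feedify.ini rule and
# the Builder are meant to work together; the rule, URL and field names are made up.
#
#     [example]                      ; hypothetical feedify.ini section
#     path =
#         http://example.com/news*
#     mode = xpath
#     items = //div[class=post]
#     item_title = .//h2
#     item_link = .//a/@href
#
#     from morss import feedify
#     if feedify.supported('http://example.com/news'):
#         builder = feedify.Builder('http://example.com/news')
#         builder.build()
#         print builder.feed.tostring()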
diff --git a/morss/feeds.py b/morss/feeds.py
index 7985637..f18232a 100644
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -11,740 +11,740 @@ import json
import csv
try:
- from wheezy.template.engine import Engine
- from wheezy.template.loader import DictLoader
- from wheezy.template.ext.core import CoreExtension
+ from wheezy.template.engine import Engine
+ from wheezy.template.loader import DictLoader
+ from wheezy.template.ext.core import CoreExtension
except ImportError:
- Engine = DictLoader = CoreExtension = None
+ Engine = DictLoader = CoreExtension = None
json.encoder.c_make_encoder = None
try:
- from collections import OrderedDict
+ from collections import OrderedDict
except ImportError:
- from ordereddict import OrderedDict
+ from ordereddict import OrderedDict
Element = etree.Element
-NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
- 'atom03': 'http://purl.org/atom/ns#',
- 'media': 'http://search.yahoo.com/mrss/',
- 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
- 'slash': 'http://purl.org/rss/1.0/modules/slash/',
- 'dc': 'http://purl.org/dc/elements/1.1/',
- 'content': 'http://purl.org/rss/1.0/modules/content/',
- 'rssfake': 'http://purl.org/rss/1.0/'}
+NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
+ 'atom03': 'http://purl.org/atom/ns#',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
+ 'slash': 'http://purl.org/rss/1.0/modules/slash/',
+ 'dc': 'http://purl.org/dc/elements/1.1/',
+ 'content': 'http://purl.org/rss/1.0/modules/content/',
+ 'rssfake': 'http://purl.org/rss/1.0/'}
def load(url):
- import urllib2
- d = urllib2.urlopen(url).read()
- return parse(d)
+ import urllib2
+ d = urllib2.urlopen(url).read()
+ return parse(d)
def tagNS(tag, nsmap=NSMAP):
- match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
- if match:
- match = match.groups()
- for (key, url) in nsmap.iteritems():
- if url == match[0]:
- return "%s:%s" % (key, match[1].lower())
- else:
- match = re.search(r'^([^:]+):([^:]+)$', tag)
- if match:
- match = match.groups()
- if match[0] in nsmap:
- return "{%s}%s" % (nsmap[match[0]], match[1].lower())
- return tag
+ match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
+ if match:
+ match = match.groups()
+ for (key, url) in nsmap.iteritems():
+ if url == match[0]:
+ return "%s:%s" % (key, match[1].lower())
+ else:
+ match = re.search(r'^([^:]+):([^:]+)$', tag)
+ if match:
+ match = match.groups()
+ if match[0] in nsmap:
+ return "{%s}%s" % (nsmap[match[0]], match[1].lower())
+ return tag
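# Illustration (not part of the patch): tagNS() converts between lxml's Clark notation
# and the prefixed names declared in NSMAP, in both directions.
#     tagNS('{http://www.w3.org/2005/Atom}title')   # -> 'atom:title'
#     tagNS('atom:title')                           # -> '{http://www.w3.org/2005/Atom}title'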
def innerHTML(xml):
- return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
+ return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
def cleanNode(xml):
- [xml.remove(child) for child in xml.iterchildren()]
+ [xml.remove(child) for child in xml.iterchildren()]
class FeedException(Exception):
- pass
+ pass
def parse(data):
- # encoding
- match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
- if match:
- enc = match.groups()[0].lower()
- if not isinstance(data, unicode):
- data = data.decode(enc, 'ignore')
- data = data.encode(enc)
+ # encoding
+ match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
+ if match:
+ enc = match.groups()[0].lower()
+ if not isinstance(data, unicode):
+ data = data.decode(enc, 'ignore')
+ data = data.encode(enc)
- # parse
- parser = etree.XMLParser(recover=True)
- doc = etree.fromstring(data, parser)
+ # parse
+ parser = etree.XMLParser(recover=True)
+ doc = etree.fromstring(data, parser)
- # rss
- match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
- if len(match):
- mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
- 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
- match = match[0]
- tag = tagNS(match.tag)
- if tag in mtable:
- return mtable[tag](doc, tag)
+ # rss
+ match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
+ if len(match):
+ mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
+ 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
+ match = match[0]
+ tag = tagNS(match.tag)
+ if tag in mtable:
+ return mtable[tag](doc, tag)
- raise FeedException('unknown feed type')
+ raise FeedException('unknown feed type')
class FeedBase(object):
- """
- Base for xml-related classes, which provides simple wrappers around xpath
- selection and item creation
- """
+ """
+ Base for xml-related classes, which provides simple wrappers around xpath
+ selection and item creation
+ """
- def __getitem__(self, item):
- return getattr(self, item)
+ def __getitem__(self, item):
+ return getattr(self, item)
- def __setitem__(self, item, value):
- setattr(self, item, value)
+ def __setitem__(self, item, value):
+ setattr(self, item, value)
- def __delitem__(self, item):
- delattr(self, item)
+ def __delitem__(self, item):
+ delattr(self, item)
- def __iter__(self):
- for element in self.dic:
- value = self[element]
+ def __iter__(self):
+ for element in self.dic:
+ value = self[element]
- if isinstance(value, FeedList):
- value = [OrderedDict(x) for x in value]
- elif isinstance(value, datetime):
- value = value.isoformat()
+ if isinstance(value, FeedList):
+ value = [OrderedDict(x) for x in value]
+ elif isinstance(value, datetime):
+ value = value.isoformat()
- yield element, value
+ yield element, value
- def xpath(self, path):
- """ Test xpath rule on xml tree """
- return self.root.xpath(path, namespaces=NSMAP)
+ def xpath(self, path):
+ """ Test xpath rule on xml tree """
+ return self.root.xpath(path, namespaces=NSMAP)
- def xget(self, path):
- """ Returns the 1st xpath match """
- match = self.xpath(path)
- if len(match):
- return match[0]
- else:
- return None
+ def xget(self, path):
+ """ Returns the 1st xpath match """
+ match = self.xpath(path)
+ if len(match):
+ return match[0]
+ else:
+ return None
- def xval(self, path):
- """ Returns the .text of the 1st match """
- match = self.xget(path)
- if match is not None:
- return match.text or ""
- else:
- return ""
+ def xval(self, path):
+ """ Returns the .text of the 1st match """
+ match = self.xget(path)
+ if match is not None:
+ return match.text or ""
+ else:
+ return ""
- def xgetCreate(self, table):
- """ Returns an element, and creates it when not present """
- value = table[self.tag]
- if not isinstance(value, tuple):
- value = (value, value)
- new, xpath = value
- match = self.xget(xpath)
- if match is not None:
- return match
- else:
- element = etree.Element(tagNS(new))
- self.root.append(element)
- return element
+ def xgetCreate(self, table):
+ """ Returns an element, and creates it when not present """
+ value = table[self.tag]
+ if not isinstance(value, tuple):
+ value = (value, value)
+ new, xpath = value
+ match = self.xget(xpath)
+ if match is not None:
+ return match
+ else:
+ element = etree.Element(tagNS(new))
+ self.root.append(element)
+ return element
- def xdel(self, path):
- match = self.xget(path)
- if match is not None:
- return match.getparent().remove(match)
+ def xdel(self, path):
+ match = self.xget(path)
+ if match is not None:
+ return match.getparent().remove(match)
- def tostring(self, **k):
- """ Returns string using lxml. Arguments passed to tostring """
- return etree.tostring(self.xml, pretty_print=True, **k)
+ def tostring(self, **k):
+ """ Returns string using lxml. Arguments passed to tostring """
+ return etree.tostring(self.xml, pretty_print=True, **k)
class FeedDescriptor(object):
- """
- Descriptor which gives off elements based on "self.getName" and
- "self.setName" as getter/setters. Looks far better, and avoids duplicates
- """
- def __init__(self, name):
- self.name = name
- self.nname = name[0].upper() + name[1:]
+ """
+ Descriptor that dispatches to "self.getName" and "self.setName" as
+ getter/setters. Reads far better, and avoids duplicates
+ """
+ def __init__(self, name):
+ self.name = name
+ self.nname = name[0].upper() + name[1:]
- def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
- return getter()
+ def __get__(self, instance, owner):
+ getter = getattr(instance, 'get%s' % self.nname)
+ return getter()
- def __set__(self, instance, value):
- setter = getattr(instance, 'set%s' % self.nname)
- return setter(value)
+ def __set__(self, instance, value):
+ setter = getattr(instance, 'set%s' % self.nname)
+ return setter(value)
- def __delete__(self, instance):
- deleter = getattr(instance, 'del%s' % self.nname)
- return deleter()
+ def __delete__(self, instance):
+ deleter = getattr(instance, 'del%s' % self.nname)
+ return deleter()
class FeedTime(FeedDescriptor):
- def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
- raw = getter()
- try:
- time = parseTime(raw)
- return time
- except ValueError:
- return None
+ def __get__(self, instance, owner):
+ getter = getattr(instance, 'get%s' % self.nname)
+ raw = getter()
+ try:
+ time = parseTime(raw)
+ return time
+ except ValueError:
+ return None
- def __set__(self, instance, value):
- try:
- time = parseTime(value)
- raw = time.strftime(instance.timeFormat)
- setter = getattr(instance, 'set%s' % self.nname)
- return setter(raw)
- except ValueError:
- pass
+ def __set__(self, instance, value):
+ try:
+ time = parseTime(value)
+ raw = time.strftime(instance.timeFormat)
+ setter = getattr(instance, 'set%s' % self.nname)
+ return setter(raw)
+ except ValueError:
+ pass
class FeedBool(FeedDescriptor):
- def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
- raw = getter()
- return (raw or '').lower() != 'false'
+ def __get__(self, instance, owner):
+ getter = getattr(instance, 'get%s' % self.nname)
+ raw = getter()
+ return (raw or '').lower() != 'false'
- def __set__(self, instance, value):
- raw = 'true' if value else 'false'
- setter = getattr(instance, 'set%s' % self.nname)
- return setter(raw)
+ def __set__(self, instance, value):
+ raw = 'true' if value else 'false'
+ setter = getattr(instance, 'set%s' % self.nname)
+ return setter(raw)
def parseTime(value):
- if isinstance(value, basestring):
- if re.match(r'^[0-9]+$', value):
- return datetime.fromtimestamp(int(value), tz.tzutc())
- else:
- return dateutil.parser.parse(value, tzinfos=tz.tzutc)
- elif isinstance(value, int):
- return datetime.fromtimestamp(value, tz.tzutc())
- elif isinstance(value, datetime):
- return value
- else:
- return False
+ if isinstance(value, basestring):
+ if re.match(r'^[0-9]+$', value):
+ return datetime.fromtimestamp(int(value), tz.tzutc())
+ else:
+ return dateutil.parser.parse(value, tzinfos=tz.tzutc)
+ elif isinstance(value, int):
+ return datetime.fromtimestamp(value, tz.tzutc())
+ elif isinstance(value, datetime):
+ return value
+ else:
+ return False
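# Illustration (not part of the patch): the input shapes parseTime() accepts.
#     parseTime('1388530800')                     # numeric string -> UTC datetime
#     parseTime(1388530800)                       # int timestamp  -> UTC datetime
#     parseTime('Wed, 01 Jan 2014 00:00:00 GMT')  # free-form date -> parsed by dateutil
#     parseTime(None)                             # anything else  -> False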
class FeedList(object):
- """
- Class to map a list of xml elements against a list of matching objects,
- while avoiding to recreate the same matching object over and over again. So
- as to avoid extra confusion, list's elements are called "children" here, so
- as not to use "items", which is already in use in RSS/Atom related code.
+ """
+ Class to map a list of xml elements against a list of matching objects,
+ while avoiding recreating the same matching object over and over again. To
+ avoid confusion, the list's elements are called "children" here rather than
+ "items", which is already in use in RSS/Atom related code.
- Comes with its very own descriptor.
- """
- def __init__(self, parent, getter, tag, childClass):
- self.parent = parent
- self.getter = getter
- self.childClass = childClass
- self.tag = tag
- self._children = {} # id(xml) => FeedItem
+ Comes with its very own descriptor.
+ """
+ def __init__(self, parent, getter, tag, childClass):
+ self.parent = parent
+ self.getter = getter
+ self.childClass = childClass
+ self.tag = tag
+ self._children = {} # id(xml) => FeedItem
- def getChildren(self):
- children = self.getter()
- out = []
- for child in children:
- if id(child) in self._children:
- out.append(self._children[id(child)])
- else:
- new = self.childClass(child, self.tag)
- self._children[id(child)] = new
- out.append(new)
- return out
+ def getChildren(self):
+ children = self.getter()
+ out = []
+ for child in children:
+ if id(child) in self._children:
+ out.append(self._children[id(child)])
+ else:
+ new = self.childClass(child, self.tag)
+ self._children[id(child)] = new
+ out.append(new)
+ return out
- def append(self, cousin=None):
- new = self.childClass(tag=self.tag)
- self.parent.root.append(new.xml)
- self._children[id(new.xml)] = new
+ def append(self, cousin=None):
+ new = self.childClass(tag=self.tag)
+ self.parent.root.append(new.xml)
+ self._children[id(new.xml)] = new
- if cousin is None:
- return new
+ if cousin is None:
+ return new
- for key in self.childClass.__dict__:
- if key[:3] == 'set':
- attr = key[3:].lower()
- if hasattr(cousin, attr):
- setattr(new, attr, getattr(cousin, attr))
- elif attr in cousin:
- setattr(new, attr, cousin[attr])
+ for key in self.childClass.__dict__:
+ if key[:3] == 'set':
+ attr = key[3:].lower()
+ if hasattr(cousin, attr):
+ setattr(new, attr, getattr(cousin, attr))
+ elif attr in cousin:
+ setattr(new, attr, cousin[attr])
- return new
+ return new
- def __getitem__(self, key):
- return self.getChildren()[key]
+ def __getitem__(self, key):
+ return self.getChildren()[key]
- def __delitem__(self, key):
- child = self.getter()[key]
- if id(child) in self._children:
- self._children[id(child)].remove()
- del self._children[id(child)]
- else:
- child.getparent().remove(child)
+ def __delitem__(self, key):
+ child = self.getter()[key]
+ if id(child) in self._children:
+ self._children[id(child)].remove()
+ del self._children[id(child)]
+ else:
+ child.getparent().remove(child)
- def __len__(self):
- return len(self.getter())
+ def __len__(self):
+ return len(self.getter())
class FeedListDescriptor(object):
- """
- Descriptor for FeedList
- """
- def __init__(self, name):
- self.name = name
- self.items = {} # id(instance) => FeedList
+ """
+ Descriptor for FeedList
+ """
+ def __init__(self, name):
+ self.name = name
+ self.items = {} # id(instance) => FeedList
- def __get__(self, instance, owner=None):
- key = id(instance)
- if key in self.items:
- return self.items[key]
- else:
- getter = getattr(instance, 'get%s' % self.name.title())
- className = globals()[getattr(instance, '%sClass' % self.name)]
- self.items[key] = FeedList(instance, getter, instance.tag, className)
- return self.items[key]
+ def __get__(self, instance, owner=None):
+ key = id(instance)
+ if key in self.items:
+ return self.items[key]
+ else:
+ getter = getattr(instance, 'get%s' % self.name.title())
+ className = globals()[getattr(instance, '%sClass' % self.name)]
+ self.items[key] = FeedList(instance, getter, instance.tag, className)
+ return self.items[key]
- def __set__(self, instance, value):
- feedlist = self.__get__(instance)
- [x.remove() for x in [x for x in f.items]]
- [feedlist.append(x) for x in value]
+ def __set__(self, instance, value):
+ feedlist = self.__get__(instance)
+ [x.remove() for x in [x for x in feedlist]]
+ [feedlist.append(x) for x in value]
class FeedParser(FeedBase):
- itemsClass = 'FeedItem'
- mimetype = 'application/xml'
- base = ''
- dic = ('title', 'desc', 'items')
+ itemsClass = 'FeedItem'
+ mimetype = 'application/xml'
+ base = ''
+ dic = ('title', 'desc', 'items')
- def __init__(self, xml=None, tag='atom:feed'):
- if xml is None:
- xml = etree.fromstring(self.base[tag])
- self.xml = xml
- self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
- self.tag = tag
+ def __init__(self, xml=None, tag='atom:feed'):
+ if xml is None:
+ xml = etree.fromstring(self.base[tag])
+ self.xml = xml
+ self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
+ self.tag = tag
- def getTitle(self):
- return ""
+ def getTitle(self):
+ return ""
- def setTitle(self, value):
- pass
+ def setTitle(self, value):
+ pass
- def delTitle(self):
- self.title = ""
+ def delTitle(self):
+ self.title = ""
- def getDesc(self):
- pass
+ def getDesc(self):
+ pass
- def setDesc(self, value):
- pass
+ def setDesc(self, value):
+ pass
- def delDesc(self):
- self.desc = ""
+ def delDesc(self):
+ self.desc = ""
- def getItems(self):
- return []
+ def getItems(self):
+ return []
- title = FeedDescriptor('title')
- description = desc = FeedDescriptor('desc')
- items = FeedListDescriptor('items')
+ title = FeedDescriptor('title')
+ description = desc = FeedDescriptor('desc')
+ items = FeedListDescriptor('items')
- def tostring(self, **k):
- return etree.tostring(self.xml.getroottree(), pretty_print=True, **k)
+ def tostring(self, **k):
+ return etree.tostring(self.xml.getroottree(), pretty_print=True, **k)
- def tojson(self, indent=None):
- return json.dumps(OrderedDict(self), indent=indent)
+ def tojson(self, indent=None):
+ return json.dumps(OrderedDict(self), indent=indent)
- def tocsv(self):
- out = StringIO()
- c = csv.writer(out, dialect=csv.excel)
- for item in self.items:
- row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
- c.writerow(row)
- out.seek(0)
- return out.read()
+ def tocsv(self):
+ out = StringIO()
+ c = csv.writer(out, dialect=csv.excel)
+ for item in self.items:
+ row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
+ c.writerow(row)
+ out.seek(0)
+ return out.read()
- def tohtml(self):
- if DictLoader is None:
- log('dep wheezy.template needed')
+ def tohtml(self):
+ if DictLoader is None:
+ log('dep wheezy.template needed')
- loader = DictLoader({'reader': open('reader.html.template').read()})
- engine = Engine(loader=loader, extensions=[CoreExtension()])
- template = engine.get_template('reader')
- return template.render({'feed':self}).encode('utf-8')
+ loader = DictLoader({'reader': open('reader.html.template').read()})
+ engine = Engine(loader=loader, extensions=[CoreExtension()])
+ template = engine.get_template('reader')
+ return template.render({'feed':self}).encode('utf-8')
class FeedParserRSS(FeedParser):
- """
- RSS Parser
- """
- itemsClass = 'FeedItemRSS'
- mimetype = 'application/rss+xml'
- base = { 'rdf:rdf': '',
- 'channel': ''}
+ """
+ RSS Parser
+ """
+ itemsClass = 'FeedItemRSS'
+ mimetype = 'application/rss+xml'
+ base = { 'rdf:rdf': '',
+ 'channel': ''}
- def getTitle(self):
- return self.xval('rssfake:title|title')
+ def getTitle(self):
+ return self.xval('rssfake:title|title')
- def setTitle(self, value):
- if not value:
- return self.xdel('rssfake:title|title')
+ def setTitle(self, value):
+ if not value:
+ return self.xdel('rssfake:title|title')
- table = { 'rdf:rdf': 'rssfake:title',
- 'channel': 'title'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:title',
+ 'channel': 'title'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getDesc(self):
- return self.xval('rssfake:description|description')
+ def getDesc(self):
+ return self.xval('rssfake:description|description')
- def setDesc(self, value):
- if not value:
- return self.xdel('rssfake:description|description')
+ def setDesc(self, value):
+ if not value:
+ return self.xdel('rssfake:description|description')
- table = { 'rdf:rdf': 'rssfake:description',
- 'channel': 'description'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:description',
+ 'channel': 'description'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getItems(self):
- return self.xpath('rssfake:item|item')
+ def getItems(self):
+ return self.xpath('rssfake:item|item')
class FeedParserAtom(FeedParser):
- """
- Atom Parser
- """
- itemsClass = 'FeedItemAtom'
- mimetype = 'application/atom+xml'
- base = { 'atom:feed': '',
- 'atom03:feed': ''}
+ """
+ Atom Parser
+ """
+ itemsClass = 'FeedItemAtom'
+ mimetype = 'application/atom+xml'
+ base = { 'atom:feed': '',
+ 'atom03:feed': ''}
- def getTitle(self):
- return self.xval('atom:title|atom03:title')
+ def getTitle(self):
+ return self.xval('atom:title|atom03:title')
- def setTitle(self, value):
- if not value:
- return self.xval('atom:title|atom03:title')
+ def setTitle(self, value):
+ if not value:
+ return self.xval('atom:title|atom03:title')
- table = { 'atom:feed': 'atom:title',
- 'atom03:feed': 'atom03:title'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:title',
+ 'atom03:feed': 'atom03:title'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getDesc(self):
- return self.xval('atom:subtitle|atom03:subtitle')
+ def getDesc(self):
+ return self.xval('atom:subtitle|atom03:subtitle')
- def setDesc(self, value):
- if not value:
- return self.xdel('atom:subtitle|atom03:subtitle')
+ def setDesc(self, value):
+ if not value:
+ return self.xdel('atom:subtitle|atom03:subtitle')
- table = { 'atom:feed': 'atom:subtitle',
- 'atom03:feed': 'atom03:subtitle'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:subtitle',
+ 'atom03:feed': 'atom03:subtitle'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getItems(self):
- return self.xpath('atom:entry|atom03:entry')
+ def getItems(self):
+ return self.xpath('atom:entry|atom03:entry')
class FeedItem(FeedBase):
- timeFormat = ''
- dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
+ timeFormat = ''
+ dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
- def __init__(self, xml=None, tag='atom:feed'):
- if xml is None:
- xml = Element(tagNS(self.base[tag]))
+ def __init__(self, xml=None, tag='atom:feed'):
+ if xml is None:
+ xml = Element(tagNS(self.base[tag]))
- self.root = self.xml = xml
- self.tag = tag
+ self.root = self.xml = xml
+ self.tag = tag
- def getTitle(self):
- return ""
+ def getTitle(self):
+ return ""
- def setTitle(self):
- pass
+ def setTitle(self):
+ pass
- def delTitle(self):
- self.title = ""
+ def delTitle(self):
+ self.title = ""
- def getLink(self):
- return ""
+ def getLink(self):
+ return ""
- def setLink(self, value):
- pass
+ def setLink(self, value):
+ pass
- def delLink(self):
- self.link = ""
+ def delLink(self):
+ self.link = ""
- def getIsPermaLink(self):
- return ""
+ def getIsPermaLink(self):
+ return ""
- def setIsPermaLink(self, value):
- pass
+ def setIsPermaLink(self, value):
+ pass
- def getDesc(self):
- return ""
+ def getDesc(self):
+ return ""
- def setDesc(self, value):
- pass
+ def setDesc(self, value):
+ pass
- def delDesc(self):
- self.desc = ""
+ def delDesc(self):
+ self.desc = ""
- def getContent(self):
- return ""
+ def getContent(self):
+ return ""
- def setContent(self, value):
- pass
+ def setContent(self, value):
+ pass
- def delContent(self):
- self.content = ""
+ def delContent(self):
+ self.content = ""
- def getId(self):
- return ""
+ def getId(self):
+ return ""
- def setId(self, value):
- pass
+ def setId(self, value):
+ pass
- def delId(self):
- self.id = ""
+ def delId(self):
+ self.id = ""
- def getTime(self):
- return None
+ def getTime(self):
+ return None
- def setTime(self, value):
- pass
+ def setTime(self, value):
+ pass
- def delTime(self):
- self.time = None
+ def delTime(self):
+ self.time = None
- def getUpdated(self):
- return None
+ def getUpdated(self):
+ return None
- def setUpdated(self, value):
- pass
+ def setUpdated(self, value):
+ pass
- def delUpdated(self):
- self.updated = None
+ def delUpdated(self):
+ self.updated = None
- title = FeedDescriptor('title')
- link = FeedDescriptor('link')
- description = desc = FeedDescriptor('desc')
- content = FeedDescriptor('content')
- id = FeedDescriptor('id')
- isPermaLink = FeedBool('isPermaLink')
- time = FeedTime('time')
- updated = FeedTime('updated')
+ title = FeedDescriptor('title')
+ link = FeedDescriptor('link')
+ description = desc = FeedDescriptor('desc')
+ content = FeedDescriptor('content')
+ id = FeedDescriptor('id')
+ isPermaLink = FeedBool('isPermaLink')
+ time = FeedTime('time')
+ updated = FeedTime('updated')
- def pushContent(self, value):
- if not self.desc and self.content:
- self.desc = self.content
+ def pushContent(self, value):
+ if not self.desc and self.content:
+ self.desc = self.content
- self.content = value
+ self.content = value
- def remove(self):
- self.xml.getparent().remove(self.xml)
+ def remove(self):
+ self.xml.getparent().remove(self.xml)
class FeedItemRSS(FeedItem):
- timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
- base = { 'rdf:rdf': 'rssfake:item',
- 'channel': 'item'}
+ timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
+ base = { 'rdf:rdf': 'rssfake:item',
+ 'channel': 'item'}
- def getTitle(self):
- return self.xval('rssfake:title|title')
+ def getTitle(self):
+ return self.xval('rssfake:title|title')
- def setTitle(self, value):
- if not value:
- return self.xdel('rssfake:title|title')
+ def setTitle(self, value):
+ if not value:
+ return self.xdel('rssfake:title|title')
- table = { 'rdf:rdf': 'rssfake:title',
- 'channel': 'title'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:title',
+ 'channel': 'title'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getLink(self):
- return self.xval('rssfake:link|link')
+ def getLink(self):
+ return self.xval('rssfake:link|link')
- def setLink(self, value):
- if self.isPermaLink and self.id == self.link != value:
- self.isPermaLink = False
+ def setLink(self, value):
+ if self.isPermaLink and self.id == self.link != value:
+ self.isPermaLink = False
- table = { 'rdf:rdf': 'rssfake:link',
- 'channel': 'link'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:link',
+ 'channel': 'link'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getDesc(self):
- return self.xval('rssfake:description|description')
+ def getDesc(self):
+ return self.xval('rssfake:description|description')
- def setDesc(self, value):
- if not value:
- return self.xdel('rssfake:description|description')
+ def setDesc(self, value):
+ if not value:
+ return self.xdel('rssfake:description|description')
- table = { 'rdf:rdf': 'rssfake:description',
- 'channel': 'description'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:description',
+ 'channel': 'description'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getContent(self):
- return self.xval('content:encoded')
+ def getContent(self):
+ return self.xval('content:encoded')
- def setContent(self, value):
- if not value:
- return self.xdel('content:encoded')
+ def setContent(self, value):
+ if not value:
+ return self.xdel('content:encoded')
- table = { 'rdf:rdf': 'content:encoded',
- 'channel': 'content:encoded'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'content:encoded',
+ 'channel': 'content:encoded'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getId(self):
- return self.xval('rssfake:guid|guid')
+ def getId(self):
+ return self.xval('rssfake:guid|guid')
- def setId(self, value):
- if not value:
- return self.xdel('rssfake:guid|guid')
+ def setId(self, value):
+ if not value:
+ return self.xdel('rssfake:guid|guid')
- table = { 'rdf:rdf': 'rssfake:guid',
- 'channel': 'guid'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:guid',
+ 'channel': 'guid'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getIsPermaLink(self):
- return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
+ def getIsPermaLink(self):
+ return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
- def setIsPermaLink(self, value):
- table = { 'rdf:rdf': 'rssfake:guid',
- 'channel': 'guid'}
- element = self.xgetCreate(table)
- element.attrib['isPermaLink'] = value
+ def setIsPermaLink(self, value):
+ table = { 'rdf:rdf': 'rssfake:guid',
+ 'channel': 'guid'}
+ element = self.xgetCreate(table)
+ element.attrib['isPermaLink'] = value
- def getTime(self):
- return self.xval('rssfake:pubDate|pubDate')
+ def getTime(self):
+ return self.xval('rssfake:pubDate|pubDate')
- def setTime(self, value):
- if not value:
- return self.xdel('rssfake:pubDate|pubDate')
+ def setTime(self, value):
+ if not value:
+ return self.xdel('rssfake:pubDate|pubDate')
- table = { 'rdf:rdf': 'rssfake:pubDate',
- 'channel': 'pubDate'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'rdf:rdf': 'rssfake:pubDate',
+ 'channel': 'pubDate'}
+ element = self.xgetCreate(table)
+ element.text = value
class FeedItemAtom(FeedItem):
- timeFormat = '%Y-%m-%dT%H:%M:%SZ'
- base = { 'atom:feed': 'atom:entry',
- 'atom03:feed': 'atom03:entry'}
+ timeFormat = '%Y-%m-%dT%H:%M:%SZ'
+ base = { 'atom:feed': 'atom:entry',
+ 'atom03:feed': 'atom03:entry'}
- def getTitle(self):
- return self.xval('atom:title|atom03:title')
+ def getTitle(self):
+ return self.xval('atom:title|atom03:title')
- def setTitle(self, value):
- if not value:
- return self.xdel('atom:title|atom03:title')
+ def setTitle(self, value):
+ if not value:
+ return self.xdel('atom:title|atom03:title')
- table = { 'atom:feed': 'atom:title',
- 'atom03:feed': 'atom03:title'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:title',
+ 'atom03:feed': 'atom03:title'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getLink(self):
- return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
+ def getLink(self):
+ return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
- def setLink(self, value):
- table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
- 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
- element = self.xgetCreate(table)
- element.attrib['href'] = value
+ def setLink(self, value):
+ table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
+ 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
+ element = self.xgetCreate(table)
+ element.attrib['href'] = value
- def getDesc(self):
- # default "type" is "text"
- element = self.xget('atom:summary|atom03:summary')
- if element is not None:
- return innerHTML(element)
- else:
- return ""
+ def getDesc(self):
+ # default "type" is "text"
+ element = self.xget('atom:summary|atom03:summary')
+ if element is not None:
+ return innerHTML(element)
+ else:
+ return ""
- def setDesc(self, value):
- if not value:
- return self.xdel('atom:summary|atom03:summary')
+ def setDesc(self, value):
+ if not value:
+ return self.xdel('atom:summary|atom03:summary')
- table = { 'atom:feed': 'atom:summary',
- 'atom03:feed': 'atom03:summary'}
- element = self.xgetCreate(table)
- if element.attrib.get('type', '') == 'xhtml':
- cleanNode(element)
- element.attrib['type'] = 'html'
- element.text = value
+ table = { 'atom:feed': 'atom:summary',
+ 'atom03:feed': 'atom03:summary'}
+ element = self.xgetCreate(table)
+ if element.attrib.get('type', '') == 'xhtml':
+ cleanNode(element)
+ element.attrib['type'] = 'html'
+ element.text = value
- def getContent(self):
- element = self.xget('atom:content|atom03:content')
- if element is not None:
- return innerHTML(element)
- else:
- return ""
+ def getContent(self):
+ element = self.xget('atom:content|atom03:content')
+ if element is not None:
+ return innerHTML(element)
+ else:
+ return ""
- def setContent(self, value):
- if not value:
- return self.xdel('atom:content|atom03:content')
+ def setContent(self, value):
+ if not value:
+ return self.xdel('atom:content|atom03:content')
- table = { 'atom:feed': 'atom:content',
- 'atom03:feed': 'atom03:content'}
- element = self.xgetCreate(table)
- if element.attrib.get('type', '') == 'xhtml':
- cleanNode(element)
- element.attrib['type'] = 'html'
- element.text = value
+ table = { 'atom:feed': 'atom:content',
+ 'atom03:feed': 'atom03:content'}
+ element = self.xgetCreate(table)
+ if element.attrib.get('type', '') == 'xhtml':
+ cleanNode(element)
+ element.attrib['type'] = 'html'
+ element.text = value
- def getId(self):
- return self.xval('atom:id|atom03:id')
+ def getId(self):
+ return self.xval('atom:id|atom03:id')
- def setId(self, value):
- if not value:
- return self.xdel('atom:id|atom03:id')
+ def setId(self, value):
+ if not value:
+ return self.xdel('atom:id|atom03:id')
- table = { 'atom:feed': 'atom:id',
- 'atom03:feed': 'atom03:id'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:id',
+ 'atom03:feed': 'atom03:id'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getTime(self):
- return self.xval('atom:published|atom03:published')
+ def getTime(self):
+ return self.xval('atom:published|atom03:published')
- def setTime(self, value):
- if not value:
- return self.xdel('atom:published|atom03:published')
+ def setTime(self, value):
+ if not value:
+ return self.xdel('atom:published|atom03:published')
- table = { 'atom:feed': 'atom:published',
- 'atom03:feed': 'atom03:published'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:published',
+ 'atom03:feed': 'atom03:published'}
+ element = self.xgetCreate(table)
+ element.text = value
- def getUpdated(self):
- return self.xval('atom:updated|atom03:updated')
+ def getUpdated(self):
+ return self.xval('atom:updated|atom03:updated')
- def setUpdated(self, value):
- if not value:
- return self.xdel('atom:updated|atom03:updated')
+ def setUpdated(self, value):
+ if not value:
+ return self.xdel('atom:updated|atom03:updated')
- table = { 'atom:feed': 'atom:updated',
- 'atom03:feed': 'atom03:updated'}
- element = self.xgetCreate(table)
- element.text = value
+ table = { 'atom:feed': 'atom:updated',
+ 'atom03:feed': 'atom03:updated'}
+ element = self.xgetCreate(table)
+ element.text = value
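# Illustration (not part of the patch): a minimal sketch of the feeds API defined above;
# the file name and values are made up.
#     from morss import feeds
#     rss = feeds.parse(open('example.xml').read())    # or feeds.load(url)
#     print rss.title
#     for item in rss.items:
#         print item.title, item.link
#     new = rss.items.append()                         # items is backed by the XML tree
#     new.title = 'Hello'
#     new.link = 'http://example.org/'
#     print rss.tostring()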
diff --git a/morss/morss.py b/morss/morss.py
index 611cb0f..2b9884c 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -31,21 +31,21 @@ from StringIO import StringIO
from readability import readability
from html2text import HTML2Text
-LIM_ITEM = 100 # deletes what's beyond
-LIM_TIME = 7 # deletes what's after
-MAX_ITEM = 50 # cache-only beyond
-MAX_TIME = 7 # cache-only after (in sec)
-DELAY = 10*60 # xml cache & ETag cache (in sec)
-TIMEOUT = 2 # http timeout (in sec)
-THREADS = 10 # number of threads (1 for single-threaded)
+LIM_ITEM = 100 # deletes what's beyond
+LIM_TIME = 7 # deletes what's after
+MAX_ITEM = 50 # cache-only beyond
+MAX_TIME = 7 # cache-only after (in sec)
+DELAY = 10*60 # xml cache & ETag cache (in sec)
+TIMEOUT = 2 # http timeout (in sec)
+THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
- 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+ 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
FBAPPID = ""
FBSECRET = ""
@@ -54,791 +54,791 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET
PROTOCOL = ['http', 'https', 'ftp']
if 'SCRIPT_NAME' in os.environ:
- httplib.HTTPConnection.debuglevel = 1
+ httplib.HTTPConnection.debuglevel = 1
- import cgitb
- cgitb.enable()
+ import cgitb
+ cgitb.enable()
class MorssException(Exception):
- pass
+ pass
def log(txt, force=False):
- if DEBUG or force:
- if 'REQUEST_URI' in os.environ:
- open('morss.log', 'a').write("%s\n" % repr(txt))
- else:
- print repr(txt)
+ if DEBUG or force:
+ if 'REQUEST_URI' in os.environ:
+ open('morss.log', 'a').write("%s\n" % repr(txt))
+ else:
+ print repr(txt)
def lenHTML(txt):
- if len(txt):
- return len(lxml.html.fromstring(txt).text_content())
- else:
- return 0
+ if len(txt):
+ return len(lxml.html.fromstring(txt).text_content())
+ else:
+ return 0
def countWord(txt):
- if len(txt):
- return len(lxml.html.fromstring(txt).text_content().split())
- else:
- return 0
+ if len(txt):
+ return len(lxml.html.fromstring(txt).text_content().split())
+ else:
+ return 0
class Options:
- def __init__(self, options=None):
- self.options = options or []
+ def __init__(self, options=None):
+ self.options = options or []
- def __getattr__(self, key):
- return key in self.options
+ def __getattr__(self, key):
+ return key in self.options
- def __setitem__(self, key, value):
- self.options[key] = value
+ def __setitem__(self, key, value):
+ self.options[key] = value
- def __contains__(self, key):
- return key in self.options
+ def __contains__(self, key):
+ return key in self.options
class Cache:
- """ Light, error-prone caching system. """
- def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
- self._key = key
- self._dir = folder
- self._lifespan = lifespan
+ """ Light, error-prone caching system. """
+ def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
+ self._key = key
+ self._dir = folder
+ self._lifespan = lifespan
- self._cache = {}
+ self._cache = {}
- if self._dir is None:
- self._hash = "NO CACHE"
- return
+ if self._dir is None:
+ self._hash = "NO CACHE"
+ return
- maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
- self._hash = urllib.quote_plus(self._key)[:maxsize]
+ maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
+ self._hash = urllib.quote_plus(self._key)[:maxsize]
- self._file = self._dir + '/' + self._hash
- self._file_tmp = self._file + '.tmp'
+ self._file = self._dir + '/' + self._hash
+ self._file_tmp = self._file + '.tmp'
- if os.path.isfile(self._file):
- data = open(self._file).read()
- if data:
- self._cache = json.loads(data)
+ if os.path.isfile(self._file):
+ data = open(self._file).read()
+ if data:
+ self._cache = json.loads(data)
- def __del__(self):
- self.save()
+ def __del__(self):
+ self.save()
- def __contains__(self, key):
- return key in self._cache
+ def __contains__(self, key):
+ return key in self._cache
- def get(self, key):
- if key in self._cache:
- self._cache[key]['last'] = time.time()
- return self._cache[key]['value']
- else:
- return None
+ def get(self, key):
+ if key in self._cache:
+ self._cache[key]['last'] = time.time()
+ return self._cache[key]['value']
+ else:
+ return None
- def set(self, key, content):
- self._cache[key] = {'last': time.time(), 'value': content}
+ def set(self, key, content):
+ self._cache[key] = {'last': time.time(), 'value': content}
- __getitem__ = get
- __setitem__ = set
+ __getitem__ = get
+ __setitem__ = set
- def save(self):
- if len(self._cache) == 0 or self._dir is None:
- return
+ def save(self):
+ if len(self._cache) == 0 or self._dir is None:
+ return
- if not os.path.exists(self._dir):
- os.makedirs(self._dir)
+ if not os.path.exists(self._dir):
+ os.makedirs(self._dir)
- for i in self._cache.keys():
- if time.time() - self._cache[i]['last'] > self._lifespan > -1:
- del self._cache[i]
+ for i in self._cache.keys():
+ if time.time() - self._cache[i]['last'] > self._lifespan > -1:
+ del self._cache[i]
- out = json.dumps(self._cache, indent=4)
+ out = json.dumps(self._cache, indent=4)
- try:
- open(self._file_tmp, 'w+').write(out)
- os.rename(self._file_tmp, self._file)
- except IOError:
- log('failed to write cache to tmp file')
- except OSError:
- log('failed to move cache to file')
+ try:
+ open(self._file_tmp, 'w+').write(out)
+ os.rename(self._file_tmp, self._file)
+ except IOError:
+ log('failed to write cache to tmp file')
+ except OSError:
+ log('failed to move cache to file')
- def last(self, key):
- if key not in self._cache:
- return -1
+ def last(self, key):
+ if key not in self._cache:
+ return -1
- return self._cache[key]['last']
+ return self._cache[key]['last']
- def age(self, key):
- if key not in self._cache:
- return -1
+ def age(self, key):
+ if key not in self._cache:
+ return -1
- return time.time() - self.last(key)
+ return time.time() - self.last(key)
- def new(self, *arg, **karg):
- """ Returns a Cache object in the same directory """
- if arg[0] != self._key:
- return Cache(self._dir, *arg, **karg)
- else:
- return self
+ def new(self, *arg, **karg):
+ """ Returns a Cache object in the same directory """
+ if arg[0] != self._key:
+ return Cache(self._dir, *arg, **karg)
+ else:
+ return self
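# Illustration (not part of the patch): a minimal sketch of how Cache is typically used;
# the folder, key and fetch_url() helper are made up.
#     cache = Cache('./cache', key='http://example.com/feed')
#     if 'xml' in cache and cache.age('xml') < DELAY:
#         data = cache.get('xml')
#     else:
#         data = fetch_url('http://example.com/feed')  # hypothetical helper
#         cache.set('xml', data)
#     # stale entries (older than the lifespan) are dropped when the cache is saved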
class SimpleDownload(urllib2.HTTPCookieProcessor):
- """
- Custom urllib2 handler to download a page, using etag/last-modified headers,
- to save bandwidth. The given headers are added back into the header on error
- 304 for easier use.
- """
- def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
- urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
- self.cache = cache
- self.etag = etag
- self.lastmodified = lastmodified
- self.useragent = useragent
- self.decode = decode
- self.accept = accept
- self.strict = strict
+ """
+ Custom urllib2 handler that downloads a page using etag/last-modified headers
+ to save bandwidth. On a 304 response, the given headers are added back to the
+ response for easier use.
+ """
+ def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
+ urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
+ self.cache = cache
+ self.etag = etag
+ self.lastmodified = lastmodified
+ self.useragent = useragent
+ self.decode = decode
+ self.accept = accept
+ self.strict = strict
- def http_request(self, req):
- urllib2.HTTPCookieProcessor.http_request(self, req)
- req.add_unredirected_header('Accept-Encoding', 'gzip')
- req.add_unredirected_header('User-Agent', self.useragent)
- if req.get_host() != 'feeds.feedburner.com':
- req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
+ def http_request(self, req):
+ urllib2.HTTPCookieProcessor.http_request(self, req)
+ req.add_unredirected_header('Accept-Encoding', 'gzip')
+ req.add_unredirected_header('User-Agent', self.useragent)
+ if req.get_host() != 'feeds.feedburner.com':
+ req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
- if self.cache:
- if self.etag:
- req.add_unredirected_header('If-None-Match', self.etag)
- if self.lastmodified:
- req.add_unredirected_header('If-Modified-Since', self.lastmodified)
+ if self.cache:
+ if self.etag:
+ req.add_unredirected_header('If-None-Match', self.etag)
+ if self.lastmodified:
+ req.add_unredirected_header('If-Modified-Since', self.lastmodified)
- if self.accept is not None:
- if isinstance(self.accept, basestring):
- self.accept = (self.accept,)
+ if self.accept is not None:
+ if isinstance(self.accept, basestring):
+ self.accept = (self.accept,)
- out = {}
- rank = 1.1
- for group in self.accept:
- rank = rank - 0.1
+ out = {}
+ rank = 1.1
+ for group in self.accept:
+ rank = rank - 0.1
- if isinstance(group, basestring):
- if group in MIMETYPE:
- group = MIMETYPE[group]
- else:
- out[group] = rank
- continue
+ if isinstance(group, basestring):
+ if group in MIMETYPE:
+ group = MIMETYPE[group]
+ else:
+ out[group] = rank
+ continue
- for mime in group:
- if mime not in out:
- out[mime] = rank
+ for mime in group:
+ if mime not in out:
+ out[mime] = rank
- if not self.strict:
- out['*/*'] = rank-0.1
+ if not self.strict:
+ out['*/*'] = rank-0.1
- string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
- req.add_unredirected_header('Accept', string)
+ string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+ req.add_unredirected_header('Accept', string)
- return req
+ return req
- def http_error_304(self, req, fp, code, msg, headers):
- log('http cached')
- if self.etag:
- headers.addheader('etag', self.etag)
- if self.lastmodified:
- headers.addheader('last-modified', self.lastmodified)
- resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
- return resp
+ def http_error_304(self, req, fp, code, msg, headers):
+ log('http cached')
+ if self.etag:
+ headers.addheader('etag', self.etag)
+ if self.lastmodified:
+ headers.addheader('last-modified', self.lastmodified)
+ resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
+ return resp
- def http_response(self, req, resp):
- urllib2.HTTPCookieProcessor.http_response(self, req, resp)
- data = resp.read()
+ def http_response(self, req, resp):
+ urllib2.HTTPCookieProcessor.http_response(self, req, resp)
+ data = resp.read()
- if 200 <= resp.code < 300:
- # gzip
- if resp.headers.get('Content-Encoding') == 'gzip':
- log('un-gzip')
- data = GzipFile(fileobj=StringIO(data), mode='r').read()
+ if 200 <= resp.code < 300:
+ # gzip
+ if resp.headers.get('Content-Encoding') == 'gzip':
+ log('un-gzip')
+ data = GzipFile(fileobj=StringIO(data), mode='r').read()
- if 200 <= resp.code < 300 and resp.info().maintype == 'text':
- # redirect
- if resp.info().type in MIMETYPE['html']:
- match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
- if match:
- newurl = match.groups()[0]
- log('redirect: %s' % newurl)
+ if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+ # redirect
+ if resp.info().type in MIMETYPE['html']:
+ match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
+ if match:
+ newurl = match.groups()[0]
+ log('redirect: %s' % newurl)
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ('content-length', 'content-type'))
- new = urllib2.Request(newurl,
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
+ newheaders = dict((k,v) for k,v in req.headers.items()
+ if k.lower() not in ('content-length', 'content-type'))
+ new = urllib2.Request(newurl,
+ headers=newheaders,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True)
- return self.parent.open(new, timeout=req.timeout)
+ return self.parent.open(new, timeout=req.timeout)
- # encoding
- enc = detEncoding(data, resp)
+ # encoding
+ enc = detEncoding(data, resp)
- if enc:
- data = data.decode(enc, 'replace')
+ if enc:
+ data = data.decode(enc, 'replace')
- if not self.decode:
- data = data.encode(enc)
+ if not self.decode:
+ data = data.encode(enc)
- fp = StringIO(data)
- old_resp = resp
- resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
- resp.msg = old_resp.msg
+ fp = StringIO(data)
+ old_resp = resp
+ resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+ resp.msg = old_resp.msg
- return resp
+ return resp
- https_response = http_response
- https_request = http_request
+ https_response = http_response
+ https_request = http_request
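# Illustration (not part of the patch): a sketch of wiring SimpleDownload into urllib2;
# cache and url are assumed to exist in the caller.
#     handler = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
#                              accept=('xml', 'html'))
#     con = urllib2.build_opener(handler).open(url, timeout=TIMEOUT)
#     data = con.read()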
def detEncoding(data, con=None):
- if con is not None and con.headers.getparam('charset'):
- log('header')
- return con.headers.getparam('charset')
+ if con is not None and con.headers.getparam('charset'):
+ log('header')
+ return con.headers.getparam('charset')
- match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
- if match:
- log('meta.re')
- return match.groups()[0]
+ match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
+ if match:
+ log('meta.re')
+ return match.groups()[0]
- match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
- if match:
- return match.groups()[0].lower()
+ match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
+ if match:
+ return match.groups()[0].lower()
- return None
+ return None
def Fix(item, feedurl='/'):
- """ Improves feed items (absolute links, resolve feedburner links, etc) """
+ """ Improves feed items (absolute links, resolve feedburner links, etc) """
- # check unwanted uppercase title
- if len(item.title) > 20 and item.title.isupper():
- item.title = item.title.title()
+ # check unwanted uppercase title
+ if len(item.title) > 20 and item.title.isupper():
+ item.title = item.title.title()
- # check if it includes link
- if not item.link:
- log('no link')
- return item
+ # check if it includes link
+ if not item.link:
+ log('no link')
+ return item
- # wikipedia daily highlight
- if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'):
- match = lxml.html.fromstring(item.desc).xpath('//b/a/@href')
- if len(match):
- item.link = match[0]
- log(item.link)
+ # wikipedia daily highlight
+ if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'):
+ match = lxml.html.fromstring(item.desc).xpath('//b/a/@href')
+ if len(match):
+ item.link = match[0]
+ log(item.link)
- # check relative urls
- item.link = urlparse.urljoin(feedurl, item.link)
+ # check relative urls
+ item.link = urlparse.urljoin(feedurl, item.link)
- # google translate
- if fnmatch(item.link, 'http://translate.google.*/translate*u=*'):
- item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
- log(item.link)
+ # google translate
+ if fnmatch(item.link, 'http://translate.google.*/translate*u=*'):
+ item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
+ log(item.link)
- # google
- if fnmatch(item.link, 'http://www.google.*/url?q=*'):
- item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
- log(item.link)
+ # google
+ if fnmatch(item.link, 'http://www.google.*/url?q=*'):
+ item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
+ log(item.link)
- # google news
- if fnmatch(item.link, 'http://news.google.com/news/url*url=*'):
- item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0]
- log(item.link)
+ # google news
+ if fnmatch(item.link, 'http://news.google.com/news/url*url=*'):
+ item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0]
+ log(item.link)
- # facebook
- if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
- item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
- log(item.link)
+ # facebook
+ if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
+ item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
+ log(item.link)
- # feedburner
- feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
- match = item.xval('feedburner:origLink')
- if match:
- item.link = match
+ # feedburner
+ feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
+ match = item.xval('feedburner:origLink')
+ if match:
+ item.link = match
- # feedsportal
- match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
- if match:
- url = match.groups()[0].split('0')
- t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
- item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
- log(item.link)
+ # feedsportal
+ match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
+ if match:
+ url = match.groups()[0].split('0')
+ t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
+ item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
+ log(item.link)
- # reddit
- if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
- match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
- if len(match):
- item.link = match[0]
- log(item.link)
+ # reddit
+ if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
+ match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
+ if len(match):
+ item.link = match[0]
+ log(item.link)
- return item
+ return item
def Fill(item, cache, feedurl='/', fast=False):
- """ Returns True when it has done its best """
+ """ Returns True when it has done its best """
- if not item.link:
- log('no link')
- return item
+ if not item.link:
+ log('no link')
+ return item
- log(item.link)
+ log(item.link)
- # content already provided?
- count_content = countWord(item.content)
- count_desc = countWord(item.desc)
+ # content already provided?
+ count_content = countWord(item.content)
+ count_desc = countWord(item.desc)
- if max(count_content, count_desc) > 500:
- if count_desc > count_content:
- item.content = item.desc
- del item.desc
- log('reversed sizes')
- log('long enough')
- return True
+ if max(count_content, count_desc) > 500:
+ if count_desc > count_content:
+ item.content = item.desc
+ del item.desc
+ log('reversed sizes')
+ log('long enough')
+ return True
- if count_content > 5*count_desc > 0 and count_content > 50:
- log('content bigger enough')
- return True
+ if count_content > 5*count_desc > 0 and count_content > 50:
+ log('content bigger enough')
+ return True
- link = item.link
+ link = item.link
- # twitter
- if urlparse.urlparse(feedurl).netloc == 'twitter.com':
- match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
- if len(match):
- link = match[0]
- log(link)
- else:
- link = None
+ # twitter
+ if urlparse.urlparse(feedurl).netloc == 'twitter.com':
+ match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
+ if len(match):
+ link = match[0]
+ log(link)
+ else:
+ link = None
- # facebook
- if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
- match = lxml.html.fromstring(item.content).xpath('//a/@href')
- if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
- link = match[0]
- log(link)
- else:
- link = None
+ # facebook
+ if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
+ match = lxml.html.fromstring(item.content).xpath('//a/@href')
+ if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
+ link = match[0]
+ log(link)
+ else:
+ link = None
- if link is None:
- log('no used link')
- return True
+ if link is None:
+ log('no used link')
+ return True
- # check cache and previous errors
- if link in cache:
- content = cache.get(link)
- match = re.search(r'^error-([a-z]{2,10})$', content)
- if match:
- if cache.age(link) > DELAY:
- log('cached error: %s' % match.groups()[0])
- return True
- else:
- log('old error')
- else:
- log('cached')
- item.pushContent(cache.get(link))
- return True
+ # check cache and previous errors
+ if link in cache:
+ content = cache.get(link)
+ match = re.search(r'^error-([a-z]{2,10})$', content)
+ if match:
+ if cache.age(link) > DELAY:
+ log('cached error: %s' % match.groups()[0])
+ return True
+ else:
+ log('old error')
+ else:
+ log('cached')
+ item.pushContent(cache.get(link))
+ return True
- # super-fast mode
- if fast:
- log('skipped')
- return False
+ # super-fast mode
+ if fast:
+ log('skipped')
+ return False
- # download
- try:
- url = link.encode('utf-8')
- con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
- data = con.read()
- except (IOError, httplib.HTTPException) as e:
- log('http error: %s' % e.message)
- cache.set(link, 'error-http')
- return True
+ # download
+ try:
+ url = link.encode('utf-8')
+ con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
+ data = con.read()
+ except (IOError, httplib.HTTPException) as e:
+ log('http error: %s' % e.message)
+ cache.set(link, 'error-http')
+ return True
- if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
- log('non-text page')
- cache.set(link, 'error-type')
- return True
+ if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
+ log('non-text page')
+ cache.set(link, 'error-type')
+ return True
- out = readability.Document(data, url=con.url).summary(True)
+ out = readability.Document(data, url=con.url).summary(True)
- if countWord(out) > max(count_content, count_desc) > 0:
- item.pushContent(out)
- cache.set(link, out)
- else:
- log('not bigger enough')
- cache.set(link, 'error-length')
- return True
+ if countWord(out) > max(count_content, count_desc) > 0:
+ item.pushContent(out)
+ cache.set(link, out)
+ else:
+ log('not bigger enough')
+ cache.set(link, 'error-length')
+ return True
- return True
+ return True
def Init(url, cachePath, options):
- # url clean up
- log(url)
+ # url clean up
+ log(url)
- if url is None:
- raise MorssException('No url provided')
+ if url is None:
+ raise MorssException('No url provided')
- if urlparse.urlparse(url).scheme not in PROTOCOL:
- url = 'http://' + url
- log(url)
+ if urlparse.urlparse(url).scheme not in PROTOCOL:
+ url = 'http://' + url
+ log(url)
- url = url.replace(' ', '%20')
+ url = url.replace(' ', '%20')
- # cache
- cache = Cache(cachePath, url)
- log(cache._hash)
+ # cache
+ cache = Cache(cachePath, url)
+ log(cache._hash)
- return (url, cache)
+ return (url, cache)
def Fetch(url, cache, options):
- # do some useful facebook work
- feedify.PreWorker(url, cache)
+ # do some useful facebook work
+ feedify.PreWorker(url, cache)
- if 'redirect' in cache:
- url = cache.get('redirect')
- log('url redirect')
- log(url)
+ if 'redirect' in cache:
+ url = cache.get('redirect')
+ log('url redirect')
+ log(url)
- # fetch feed
- if not options.theforce and 'xml' in cache and cache.age('xml') < DELAY and 'style' in cache:
- log('xml cached')
- xml = cache.get('xml')
- style = cache.get('style')
- else:
- try:
- opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
- con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2)
- xml = con.read()
- except (IOError, httplib.HTTPException):
- raise MorssException('Error downloading feed')
+ # fetch feed
+ if not options.theforce and 'xml' in cache and cache.age('xml') < DELAY and 'style' in cache:
+ log('xml cached')
+ xml = cache.get('xml')
+ style = cache.get('style')
+ else:
+ try:
+ opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
+ con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2)
+ xml = con.read()
+ except (IOError, httplib.HTTPException):
+ raise MorssException('Error downloading feed')
- cache.set('xml', xml)
- cache.set('etag', con.headers.getheader('etag'))
- cache.set('lastmodified', con.headers.getheader('last-modified'))
+ cache.set('xml', xml)
+ cache.set('etag', con.headers.getheader('etag'))
+ cache.set('lastmodified', con.headers.getheader('last-modified'))
- if url.startswith('https://itunes.apple.com/lookup?id='):
- style = 'itunes'
- elif xml.startswith('<?xml'):
+ if url.startswith('https://itunes.apple.com/lookup?id='):
+ style = 'itunes'
+ elif xml.startswith('<?xml'):
def Gather(rss, url, cache, options):
- def worker(i, item):
- if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
- log('dropped')
- item.remove()
- return
+ def worker(i, item):
+ if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
+ log('dropped')
+ item.remove()
+ return
- item = Fix(item, url)
+ item = Fix(item, url)
- if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
- if not options.proxy:
- if Fill(item, cache, url, True) is False:
- item.remove()
- return
- else:
- if not options.proxy:
- Fill(item, cache, url)
+ if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
+ if not options.proxy:
+ if Fill(item, cache, url, True) is False:
+ item.remove()
+ return
+ else:
+ if not options.proxy:
+ Fill(item, cache, url)
- queue = Queue.Queue()
+ queue = Queue.Queue()
- for i in range(THREADS):
- t = threading.Thread(target=runner, args=(queue,))
- t.daemon = True
- t.start()
+ for i in range(THREADS):
+ t = threading.Thread(target=runner, args=(queue,))
+ t.daemon = True
+ t.start()
- for i, item in enumerate(rss.items):
- queue.put([i, item])
+ for i, item in enumerate(rss.items):
+ queue.put([i, item])
- queue.join()
- cache.save()
+ queue.join()
+ cache.save()
- if options.ad:
- new = rss.items.append()
- new.title = "Are you hungry?"
- new.desc = "Eat some Galler chocolate :)"
- new.link = "http://www.galler.com/"
- new.time = "5 Oct 2013 22:42"
+ if options.ad:
+ new = rss.items.append()
+ new.title = "Are you hungry?"
+ new.desc = "Eat some Galler chocolate :)"
+ new.link = "http://www.galler.com/"
+ new.time = "5 Oct 2013 22:42"
- log(len(rss.items))
- log(time.time() - startTime)
+ log(len(rss.items))
+ log(time.time() - startTime)
- return rss
+ return rss
def After(rss, options):
- for i, item in enumerate(rss.items):
+ for i, item in enumerate(rss.items):
- if item.desc and item.content:
- if options.clip:
- item.content = item.desc + "
* * *
" + item.content
- del item.desc
- if not options.keep:
- del item.desc
+ if item.desc and item.content:
+ if options.clip:
+ item.content = item.desc + "
* * *
" + item.content
+ del item.desc
+ if not options.keep:
+ del item.desc
- if options.nolink and item.content:
- content = lxml.html.fromstring(item.content)
- for link in content.xpath('//a'):
- log(link.text_content())
- link.drop_tag()
- item.content = lxml.etree.tostring(content)
+ if options.nolink and item.content:
+ content = lxml.html.fromstring(item.content)
+ for link in content.xpath('//a'):
+ log(link.text_content())
+ link.drop_tag()
+ item.content = lxml.etree.tostring(content)
- if options.noref:
- item.link = ''
+ if options.noref:
+ item.link = ''
- if options.md:
- conv = HTML2Text(baseurl=item.link)
- conv.unicode_snob = True
+ if options.md:
+ conv = HTML2Text(baseurl=item.link)
+ conv.unicode_snob = True
- if item.desc:
- item.desc = conv.handle(item.desc)
- if item.content:
- item.content = conv.handle(item.content)
+ if item.desc:
+ item.desc = conv.handle(item.desc)
+ if item.content:
+ item.content = conv.handle(item.content)
- if options.json:
- if options.indent:
- return rss.tojson(indent=4)
- else:
- return rss.tojson()
- elif options.csv:
- return rss.tocsv()
- elif options.reader:
- return rss.tohtml()
- else:
- return rss.tostring(xml_declaration=True, encoding='UTF-8')
+ if options.json:
+ if options.indent:
+ return rss.tojson(indent=4)
+ else:
+ return rss.tojson()
+ elif options.csv:
+ return rss.tocsv()
+ elif options.reader:
+ return rss.tohtml()
+ else:
+ return rss.tostring(xml_declaration=True, encoding='UTF-8')
def process(url, cache=None, options=None):
- if options == None:
- options = []
+ if options == None:
+ options = []
- options = Options(options)
- url, cache = Init(url, cache, options)
- rss = Fetch(url, cache, options)
- rss = Gather(rss, url, cache, options)
+ options = Options(options)
+ url, cache = Init(url, cache, options)
+ rss = Fetch(url, cache, options)
+ rss = Gather(rss, url, cache, options)
- return After(rss, options)
+ return After(rss, options)
def cgi_app(environ, start_response):
- # get options
- if 'REQUEST_URI' in environ:
- url = environ['REQUEST_URI'][1:]
- else:
- url = environ['PATH_INFO'][1:]
+ # get options
+ if 'REQUEST_URI' in environ:
+ url = environ['REQUEST_URI'][1:]
+ else:
+ url = environ['PATH_INFO'][1:]
- url = re.sub(r'^/?morss.py/', '', url)
+ url = re.sub(r'^/?morss.py/', '', url)
- if url.startswith(':'):
- options = url.split('/')[0].split(':')[1:]
- url = url.split('/', 1)[1]
- else:
- options = []
+ if url.startswith(':'):
+ options = url.split('/')[0].split(':')[1:]
+ url = url.split('/', 1)[1]
+ else:
+ options = []
- # init
- options = Options(options)
- headers = {}
+ # init
+ options = Options(options)
+ headers = {}
- global DEBUG
- DEBUG = options.debug
+ global DEBUG
+ DEBUG = options.debug
- if 'HTTP_IF_NONE_MATCH' in environ:
- if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
- headers['status'] = '304 Not Modified'
- start_response(headers['status'], headers.items())
- log(url)
- log('etag good')
- return []
+ if 'HTTP_IF_NONE_MATCH' in environ:
+ if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+ headers['status'] = '304 Not Modified'
+ start_response(headers['status'], headers.items())
+ log(url)
+ log('etag good')
+ return []
- # headers
- headers['status'] = '200 OK'
- headers['etag'] = '"%s"' % int(time.time())
+ # headers
+ headers['status'] = '200 OK'
+ headers['etag'] = '"%s"' % int(time.time())
- if options.html or options.reader:
- headers['content-type'] = 'text/html'
- elif options.debug or options.txt:
- headers['content-type'] = 'text/plain'
- elif options.json:
- headers['content-type'] = 'application/json'
- elif options.csv:
- headers['content-type'] = 'text/csv'
- headers['content-disposition'] = 'attachment; filename="feed.csv"'
- else:
- headers['content-type'] = 'text/xml'
+ if options.html or options.reader:
+ headers['content-type'] = 'text/html'
+ elif options.debug or options.txt:
+ headers['content-type'] = 'text/plain'
+ elif options.json:
+ headers['content-type'] = 'application/json'
+ elif options.csv:
+ headers['content-type'] = 'text/csv'
+ headers['content-disposition'] = 'attachment; filename="feed.csv"'
+ else:
+ headers['content-type'] = 'text/xml'
- url, cache = Init(url, os.getcwd() + '/cache', options)
+ url, cache = Init(url, os.getcwd() + '/cache', options)
- if options.facebook:
- doFacebook(url, environ, headers, options, cache)
- start_response(headers['status'], headers.items())
- return
+ if options.facebook:
+ doFacebook(url, environ, headers, options, cache)
+ start_response(headers['status'], headers.items())
+ return
- # get the work done
- RSS = Fetch(url, cache, options)
+ # get the work done
+ RSS = Fetch(url, cache, options)
- if headers['content-type'] == 'text/xml':
- headers['content-type'] = RSS.mimetype
+ if headers['content-type'] == 'text/xml':
+ headers['content-type'] = RSS.mimetype
- start_response(headers['status'], headers.items())
+ start_response(headers['status'], headers.items())
- RSS = Gather(RSS, url, cache, options)
+ RSS = Gather(RSS, url, cache, options)
- if not DEBUG and not options.silent:
- return After(RSS, options)
+ if not DEBUG and not options.silent:
+ return After(RSS, options)
- log('done')
+ log('done')
def cgi_wrapper(environ, start_response):
- # simple http server for html and css
- files = {
- '': 'text/html',
- 'index.html': 'text/html'}
+ # simple http server for html and css
+ files = {
+ '': 'text/html',
+ 'index.html': 'text/html'}
- if 'REQUEST_URI' in environ:
- url = environ['REQUEST_URI'][1:]
- else:
- url = environ['PATH_INFO'][1:]
+ if 'REQUEST_URI' in environ:
+ url = environ['REQUEST_URI'][1:]
+ else:
+ url = environ['PATH_INFO'][1:]
- if url in files:
- headers = {}
+ if url in files:
+ headers = {}
- if url == '':
- url = 'index.html'
+ if url == '':
+ url = 'index.html'
- if os.path.isfile(url):
- headers['status'] = '200 OK'
- headers['content-type'] = files[url]
- start_response(headers['status'], headers.items())
- return open(url, 'rb').read()
- else:
- headers['status'] = '404 Not found'
- start_response(headers['status'], headers.items())
- return ''
+ if os.path.isfile(url):
+ headers['status'] = '200 OK'
+ headers['content-type'] = files[url]
+ start_response(headers['status'], headers.items())
+ return open(url, 'rb').read()
+ else:
+ headers['status'] = '404 Not found'
+ start_response(headers['status'], headers.items())
+ return ''
- # actual morss use
- try:
- return cgi_app(environ, start_response) or []
- except (KeyboardInterrupt, SystemExit):
- raise
- except Exception as e:
- headers = {}
- headers['status'] = '500 Oops'
- headers['content-type'] = 'text/plain'
- start_response(headers['status'], headers.items(), sys.exc_info())
- log('ERROR: %s' % e.message, force=True)
- return 'An error happened'
+ # actual morss use
+ try:
+ return cgi_app(environ, start_response) or []
+ except (KeyboardInterrupt, SystemExit):
+ raise
+ except Exception as e:
+ headers = {}
+ headers['status'] = '500 Oops'
+ headers['content-type'] = 'text/plain'
+ start_response(headers['status'], headers.items(), sys.exc_info())
+ log('ERROR: %s' % e.message, force=True)
+ return 'An error happened'
def cli_app():
- options = Options(sys.argv[1:-1])
- url = sys.argv[-1]
+ options = Options(sys.argv[1:-1])
+ url = sys.argv[-1]
- global DEBUG
- DEBUG = options.debug
+ global DEBUG
+ DEBUG = options.debug
- url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
- RSS = Fetch(url, cache, options)
- RSS = Gather(RSS, url, cache, options)
+ url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
+ RSS = Fetch(url, cache, options)
+ RSS = Gather(RSS, url, cache, options)
- if not DEBUG and not options.silent:
- print After(RSS, options)
+ if not DEBUG and not options.silent:
+ print After(RSS, options)
- log('done')
+ log('done')
def doFacebook(url, environ, headers, options, cache):
- log('fb stuff')
+ log('fb stuff')
- query = urlparse.urlparse(url).query
+ query = urlparse.urlparse(url).query
- if 'code' in query:
- # get real token from code
- code = urlparse.parse_qs(query)['code'][0]
- eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
- token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
+ if 'code' in query:
+ # get real token from code
+ code = urlparse.parse_qs(query)['code'][0]
+ eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
+ token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
- # get long-lived access token
- eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
- values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
+ # get long-lived access token
+ eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
+ values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
- ltoken = values['access_token'][0]
- expires = int(time.time() + int(values['expires'][0]))
+ ltoken = values['access_token'][0]
+ expires = int(time.time() + int(values['expires'][0]))
- headers['set-cookie'] = 'token={token}; Path=/'.format(token=ltoken)
+ headers['set-cookie'] = 'token={token}; Path=/'.format(token=ltoken)
- # headers
- headers['status'] = '303 See Other'
- headers['location'] = 'http://{domain}/'.format(domain=environ['SERVER_NAME'])
+ # headers
+ headers['status'] = '303 See Other'
+ headers['location'] = 'http://{domain}/'.format(domain=environ['SERVER_NAME'])
- log('fb done')
- return
+ log('fb done')
+ return
def main():
- if 'REQUEST_URI' in os.environ:
- wsgiref.handlers.CGIHandler().run(cgi_wrapper)
+ if 'REQUEST_URI' in os.environ:
+ wsgiref.handlers.CGIHandler().run(cgi_wrapper)
- elif len(sys.argv) <= 1:
- httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
- httpd.serve_forever()
+ elif len(sys.argv) <= 1:
+ httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
+ httpd.serve_forever()
- else:
- try:
- cli_app()
- except (KeyboardInterrupt, SystemExit):
- raise
- except Exception as e:
- print 'ERROR: %s' % e.message
+ else:
+ try:
+ cli_app()
+ except (KeyboardInterrupt, SystemExit):
+ raise
+ except Exception as e:
+ print 'ERROR: %s' % e.message
if __name__ == '__main__':
- main()
+ main()
diff --git a/setup.py b/setup.py
index ef764d3..7715db9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,13 @@
from setuptools import setup, find_packages
package_name = 'morss'
-setup( name=package_name,
- description='Get full-text RSS feeds',
- author='pictuga',
- author_email='contact at author name dot com',
- url='http://morss.it/',
- license='GPL 3+',
- package_dir={package_name: package_name},
- packages=find_packages(),
- package_data={package_name: ['feedify.ini']},
- test_suite=package_name + '.tests')
+setup( name=package_name,
+ description='Get full-text RSS feeds',
+ author='pictuga',
+ author_email='contact at author name dot com',
+ url='http://morss.it/',
+ license='GPL 3+',
+ package_dir={package_name: package_name},
+ packages=find_packages(),
+ package_data={package_name: ['feedify.ini']},
+ test_suite=package_name + '.tests')