diff --git a/morss/feedify.py b/morss/feedify.py
index 6271817..b7dd3d7 100644
--- a/morss/feedify.py
+++ b/morss/feedify.py
@@ -1,23 +1,25 @@
#!/usr/bin/env python
+import re
+import json
+import urlparse
+import urllib2
+
from ConfigParser import ConfigParser
from fnmatch import fnmatch
+import lxml.html
+
import feeds
import morss
-import re
-
-import urllib2
-import lxml.html
-import json
-import urlparse
-def toclass(query):
+def to_class(query):
pattern = r'\[class=([^\]]+)\]'
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
return re.sub(pattern, repl, query)
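+# e.g. to_class('//div[class=item]') expands the [class=...] shorthand into the
+# usual XPath class-matching idiom:
+#   //div[@class and contains(concat(" ", normalize-space(@class), " "), " item ")]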
-def getRule(link):
+
+def get_rule(link):
config = ConfigParser()
config.read('feedify.ini')
@@ -29,10 +31,12 @@ def getRule(link):
return values
return False
-def supported(link):
- return getRule(link) is not False
-def formatString(string, getter, error=False):
+def supported(link):
+ return get_rule(link) is not False
+
+
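+# Template mini-language used by feedify.ini rules (a sketch inferred from the
+# parser below): "quoted text" is copied verbatim, bare names are resolved via
+# `getter` (list values are concatenated, or joined with SEP when written as
+# name<"SEP">), and a {...} group is dropped when its content fails to resolve.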
+def format_string(string, getter, error=False):
out = ""
char = string[0]
@@ -42,41 +46,42 @@ def formatString(string, getter, error=False):
match = follow.partition('"')
out = match[0]
if len(match) >= 2:
- next = match[2]
+ next_match = match[2]
else:
- next = None
+ next_match = None
elif char == '{':
match = follow.partition('}')
try:
- test = formatString(match[0], getter, True)
- except ValueError, KeyError:
+ test = format_string(match[0], getter, True)
+ except (ValueError, KeyError):
pass
else:
out = test
-			next = match[2]
+		next_match = match[2]
elif char == ' ':
- next = follow
+ next_match = follow
elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
- rawValue = getter(match[0])
- if not isinstance(rawValue, basestring):
+ raw_value = getter(match[0])
+ if not isinstance(raw_value, basestring):
if match[1] is not None:
- out = match[1].join(rawValue)
+ out = match[1].join(raw_value)
else:
- out = ''.join(rawValue)
+ out = ''.join(raw_value)
if not out and error:
raise ValueError
- next = match[2]
+ next_match = match[2]
else:
raise ValueError('bogus string')
- if next is not None and len(next):
- return out + formatString(next, getter, error)
+ if next_match is not None and len(next_match):
+ return out + format_string(next_match, getter, error)
else:
return out
-def PreWorker(url, cache):
+
+def pre_worker(url, cache):
if urlparse.urlparse(url).netloc == 'itunes.apple.com':
match = re.search('/id([0-9]+)(\?.*)?$', url)
if match:
@@ -84,6 +89,7 @@ def PreWorker(url, cache):
redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
cache.set('redirect', redirect)
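+			# e.g. an .../id123456 page URL is swapped for
+			# https://itunes.apple.com/lookup?id=123456, whose JSON reply the
+			# feedify.ini rules are assumed to cover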
+
class Builder(object):
def __init__(self, link, data=None, cache=False):
self.link = link
@@ -93,11 +99,11 @@ class Builder(object):
data = urllib2.urlopen(link).read()
self.data = data
- self.rule = getRule(link)
+ self.rule = get_rule(link)
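+		# the rule's mode selects the parser: 'xpath' goes through lxml.html,
+		# 'json' keeps the decoded object for raw() to walk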
if self.rule['mode'] == 'xpath':
if not isinstance(self.data, unicode):
- self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
+ self.data = self.data.decode(morss.detect_encoding(self.data), 'replace')
self.doc = lxml.html.fromstring(self.data)
elif self.rule['mode'] == 'json':
self.doc = json.loads(data)
@@ -106,7 +112,7 @@ class Builder(object):
def raw(self, html, expr):
if self.rule['mode'] == 'xpath':
- return html.xpath(toclass(expr))
+ return html.xpath(to_class(expr))
elif self.rule['mode'] == 'json':
a = [html]
@@ -119,7 +125,7 @@ class Builder(object):
if kids is None:
pass
elif isinstance(kids, list):
- [b.append(i) for i in kids]
+ b += kids
elif isinstance(kids, basestring):
				b.append(kids.replace('\n', '<br/>'))
else:
@@ -128,7 +134,7 @@ class Builder(object):
if match[1] is None:
a = b
else:
- if len(b)-1 >= int(match[1]):
+ if len(b) - 1 >= int(match[1]):
a = [b[int(match[1])]]
else:
a = []
@@ -150,7 +156,7 @@ class Builder(object):
def string(self, html, expr):
getter = lambda x: self.strings(html, x)
- return formatString(self.rule[expr], getter)
+ return format_string(self.rule[expr], getter)
def build(self):
if 'title' in self.rule:
@@ -160,23 +166,22 @@ class Builder(object):
matches = self.raw(self.doc, self.rule['items'])
if matches and len(matches):
for item in matches:
- feedItem = {}
+ feed_item = {}
if 'item_title' in self.rule:
- feedItem['title'] = self.string(item, 'item_title')
+ feed_item['title'] = self.string(item, 'item_title')
if 'item_link' in self.rule:
url = self.string(item, 'item_link')
url = urlparse.urljoin(self.link, url)
- feedItem['link'] = url
+ feed_item['link'] = url
if 'item_desc' in self.rule:
- feedItem['desc'] = self.string(item, 'item_desc')
+ feed_item['desc'] = self.string(item, 'item_desc')
if 'item_content' in self.rule:
- feedItem['content'] = self.string(item, 'item_content')
+ feed_item['content'] = self.string(item, 'item_content')
if 'item_time' in self.rule:
- feedItem['updated'] = self.string(item, 'item_time')
+ feed_item['updated'] = self.string(item, 'item_time')
if 'item_id' in self.rule:
- feedItem['id'] = self.string(item, 'item_id')
- feedItem['isPermaLink'] = False
-
- self.feed.items.append(feedItem)
+ feed_item['id'] = self.string(item, 'item_id')
+			feed_item['is_permalink'] = False
+ self.feed.items.append(feed_item)
diff --git a/morss/feeds.py b/morss/feeds.py
index f18232a..093b5ed 100644
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -1,14 +1,16 @@
#!/usr/bin/env python
-from lxml import etree
from datetime import datetime
-import dateutil.parser
-from dateutil import tz
-import re
-
from StringIO import StringIO
+
+import re
import json
import csv
+import urllib2
+
+from lxml import etree
+from dateutil import tz
+import dateutil.parser
try:
from wheezy.template.engine import Engine
@@ -26,21 +28,22 @@ except ImportError:
Element = etree.Element
-NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
- 'atom03': 'http://purl.org/atom/ns#',
- 'media': 'http://search.yahoo.com/mrss/',
- 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
- 'slash': 'http://purl.org/rss/1.0/modules/slash/',
- 'dc': 'http://purl.org/dc/elements/1.1/',
- 'content': 'http://purl.org/rss/1.0/modules/content/',
- 'rssfake': 'http://purl.org/rss/1.0/'}
+NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
+ 'atom03': 'http://purl.org/atom/ns#',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
+ 'slash': 'http://purl.org/rss/1.0/modules/slash/',
+ 'dc': 'http://purl.org/dc/elements/1.1/',
+ 'content': 'http://purl.org/rss/1.0/modules/content/',
+ 'rssfake': 'http://purl.org/rss/1.0/'}
+
def load(url):
- import urllib2
d = urllib2.urlopen(url).read()
return parse(d)
-def tagNS(tag, nsmap=NSMAP):
+
+def tag_NS(tag, nsmap=NSMAP):
match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
if match:
match = match.groups()
@@ -55,15 +58,19 @@ def tagNS(tag, nsmap=NSMAP):
return "{%s}%s" % (nsmap[match[0]], match[1].lower())
return tag
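+# e.g. tag_NS('atom:title') -> '{http://www.w3.org/2005/Atom}title'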
-def innerHTML(xml):
+
+def inner_html(xml):
return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
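+# e.g. inner_html() of <p>foo<b>bar</b></p> gives 'foo<b>bar</b>'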
-def cleanNode(xml):
+
+def clean_node(xml):
[xml.remove(child) for child in xml.iterchildren()]
+
class FeedException(Exception):
pass
+
def parse(data):
# encoding
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
@@ -80,15 +87,16 @@ def parse(data):
# rss
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
if len(match):
- mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
- 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
+ m_table = {'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
+ 'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom}
match = match[0]
- tag = tagNS(match.tag)
- if tag in mtable:
- return mtable[tag](doc, tag)
+ tag = tag_NS(match.tag)
+ if tag in m_table:
+ return m_table[tag](doc, tag)
raise FeedException('unknown feed type')
+
class FeedBase(object):
"""
Base for xml-related classes, which provides simple wrappers around xpath
@@ -135,7 +143,7 @@ class FeedBase(object):
else:
return ""
- def xgetCreate(self, table):
+ def xget_create(self, table):
""" Returns an element, and creates it when not present """
value = table[self.tag]
if not isinstance(value, tuple):
@@ -145,7 +153,7 @@ class FeedBase(object):
if match is not None:
return match
else:
- element = etree.Element(tagNS(new))
+ element = etree.Element(tag_NS(new))
self.root.append(element)
return element
@@ -158,58 +166,62 @@ class FeedBase(object):
""" Returns string using lxml. Arguments passed to tostring """
return etree.tostring(self.xml, pretty_print=True, **k)
+
class FeedDescriptor(object):
"""
-	Descriptor which gives off elements based on "self.getName" and
-	"self.setName" as getter/setters. Looks far better, and avoids duplicates
+	Descriptor which gives off elements based on "self.get_name" and
+	"self.set_name" as getter/setters. Looks far better, and avoids duplicates
"""
+
def __init__(self, name):
self.name = name
- self.nname = name[0].upper() + name[1:]
def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
+ getter = getattr(instance, 'get_%s' % self.name)
return getter()
def __set__(self, instance, value):
- setter = getattr(instance, 'set%s' % self.nname)
+ setter = getattr(instance, 'set_%s' % self.name)
return setter(value)
def __delete__(self, instance):
- deleter = getattr(instance, 'del%s' % self.nname)
+ deleter = getattr(instance, 'del_%s' % self.name)
return deleter()
+
class FeedTime(FeedDescriptor):
def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
+ getter = getattr(instance, 'get_%s' % self.name)
raw = getter()
try:
- time = parseTime(raw)
+ time = parse_time(raw)
return time
except ValueError:
return None
def __set__(self, instance, value):
try:
- time = parseTime(value)
+ time = parse_time(value)
raw = time.strftime(instance.timeFormat)
- setter = getattr(instance, 'set%s' % self.nname)
+ setter = getattr(instance, 'set_%s' % self.name)
return setter(raw)
except ValueError:
pass
+
class FeedBool(FeedDescriptor):
def __get__(self, instance, owner):
- getter = getattr(instance, 'get%s' % self.nname)
+ getter = getattr(instance, 'get_%s' % self.name)
raw = getter()
return (raw or '').lower() != 'false'
def __set__(self, instance, value):
raw = 'true' if value else 'false'
- setter = getattr(instance, 'set%s' % self.nname)
+ setter = getattr(instance, 'set_%s' % self.name)
return setter(raw)
-def parseTime(value):
+
+def parse_time(value):
if isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.tzutc())
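+			# all-digit strings are Unix timestamps (UTC); other formats are
+			# assumed to fall through to dateutil.parser below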
@@ -222,6 +234,7 @@ def parseTime(value):
else:
return False
+
class FeedList(object):
"""
Class to map a list of xml elements against a list of matching objects,
@@ -231,14 +244,15 @@ class FeedList(object):
Comes with its very own descriptor.
"""
- def __init__(self, parent, getter, tag, childClass):
+
+ def __init__(self, parent, getter, tag, child_class):
self.parent = parent
self.getter = getter
- self.childClass = childClass
+ self.childClass = child_class
self.tag = tag
- self._children = {} # id(xml) => FeedItem
+ self._children = {} # id(xml) => FeedItem
- def getChildren(self):
+ def get_children(self):
children = self.getter()
out = []
for child in children:
@@ -269,7 +283,7 @@ class FeedList(object):
return new
def __getitem__(self, key):
- return self.getChildren()[key]
+ return self.get_children()[key]
def __delitem__(self, key):
child = self.getter()[key]
@@ -282,22 +296,24 @@ class FeedList(object):
def __len__(self):
return len(self.getter())
+
class FeedListDescriptor(object):
"""
Descriptor for FeedList
"""
+
def __init__(self, name):
self.name = name
- self.items = {} # id(instance) => FeedList
+ self.items = {} # id(instance) => FeedList
def __get__(self, instance, owner=None):
key = id(instance)
if key in self.items:
return self.items[key]
else:
- getter = getattr(instance, 'get%s' % self.name.title())
- className = globals()[getattr(instance, '%sClass' % self.name)]
- self.items[key] = FeedList(instance, getter, instance.tag, className)
+ getter = getattr(instance, 'get_%s' % self.name)
+ class_name = globals()[getattr(instance, '%sClass' % self.name)]
+ self.items[key] = FeedList(instance, getter, instance.tag, class_name)
return self.items[key]
def __set__(self, instance, value):
@@ -305,6 +321,7 @@ class FeedListDescriptor(object):
[x.remove() for x in [x for x in f.items]]
[feedlist.append(x) for x in value]
+
class FeedParser(FeedBase):
itemsClass = 'FeedItem'
mimetype = 'application/xml'
@@ -318,27 +335,25 @@ class FeedParser(FeedBase):
self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
self.tag = tag
- def getTitle(self):
+ def get_title(self):
return ""
- def setTitle(self, value):
+ def set_title(self, value):
pass
- def delTitle(self):
+ def del_title(self):
self.title = ""
-
- def getDesc(self):
+ def get_desc(self):
pass
- def setDesc(self, value):
+ def set_desc(self, value):
pass
- def delDesc(self):
+ def del_desc(self):
self.desc = ""
-
- def getItems(self):
+ def get_items(self):
return []
title = FeedDescriptor('title')
@@ -355,7 +370,8 @@ class FeedParser(FeedBase):
out = StringIO()
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
- row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
+ row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if
+ isinstance(x[1], basestring)]
c.writerow(row)
out.seek(0)
return out.read()
@@ -367,7 +383,8 @@ class FeedParser(FeedBase):
loader = DictLoader({'reader': open('reader.html.template').read()})
engine = Engine(loader=loader, extensions=[CoreExtension()])
template = engine.get_template('reader')
- return template.render({'feed':self}).encode('utf-8')
+ return template.render({'feed': self}).encode('utf-8')
+
class FeedParserRSS(FeedParser):
"""
@@ -375,161 +392,153 @@ class FeedParserRSS(FeedParser):
"""
itemsClass = 'FeedItemRSS'
mimetype = 'application/rss+xml'
- base = { 'rdf:rdf': '',
- 'channel': ''}
+ base = {
+ 'rdf:rdf': '',
+ 'channel': ''}
- def getTitle(self):
+ def get_title(self):
return self.xval('rssfake:title|title')
- def setTitle(self, value):
+ def set_title(self, value):
if not value:
return self.xdel('rssfake:title|title')
- table = { 'rdf:rdf': 'rssfake:title',
- 'channel': 'title'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:title',
+ 'channel': 'title'}
+ element = self.xget_create(table)
element.text = value
-
- def getDesc(self):
+ def get_desc(self):
return self.xval('rssfake:description|description')
- def setDesc(self, value):
+ def set_desc(self, value):
if not value:
return self.xdel('rssfake:description|description')
- table = { 'rdf:rdf': 'rssfake:description',
- 'channel': 'description'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:description',
+ 'channel': 'description'}
+ element = self.xget_create(table)
element.text = value
-
- def getItems(self):
+ def get_items(self):
return self.xpath('rssfake:item|item')
+
class FeedParserAtom(FeedParser):
"""
Atom Parser
"""
itemsClass = 'FeedItemAtom'
mimetype = 'application/atom+xml'
- base = { 'atom:feed': '',
- 'atom03:feed': ''}
+ base = {'atom:feed': '',
+ 'atom03:feed': ''}
- def getTitle(self):
+ def get_title(self):
return self.xval('atom:title|atom03:title')
- def setTitle(self, value):
+ def set_title(self, value):
if not value:
return self.xval('atom:title|atom03:title')
- table = { 'atom:feed': 'atom:title',
- 'atom03:feed': 'atom03:title'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:title',
+ 'atom03:feed': 'atom03:title'}
+ element = self.xget_create(table)
element.text = value
-
- def getDesc(self):
+ def get_desc(self):
return self.xval('atom:subtitle|atom03:subtitle')
- def setDesc(self, value):
+ def set_desc(self, value):
if not value:
return self.xdel('atom:subtitle|atom03:subtitle')
- table = { 'atom:feed': 'atom:subtitle',
- 'atom03:feed': 'atom03:subtitle'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:subtitle',
+ 'atom03:feed': 'atom03:subtitle'}
+ element = self.xget_create(table)
element.text = value
-
- def getItems(self):
+ def get_items(self):
return self.xpath('atom:entry|atom03:entry')
+
class FeedItem(FeedBase):
timeFormat = ''
- dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
+ dic = ('title', 'link', 'desc', 'content', 'id', 'is_permalink', 'time', 'updated')
def __init__(self, xml=None, tag='atom:feed'):
if xml is None:
- xml = Element(tagNS(self.base[tag]))
+ xml = Element(tag_NS(self.base[tag]))
self.root = self.xml = xml
self.tag = tag
- def getTitle(self):
+ def get_title(self):
return ""
- def setTitle(self):
+ def set_title(self, value):
pass
- def delTitle(self):
+ def del_title(self):
self.title = ""
-
- def getLink(self):
+ def get_link(self):
return ""
- def setLink(self, value):
+ def set_link(self, value):
pass
- def delLink(self):
+ def del_link(self):
self.link = ""
-
- def getIsPermaLink(self):
+ def get_is_permalink(self):
return ""
- def setIsPermaLink(self, value):
+ def set_is_permalink(self, value):
pass
-
- def getDesc(self):
+ def get_desc(self):
return ""
- def setDesc(self, value):
+ def set_desc(self, value):
pass
- def delDesc(self):
+ def del_desc(self):
self.desc = ""
-
- def getContent(self):
+ def get_content(self):
return ""
- def setContent(self, value):
+ def set_content(self, value):
pass
- def delContent(self):
+ def del_content(self):
self.content = ""
-
- def getId(self):
+ def get_id(self):
return ""
- def setId(self, value):
+ def set_id(self, value):
pass
- def delId(self):
+ def del_id(self):
self.id = ""
-
- def getTime(self):
+ def get_time(self):
return None
- def setTime(self, value):
+ def set_time(self, value):
pass
-	def delTime(self):
+	def del_time(self):
self.time = None
-
- def getUpdated(self):
+ def get_updated(self):
return None
- def setUpdated(self, value):
+ def set_updated(self, value):
pass
- def delUpdated(self):
+ def del_updated(self):
self.updated = None
title = FeedDescriptor('title')
@@ -537,11 +546,11 @@ class FeedItem(FeedBase):
description = desc = FeedDescriptor('desc')
content = FeedDescriptor('content')
id = FeedDescriptor('id')
- isPermaLink = FeedBool('isPermaLink')
+ is_permalink = FeedBool('is_permalink')
time = FeedTime('time')
updated = FeedTime('updated')
- def pushContent(self, value):
+ def push_content(self, value):
if not self.desc and self.content:
self.desc = self.content
@@ -550,201 +559,192 @@ class FeedItem(FeedBase):
def remove(self):
self.xml.getparent().remove(self.xml)
+
class FeedItemRSS(FeedItem):
timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
- base = { 'rdf:rdf': 'rssfake:item',
- 'channel': 'item'}
+ base = {'rdf:rdf': 'rssfake:item',
+ 'channel': 'item'}
- def getTitle(self):
+ def get_title(self):
return self.xval('rssfake:title|title')
- def setTitle(self, value):
+ def set_title(self, value):
if not value:
return self.xdel('rssfake:title|title')
- table = { 'rdf:rdf': 'rssfake:title',
- 'channel': 'title'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:title',
+ 'channel': 'title'}
+ element = self.xget_create(table)
element.text = value
-
- def getLink(self):
+ def get_link(self):
return self.xval('rssfake:link|link')
- def setLink(self, value):
- if self.isPermaLink and self.id == self.link != value:
- self.isPermaLink = False
+ def set_link(self, value):
+ if self.is_permalink and self.id == self.link != value:
+ self.is_permalink = False
- table = { 'rdf:rdf': 'rssfake:link',
- 'channel': 'link'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:link',
+ 'channel': 'link'}
+ element = self.xget_create(table)
element.text = value
- def getDesc(self):
+ def get_desc(self):
return self.xval('rssfake:description|description')
- def setDesc(self, value):
+ def set_desc(self, value):
if not value:
return self.xdel('rssfake:description|description')
- table = { 'rdf:rdf': 'rssfake:description',
- 'channel': 'description'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:description',
+ 'channel': 'description'}
+ element = self.xget_create(table)
element.text = value
-
- def getContent(self):
+ def get_content(self):
return self.xval('content:encoded')
- def setContent(self, value):
+ def set_content(self, value):
if not value:
return self.xdel('content:encoded')
- table = { 'rdf:rdf': 'content:encoded',
- 'channel': 'content:encoded'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'content:encoded',
+ 'channel': 'content:encoded'}
+ element = self.xget_create(table)
element.text = value
-
- def getId(self):
+ def get_id(self):
return self.xval('rssfake:guid|guid')
- def setId(self, value):
+ def set_id(self, value):
if not value:
return self.xdel('rssfake:guid|guid')
- table = { 'rdf:rdf': 'rssfake:guid',
- 'channel': 'guid'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:guid',
+ 'channel': 'guid'}
+ element = self.xget_create(table)
element.text = value
-
- def getIsPermaLink(self):
+ def get_is_permalink(self):
return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
- def setIsPermaLink(self, value):
- table = { 'rdf:rdf': 'rssfake:guid',
- 'channel': 'guid'}
- element = self.xgetCreate(table)
+ def set_is_permalink(self, value):
+ table = {'rdf:rdf': 'rssfake:guid',
+ 'channel': 'guid'}
+ element = self.xget_create(table)
element.attrib['isPermaLink'] = value
-
- def getTime(self):
+ def get_time(self):
return self.xval('rssfake:pubDate|pubDate')
- def setTime(self, value):
+ def set_time(self, value):
if not value:
return self.xdel('rssfake:pubDate|pubDate')
- table = { 'rdf:rdf': 'rssfake:pubDate',
- 'channel': 'pubDate'}
- element = self.xgetCreate(table)
+ table = {'rdf:rdf': 'rssfake:pubDate',
+ 'channel': 'pubDate'}
+ element = self.xget_create(table)
element.text = value
+
class FeedItemAtom(FeedItem):
timeFormat = '%Y-%m-%dT%H:%M:%SZ'
- base = { 'atom:feed': 'atom:entry',
- 'atom03:feed': 'atom03:entry'}
+ base = {'atom:feed': 'atom:entry',
+ 'atom03:feed': 'atom03:entry'}
- def getTitle(self):
+ def get_title(self):
return self.xval('atom:title|atom03:title')
- def setTitle(self, value):
+ def set_title(self, value):
if not value:
return self.xdel('atom:title|atom03:title')
- table = { 'atom:feed': 'atom:title',
- 'atom03:feed': 'atom03:title'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:title',
+ 'atom03:feed': 'atom03:title'}
+ element = self.xget_create(table)
element.text = value
-
- def getLink(self):
+ def get_link(self):
return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
- def setLink(self, value):
- table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
- 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
- element = self.xgetCreate(table)
+ def set_link(self, value):
+ table = {'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
+ 'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
+ element = self.xget_create(table)
element.attrib['href'] = value
-
- def getDesc(self):
+ def get_desc(self):
# default "type" is "text"
element = self.xget('atom:summary|atom03:summary')
if element is not None:
- return innerHTML(element)
+ return inner_html(element)
else:
return ""
- def setDesc(self, value):
+ def set_desc(self, value):
if not value:
return self.xdel('atom:summary|atom03:summary')
- table = { 'atom:feed': 'atom:summary',
- 'atom03:feed': 'atom03:summary'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:summary',
+ 'atom03:feed': 'atom03:summary'}
+ element = self.xget_create(table)
if element.attrib.get('type', '') == 'xhtml':
- cleanNode(element)
+ clean_node(element)
element.attrib['type'] = 'html'
element.text = value
-
- def getContent(self):
+ def get_content(self):
element = self.xget('atom:content|atom03:content')
if element is not None:
- return innerHTML(element)
+ return inner_html(element)
else:
return ""
- def setContent(self, value):
+ def set_content(self, value):
if not value:
return self.xdel('atom:content|atom03:content')
- table = { 'atom:feed': 'atom:content',
- 'atom03:feed': 'atom03:content'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:content',
+ 'atom03:feed': 'atom03:content'}
+ element = self.xget_create(table)
if element.attrib.get('type', '') == 'xhtml':
- cleanNode(element)
+ clean_node(element)
element.attrib['type'] = 'html'
element.text = value
-
- def getId(self):
+ def get_id(self):
return self.xval('atom:id|atom03:id')
- def setId(self, value):
+ def set_id(self, value):
if not value:
return self.xdel('atom:id|atom03:id')
- table = { 'atom:feed': 'atom:id',
- 'atom03:feed': 'atom03:id'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:id',
+ 'atom03:feed': 'atom03:id'}
+ element = self.xget_create(table)
element.text = value
-
- def getTime(self):
+ def get_time(self):
return self.xval('atom:published|atom03:published')
- def setTime(self, value):
+ def set_time(self, value):
if not value:
return self.xdel('atom:published|atom03:published')
- table = { 'atom:feed': 'atom:published',
- 'atom03:feed': 'atom03:published'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:published',
+ 'atom03:feed': 'atom03:published'}
+ element = self.xget_create(table)
element.text = value
-
- def getUpdated(self):
+ def get_updated(self):
return self.xval('atom:updated|atom03:updated')
- def setUpdated(self, value):
+ def set_updated(self, value):
if not value:
return self.xdel('atom:updated|atom03:updated')
- table = { 'atom:feed': 'atom:updated',
- 'atom03:feed': 'atom03:updated'}
- element = self.xgetCreate(table)
+ table = {'atom:feed': 'atom:updated',
+ 'atom03:feed': 'atom03:updated'}
+ element = self.xget_create(table)
element.text = value
diff --git a/morss/morss.py b/morss/morss.py
index 2b9884c..0299fec 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -31,21 +31,22 @@ from StringIO import StringIO
from readability import readability
from html2text import HTML2Text
-LIM_ITEM = 100 # deletes what's beyond
-LIM_TIME = 7 # deletes what's after
-MAX_ITEM = 50 # cache-only beyond
-MAX_TIME = 7 # cache-only after (in sec)
-DELAY = 10*60 # xml cache & ETag cache (in sec)
-TIMEOUT = 2 # http timeout (in sec)
-THREADS = 10 # number of threads (1 for single-threaded)
+LIM_ITEM = 100 # deletes what's beyond
+LIM_TIME = 7 # deletes what's after
+MAX_ITEM = 50 # cache-only beyond
+MAX_TIME = 7 # cache-only after (in sec)
+DELAY = 10 * 60 # xml cache & ETag cache (in sec)
+TIMEOUT = 2 # http timeout (in sec)
+THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
- 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+MIMETYPE = {
+ 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+ 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
FBAPPID = ""
FBSECRET = ""
@@ -57,11 +58,14 @@ if 'SCRIPT_NAME' in os.environ:
httplib.HTTPConnection.debuglevel = 1
import cgitb
+
cgitb.enable()
+
class MorssException(Exception):
pass
+
def log(txt, force=False):
if DEBUG or force:
if 'REQUEST_URI' in os.environ:
@@ -70,17 +74,18 @@ def log(txt, force=False):
print repr(txt)
-def lenHTML(txt):
+def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
-def countWord(txt):
+
+def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
- else:
- return 0
+ return 0
+
class Options:
def __init__(self, options=None):
@@ -95,9 +100,11 @@ class Options:
def __contains__(self, key):
return key in self.options
+
class Cache:
""" Light, error-prone caching system. """
- def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
+
+ def __init__(self, folder=None, key='cache', lifespan=10 * 24 * 3600):
self._key = key
self._dir = folder
self._lifespan = lifespan
@@ -108,7 +115,7 @@ class Cache:
self._hash = "NO CACHE"
return
- maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
+ maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
self._hash = urllib.quote_plus(self._key)[:maxsize]
self._file = self._dir + '/' + self._hash
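+		# one file per key (usually the feed URL), percent-encoded and
+		# truncated to the filesystem's name-length limit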
@@ -178,13 +185,16 @@ class Cache:
else:
return self
+
class SimpleDownload(urllib2.HTTPCookieProcessor):
"""
Custom urllib2 handler to download a page, using etag/last-modified headers,
to save bandwidth. The given headers are added back into the header on error
304 for easier use.
"""
- def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
+
+ def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
+ accept=None, strict=False):
urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
self.cache = cache
self.etag = etag
@@ -214,7 +224,7 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
out = {}
rank = 1.1
for group in self.accept:
- rank = rank - 0.1
+ rank -= 0.1
if isinstance(group, basestring):
if group in MIMETYPE:
@@ -228,9 +238,9 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
out[mime] = rank
if not self.strict:
- out['*/*'] = rank-0.1
+ out['*/*'] = rank - 0.1
- string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+ string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
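+		# e.g. accept=('xml', 'html') yields something like
+		# 'application/rss+xml,text/html;q=0.9,*/*;q=0.8' (dict order varies)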
req.add_unredirected_header('Accept', string)
return req
@@ -259,20 +269,20 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
if resp.info().type in MIMETYPE['html']:
			match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
- newurl = match.groups()[0]
- log('redirect: %s' % newurl)
+ new_url = match.groups()[0]
+ log('redirect: %s' % new_url)
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ('content-length', 'content-type'))
- new = urllib2.Request(newurl,
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
+ new_headers = dict((k, v) for k, v in req.headers.items()
+ if k.lower() not in ('content-length', 'content-type'))
+ new = urllib2.Request(new_url,
+ headers=new_headers,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True)
return self.parent.open(new, timeout=req.timeout)
# encoding
- enc = detEncoding(data, resp)
+ enc = detect_encoding(data, resp)
if enc:
data = data.decode(enc, 'replace')
@@ -290,7 +300,8 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
https_response = http_response
https_request = http_request
-def detEncoding(data, con=None):
+
+def detect_encoding(data, con=None):
if con is not None and con.headers.getparam('charset'):
log('header')
return con.headers.getparam('charset')
@@ -306,6 +317,7 @@ def detEncoding(data, con=None):
return None
+
def Fix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
@@ -358,7 +370,8 @@ def Fix(item, feedurl='/'):
match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
if match:
url = match.groups()[0].split('0')
- t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
+ t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'H': ',', 'I': '_', 'L': 'http://', 'S': 'www.',
+ 'N': '.com', 'O': '.co.uk'}
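+		# feedsportal story01.htm links: the first letter of each '0'-separated
+		# chunk decodes through this table ('L' -> 'http://', 'S' -> 'www.', ...)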
item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
log(item.link)
@@ -371,6 +384,7 @@ def Fix(item, feedurl='/'):
return item
+
def Fill(item, cache, feedurl='/', fast=False):
""" Returns True when it has done its best """
@@ -381,8 +395,8 @@ def Fill(item, cache, feedurl='/', fast=False):
log(item.link)
# content already provided?
- count_content = countWord(item.content)
- count_desc = countWord(item.desc)
+ count_content = count_words(item.content)
+ count_desc = count_words(item.desc)
if max(count_content, count_desc) > 500:
if count_desc > count_content:
@@ -392,7 +406,7 @@ def Fill(item, cache, feedurl='/', fast=False):
log('long enough')
return True
- if count_content > 5*count_desc > 0 and count_content > 50:
+ if count_content > 5 * count_desc > 0 and count_content > 50:
log('content bigger enough')
return True
@@ -432,7 +446,7 @@ def Fill(item, cache, feedurl='/', fast=False):
log('old error')
else:
log('cached')
- item.pushContent(cache.get(link))
+ item.push_content(cache.get(link))
return True
# super-fast mode
@@ -457,8 +471,8 @@ def Fill(item, cache, feedurl='/', fast=False):
out = readability.Document(data, url=con.url).summary(True)
- if countWord(out) > max(count_content, count_desc) > 0:
- item.pushContent(out)
+ if count_words(out) > max(count_content, count_desc) > 0:
+ item.push_content(out)
cache.set(link, out)
else:
log('not bigger enough')
@@ -467,7 +481,8 @@ def Fill(item, cache, feedurl='/', fast=False):
return True
-def Init(url, cachePath, options):
+
+def Init(url, cache_path, options):
# url clean up
log(url)
@@ -481,14 +496,15 @@ def Init(url, cachePath, options):
url = url.replace(' ', '%20')
# cache
- cache = Cache(cachePath, url)
+ cache = Cache(cache_path, url)
log(cache._hash)
return (url, cache)
+
def Fetch(url, cache, options):
# do some useful facebook work
- feedify.PreWorker(url, cache)
+ feedify.pre_worker(url, cache)
if 'redirect' in cache:
url = cache.get('redirect')
@@ -502,8 +518,9 @@ def Fetch(url, cache, options):
style = cache.get('style')
else:
try:
- opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
- con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT*2)
+ opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
+ accept=('xml', 'html'))
+ con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
xml = con.read()
except (IOError, httplib.HTTPException):
raise MorssException('Error downloading feed')
@@ -540,7 +557,8 @@ def Fetch(url, cache, options):
feed.build()
rss = feed.feed
elif style == 'html':
- match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
+ match = lxml.html.fromstring(xml).xpath(
+ "//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
if len(match):
link = urlparse.urljoin(url, match[0])
log('rss redirect: %s' % link)
@@ -552,13 +570,13 @@ def Fetch(url, cache, options):
log('random page')
raise MorssException('Link provided is not a valid feed')
-
cache.save()
return rss
+
def Gather(rss, url, cache, options):
size = len(rss.items)
- startTime = time.time()
+ start_time = time.time()
# custom settings
lim_item = LIM_ITEM
@@ -580,14 +598,14 @@ def Gather(rss, url, cache, options):
queue.task_done()
def worker(i, item):
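+		# LIM_* hard-drops items beyond the limits; MAX_* only degrades them to
+		# cache-only filling (see the constants at the top of the file)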
- if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
+ if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
return
item = Fix(item, url)
- if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
+ if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if Fill(item, cache, url, True) is False:
item.remove()
@@ -617,10 +635,11 @@ def Gather(rss, url, cache, options):
new.time = "5 Oct 2013 22:42"
log(len(rss.items))
- log(time.time() - startTime)
+ log(time.time() - start_time)
return rss
+
def After(rss, options):
for i, item in enumerate(rss.items):
@@ -662,8 +681,9 @@ def After(rss, options):
else:
return rss.tostring(xml_declaration=True, encoding='UTF-8')
+
def process(url, cache=None, options=None):
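+	# e.g. process('http://example.com/feed') runs the full pipeline and
+	# returns the rendered feed as a string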
- if options == None:
+ if not options:
options = []
options = Options(options)
@@ -673,6 +693,7 @@ def process(url, cache=None, options=None):
return After(rss, options)
+
def cgi_app(environ, start_response):
# get options
if 'REQUEST_URI' in environ:
@@ -696,7 +717,8 @@ def cgi_app(environ, start_response):
DEBUG = options.debug
if 'HTTP_IF_NONE_MATCH' in environ:
- if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+ if not options.force and not options.facebook and time.time() - int(
+ environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
headers['status'] = '304 Not Modified'
start_response(headers['status'], headers.items())
log(url)
@@ -722,30 +744,31 @@ def cgi_app(environ, start_response):
url, cache = Init(url, os.getcwd() + '/cache', options)
if options.facebook:
- doFacebook(url, environ, headers, options, cache)
+ do_facebook(url, environ, headers, options, cache)
start_response(headers['status'], headers.items())
return
# get the work done
- RSS = Fetch(url, cache, options)
+ rss = Fetch(url, cache, options)
if headers['content-type'] == 'text/xml':
- headers['content-type'] = RSS.mimetype
+ headers['content-type'] = rss.mimetype
start_response(headers['status'], headers.items())
- RSS = Gather(RSS, url, cache, options)
+ rss = Gather(rss, url, cache, options)
if not DEBUG and not options.silent:
- return After(RSS, options)
+ return After(rss, options)
log('done')
+
def cgi_wrapper(environ, start_response):
# simple http server for html and css
files = {
- '': 'text/html',
- 'index.html': 'text/html'}
+ '': 'text/html',
+ 'index.html': 'text/html'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
@@ -774,13 +797,12 @@ def cgi_wrapper(environ, start_response):
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
- headers = {}
- headers['status'] = '500 Oops'
- headers['content-type'] = 'text/plain'
+ headers = {'status': '500 Oops', 'content-type': 'text/plain'}
start_response(headers['status'], headers.items(), sys.exc_info())
log('ERROR: %s' % e.message, force=True)
return 'An error happened'
+
def cli_app():
options = Options(sys.argv[1:-1])
url = sys.argv[-1]
@@ -789,15 +811,16 @@ def cli_app():
DEBUG = options.debug
url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
- RSS = Fetch(url, cache, options)
- RSS = Gather(RSS, url, cache, options)
+ rss = Fetch(url, cache, options)
+ rss = Gather(rss, url, cache, options)
if not DEBUG and not options.silent:
- print After(RSS, options)
+ print After(rss, options)
log('done')
-def doFacebook(url, environ, headers, options, cache):
+
+def do_facebook(url, environ, headers, options, cache):
log('fb stuff')
query = urlparse.urlparse(url).query
@@ -805,11 +828,13 @@ def doFacebook(url, environ, headers, options, cache):
if 'code' in query:
# get real token from code
code = urlparse.parse_qs(query)['code'][0]
- eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
+ eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(
+ app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
# get long-lived access token
- eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
+ eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(
+ app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
ltoken = values['access_token'][0]
@@ -824,6 +849,7 @@ def doFacebook(url, environ, headers, options, cache):
log('fb done')
return
+
def main():
if 'REQUEST_URI' in os.environ:
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
diff --git a/setup.py b/setup.py
index 7715db9..8a75267 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,8 @@
from setuptools import setup, find_packages
package_name = 'morss'
-setup( name=package_name,
+setup(
+ name=package_name,
description='Get full-text RSS feeds',
author='pictuga',
author_email='contact at author name dot com',