Make most of the code pep8-compliant

Thanks a lot to github.com/SamuelMarks for his nice work
master
pictuga 2014-06-22 01:59:01 +02:00
parent da0a8feadd
commit f01efb7334
4 changed files with 348 additions and 316 deletions

View File: feedify.py

@ -1,23 +1,25 @@
#!/usr/bin/env python
import re
import json
import urlparse
import urllib2
from ConfigParser import ConfigParser
from fnmatch import fnmatch
import lxml.html
import feeds
import morss
import re
import urllib2
import lxml.html
import json
import urlparse
def toclass(query):
def to_class(query):
pattern = r'\[class=([^\]]+)\]'
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
return re.sub(pattern, repl, query)
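
For illustration, to_class rewrites the [class=...] shorthand used in the feedify.ini rule expressions into a whitespace-safe XPath class test. A minimal sketch, assuming feedify is importable:

import feedify
print feedify.to_class('//div[class=post]/a')
# -> //div[@class and contains(concat(" ", normalize-space(@class), " "), " post ")]/a
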
def getRule(link):
def get_rule(link):
config = ConfigParser()
config.read('feedify.ini')
@ -29,10 +31,12 @@ def getRule(link):
return values
return False
def supported(link):
return getRule(link) is not False
def formatString(string, getter, error=False):
def supported(link):
return get_rule(link) is not False
def format_string(string, getter, error=False):
out = ""
char = string[0]
@ -42,41 +46,42 @@ def formatString(string, getter, error=False):
match = follow.partition('"')
out = match[0]
if len(match) >= 2:
next = match[2]
next_match = match[2]
else:
next = None
next_match = None
elif char == '{':
match = follow.partition('}')
try:
test = formatString(match[0], getter, True)
except ValueError, KeyError:
test = format_string(match[0], getter, True)
except (ValueError, KeyError):
pass
else:
out = test
next = match[2]
next_match = match[2]
elif char == ' ':
next = follow
next_match = follow
elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
rawValue = getter(match[0])
if not isinstance(rawValue, basestring):
raw_value = getter(match[0])
if not isinstance(raw_value, basestring):
if match[1] is not None:
out = match[1].join(rawValue)
out = match[1].join(raw_value)
else:
out = ''.join(rawValue)
out = ''.join(raw_value)
if not out and error:
raise ValueError
next = match[2]
next_match = match[2]
else:
raise ValueError('bogus string')
if next is not None and len(next):
return out + formatString(next, getter, error)
if next_match is not None and len(next_match):
return out + format_string(next_match, getter, error)
else:
return out
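
To make the template syntax concrete: double-quoted runs are literals, {...} marks an optional group that is dropped when its inner lookup fails, and a bare path is resolved through the getter, joined with the separator given in <"..."> when the value is a list. A hedged sketch (the rule string and getter are invented, and it assumes the branch elided from this hunk assigns plain string values to out):

import feedify
values = {'author': 'Alice', 'tags': ['python', 'rss']}
getter = lambda path: values.get(path, '')
print feedify.format_string('"by "author{" ("tags<", ">")"}', getter)
# -> by Alice (python, rss)
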
def PreWorker(url, cache):
def pre_worker(url, cache):
if urlparse.urlparse(url).netloc == 'itunes.apple.com':
match = re.search('/id([0-9]+)(\?.*)?$', url)
if match:
@ -84,6 +89,7 @@ def PreWorker(url, cache):
redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=iid)
cache.set('redirect', redirect)
class Builder(object):
def __init__(self, link, data=None, cache=False):
self.link = link
@ -93,11 +99,11 @@ class Builder(object):
data = urllib2.urlopen(link).read()
self.data = data
self.rule = getRule(link)
self.rule = get_rule(link)
if self.rule['mode'] == 'xpath':
if not isinstance(self.data, unicode):
self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
self.data = self.data.decode(morss.detect_encoding(self.data), 'replace')
self.doc = lxml.html.fromstring(self.data)
elif self.rule['mode'] == 'json':
self.doc = json.loads(data)
@ -106,7 +112,7 @@ class Builder(object):
def raw(self, html, expr):
if self.rule['mode'] == 'xpath':
return html.xpath(toclass(expr))
return html.xpath(to_class(expr))
elif self.rule['mode'] == 'json':
a = [html]
@ -119,7 +125,7 @@ class Builder(object):
if kids is None:
pass
elif isinstance(kids, list):
[b.append(i) for i in kids]
b += kids
elif isinstance(kids, basestring):
b.append(kids.replace('\n', '<br/>'))
else:
@ -150,7 +156,7 @@ class Builder(object):
def string(self, html, expr):
getter = lambda x: self.strings(html, x)
return formatString(self.rule[expr], getter)
return format_string(self.rule[expr], getter)
def build(self):
if 'title' in self.rule:
@ -160,23 +166,22 @@ class Builder(object):
matches = self.raw(self.doc, self.rule['items'])
if matches and len(matches):
for item in matches:
feedItem = {}
feed_item = {}
if 'item_title' in self.rule:
feedItem['title'] = self.string(item, 'item_title')
feed_item['title'] = self.string(item, 'item_title')
if 'item_link' in self.rule:
url = self.string(item, 'item_link')
url = urlparse.urljoin(self.link, url)
feedItem['link'] = url
feed_item['link'] = url
if 'item_desc' in self.rule:
feedItem['desc'] = self.string(item, 'item_desc')
feed_item['desc'] = self.string(item, 'item_desc')
if 'item_content' in self.rule:
feedItem['content'] = self.string(item, 'item_content')
feed_item['content'] = self.string(item, 'item_content')
if 'item_time' in self.rule:
feedItem['updated'] = self.string(item, 'item_time')
feed_item['updated'] = self.string(item, 'item_time')
if 'item_id' in self.rule:
feedItem['id'] = self.string(item, 'item_id')
feedItem['isPermaLink'] = False
self.feed.items.append(feedItem)
feed_item['id'] = self.string(item, 'item_id')
feed_item['isPermaLink'] = False
self.feed.items.append(feed_item)

View File: feeds.py

@ -1,14 +1,16 @@
#!/usr/bin/env python
from lxml import etree
from datetime import datetime
import dateutil.parser
from dateutil import tz
import re
from StringIO import StringIO
import re
import json
import csv
import urllib2
from lxml import etree
from dateutil import tz
import dateutil.parser
try:
from wheezy.template.engine import Engine
@ -35,12 +37,13 @@ NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
'content': 'http://purl.org/rss/1.0/modules/content/',
'rssfake': 'http://purl.org/rss/1.0/'}
def load(url):
import urllib2
d = urllib2.urlopen(url).read()
return parse(d)
def tagNS(tag, nsmap=NSMAP):
def tag_NS(tag, nsmap=NSMAP):
match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
if match:
match = match.groups()
@ -55,15 +58,19 @@ def tagNS(tag, nsmap=NSMAP):
return "{%s}%s" % (nsmap[match[0]], match[1].lower())
return tag
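
tag_NS converts between lxml's {namespace}tag notation and the prefix:tag shorthand used in the xpath expressions here, via NSMAP. A sketch (the reverse direction relies on the branch elided from this hunk):

import feeds
print feeds.tag_NS('atom:title')
# -> {http://www.w3.org/2005/Atom}title
print feeds.tag_NS('{http://www.w3.org/2005/Atom}title')
# -> atom:title
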
def innerHTML(xml):
def inner_html(xml):
return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
def cleanNode(xml):
def clean_node(xml):
[xml.remove(child) for child in xml.iterchildren()]
class FeedException(Exception):
pass
def parse(data):
# encoding
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
@ -80,15 +87,16 @@ def parse(data):
# rss
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
if len(match):
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
m_table = {'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom}
match = match[0]
tag = tagNS(match.tag)
if tag in mtable:
return mtable[tag](doc, tag)
tag = tag_NS(match.tag)
if tag in m_table:
return m_table[tag](doc, tag)
raise FeedException('unknown feed type')
class FeedBase(object):
"""
Base for xml-related classes, which provides simple wrappers around xpath
@ -135,7 +143,7 @@ class FeedBase(object):
else:
return ""
def xgetCreate(self, table):
def xget_create(self, table):
""" Returns an element, and creates it when not present """
value = table[self.tag]
if not isinstance(value, tuple):
@ -145,7 +153,7 @@ class FeedBase(object):
if match is not None:
return match
else:
element = etree.Element(tagNS(new))
element = etree.Element(tag_NS(new))
self.root.append(element)
return element
@ -158,58 +166,62 @@ class FeedBase(object):
""" Returns string using lxml. Arguments passed to tostring """
return etree.tostring(self.xml, pretty_print=True, **k)
class FeedDescriptor(object):
"""
Descriptor which gives off elements based on "self.getName" and
"self.setName" as getter/setters. Looks far better, and avoids duplicates
"""
def __init__(self, name):
self.name = name
self.nname = name[0].upper() + name[1:]
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
getter = getattr(instance, 'get_%s' % self.name)
return getter()
def __set__(self, instance, value):
setter = getattr(instance, 'set%s' % self.nname)
setter = getattr(instance, 'set_%s' % self.name)
return setter(value)
def __delete__(self, instance):
deleter = getattr(instance, 'del%s' % self.nname)
deleter = getattr(instance, 'del_%s' % self.name)
return deleter()
class FeedTime(FeedDescriptor):
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
getter = getattr(instance, 'get_%s' % self.name)
raw = getter()
try:
time = parseTime(raw)
time = parse_time(raw)
return time
except ValueError:
return None
def __set__(self, instance, value):
try:
time = parseTime(value)
time = parse_time(value)
raw = time.strftime(instance.timeFormat)
setter = getattr(instance, 'set%s' % self.nname)
setter = getattr(instance, 'set_%s' % self.name)
return setter(raw)
except ValueError:
pass
class FeedBool(FeedDescriptor):
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
getter = getattr(instance, 'get_%s' % self.name)
raw = getter()
return (raw or '').lower() != 'false'
def __set__(self, instance, value):
raw = 'true' if value else 'false'
setter = getattr(instance, 'set%s' % self.nname)
setter = getattr(instance, 'set_%s' % self.name)
return setter(raw)
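
The net effect of the descriptor renames: attribute access is dispatched by name, so a read resolves to get_*(), an assignment to set_*() and a deletion to del_*(), with FeedTime and FeedBool layering parsing on top. A sketch, given any FeedItem instance item:

item.title                        # calls item.get_title()
item.title = 'Hello'              # calls item.set_title('Hello')
del item.title                    # calls item.del_title()
item.time = '5 Oct 2013 22:42'    # FeedTime runs parse_time() before set_time()
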
def parseTime(value):
def parse_time(value):
if isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.tzutc())
@ -222,6 +234,7 @@ def parseTime(value):
else:
return False
class FeedList(object):
"""
Class to map a list of xml elements against a list of matching objects,
@ -231,14 +244,15 @@ class FeedList(object):
Comes with its very own descriptor.
"""
def __init__(self, parent, getter, tag, childClass):
def __init__(self, parent, getter, tag, child_class):
self.parent = parent
self.getter = getter
self.childClass = childClass
self.childClass = child_class
self.tag = tag
self._children = {} # id(xml) => FeedItem
def getChildren(self):
def get_children(self):
children = self.getter()
out = []
for child in children:
@ -269,7 +283,7 @@ class FeedList(object):
return new
def __getitem__(self, key):
return self.getChildren()[key]
return self.get_children()[key]
def __delitem__(self, key):
child = self.getter()[key]
@ -282,10 +296,12 @@ class FeedList(object):
def __len__(self):
return len(self.getter())
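
In practice FeedList makes feed.items behave like a plain Python list backed by the XML tree; the Builder in feedify.py relies on this when appending the feed_item dicts it assembles. A hedged usage sketch ('feed.xml' is a placeholder, and it assumes append() copies dict keys onto the new child, as the Builder code implies):

import feeds
rss = feeds.parse(open('feed.xml').read())
rss.items.append({'title': 'Hello', 'link': 'http://example.org/post'})
print len(rss.items), rss.items[-1].title
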
class FeedListDescriptor(object):
"""
Descriptor for FeedList
"""
def __init__(self, name):
self.name = name
self.items = {} # id(instance) => FeedList
@ -295,9 +311,9 @@ class FeedListDescriptor(object):
if key in self.items:
return self.items[key]
else:
getter = getattr(instance, 'get%s' % self.name.title())
className = globals()[getattr(instance, '%sClass' % self.name)]
self.items[key] = FeedList(instance, getter, instance.tag, className)
getter = getattr(instance, 'get_%s' % self.name)
class_name = globals()[getattr(instance, '%sClass' % self.name)]
self.items[key] = FeedList(instance, getter, instance.tag, class_name)
return self.items[key]
def __set__(self, instance, value):
@ -305,6 +321,7 @@ class FeedListDescriptor(object):
[x.remove() for x in [x for x in f.items]]
[feedlist.append(x) for x in value]
class FeedParser(FeedBase):
itemsClass = 'FeedItem'
mimetype = 'application/xml'
@ -318,27 +335,25 @@ class FeedParser(FeedBase):
self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
self.tag = tag
def getTitle(self):
def get_title(self):
return ""
def setTitle(self, value):
def set_title(self, value):
pass
def delTitle(self):
def del_title(self):
self.title = ""
def getDesc(self):
def get_desc(self):
pass
def setDesc(self, value):
def set_desc(self, value):
pass
def delDesc(self):
def del_desc(self):
self.desc = ""
def getItems(self):
def get_items(self):
return []
title = FeedDescriptor('title')
@ -355,7 +370,8 @@ class FeedParser(FeedBase):
out = StringIO()
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if isinstance(x[1], basestring)]
row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if
isinstance(x[1], basestring)]
c.writerow(row)
out.seek(0)
return out.read()
@ -369,44 +385,45 @@ class FeedParser(FeedBase):
template = engine.get_template('reader')
return template.render({'feed': self}).encode('utf-8')
class FeedParserRSS(FeedParser):
"""
RSS Parser
"""
itemsClass = 'FeedItemRSS'
mimetype = 'application/rss+xml'
base = { 'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
base = {
'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
'channel': '<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel></channel></rss>'}
def getTitle(self):
def get_title(self):
return self.xval('rssfake:title|title')
def setTitle(self, value):
def set_title(self, value):
if not value:
return self.xdel('rssfake:title|title')
table = {'rdf:rdf': 'rssfake:title',
'channel': 'title'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getDesc(self):
def get_desc(self):
return self.xval('rssfake:description|description')
def setDesc(self, value):
def set_desc(self, value):
if not value:
return self.xdel('rssfake:description|description')
table = {'rdf:rdf': 'rssfake:description',
'channel': 'description'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getItems(self):
def get_items(self):
return self.xpath('rssfake:item|item')
class FeedParserAtom(FeedParser):
"""
Atom Parser
@ -416,120 +433,112 @@ class FeedParserAtom(FeedParser):
base = {'atom:feed': '<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>',
'atom03:feed': '<?xml version="1.0" encoding="utf-8"?><feed version="0.3" xmlns="http://purl.org/atom/ns#"></feed>'}
def getTitle(self):
def get_title(self):
return self.xval('atom:title|atom03:title')
def setTitle(self, value):
def set_title(self, value):
if not value:
return self.xval('atom:title|atom03:title')
table = {'atom:feed': 'atom:title',
'atom03:feed': 'atom03:title'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getDesc(self):
def get_desc(self):
return self.xval('atom:subtitle|atom03:subtitle')
def setDesc(self, value):
def set_desc(self, value):
if not value:
return self.xdel('atom:subtitle|atom03:subtitle')
table = {'atom:feed': 'atom:subtitle',
'atom03:feed': 'atom03:subtitle'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getItems(self):
def get_items(self):
return self.xpath('atom:entry|atom03:entry')
class FeedItem(FeedBase):
timeFormat = ''
dic = ('title', 'link', 'desc', 'content', 'id', 'isPermaLink', 'time', 'updated')
dic = ('title', 'link', 'desc', 'content', 'id', 'is_permalink', 'time', 'updated')
def __init__(self, xml=None, tag='atom:feed'):
if xml is None:
xml = Element(tagNS(self.base[tag]))
xml = Element(tag_NS(self.base[tag]))
self.root = self.xml = xml
self.tag = tag
def getTitle(self):
def get_title(self):
return ""
def setTitle(self):
def set_title(self, value):
pass
def delTitle(self):
def del_title(self):
self.title = ""
def getLink(self):
def get_link(self):
return ""
def setLink(self, value):
def set_link(self, value):
pass
def delLink(self):
def del_link(self):
self.link = ""
def getIsPermaLink(self):
def get_is_permalink(self):
return ""
def setIsPermaLink(self, value):
def set_is_permalink(self, value):
pass
def getDesc(self):
def get_desc(self):
return ""
def setDesc(self, value):
def set_desc(self, value):
pass
def delDesc(self):
def del_desc(self):
self.desc = ""
def getContent(self):
def get_content(self):
return ""
def setContent(self, value):
def set_content(self, value):
pass
def delContent(self):
def del_content(self):
self.content = ""
def getId(self):
def get_id(self):
return ""
def setId(self, value):
def set_id(self, value):
pass
def delId(self):
def del_id(self):
self.id = ""
def getTime(self):
def get_time(self):
return None
def setTime(self, value):
def set_time(self, value):
pass
def delTime(self):
self.time = None
def getUpdated(self):
def get_updated(self):
return None
def setUpdated(self, value):
def set_updated(self, value):
pass
def delUpdated(self):
def del_updated(self):
self.updated = None
title = FeedDescriptor('title')
@ -537,11 +546,11 @@ class FeedItem(FeedBase):
description = desc = FeedDescriptor('desc')
content = FeedDescriptor('content')
id = FeedDescriptor('id')
isPermaLink = FeedBool('isPermaLink')
is_permalink = FeedBool('is_permalink')
time = FeedTime('time')
updated = FeedTime('updated')
def pushContent(self, value):
def push_content(self, value):
if not self.desc and self.content:
self.desc = self.content
@ -550,201 +559,192 @@ class FeedItem(FeedBase):
def remove(self):
self.xml.getparent().remove(self.xml)
class FeedItemRSS(FeedItem):
timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
base = {'rdf:rdf': 'rssfake:item',
'channel': 'item'}
def getTitle(self):
def get_title(self):
return self.xval('rssfake:title|title')
def setTitle(self, value):
def set_title(self, value):
if not value:
return self.xdel('rssfake:title|title')
table = {'rdf:rdf': 'rssfake:title',
'channel': 'title'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getLink(self):
def get_link(self):
return self.xval('rssfake:link|link')
def setLink(self, value):
if self.isPermaLink and self.id == self.link != value:
self.isPermaLink = False
def set_link(self, value):
if self.is_permalink and self.id == self.link != value:
self.is_permalink = False
table = {'rdf:rdf': 'rssfake:link',
'channel': 'link'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getDesc(self):
def get_desc(self):
return self.xval('rssfake:description|description')
def setDesc(self, value):
def set_desc(self, value):
if not value:
return self.xdel('rssfake:description|description')
table = {'rdf:rdf': 'rssfake:description',
'channel': 'description'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getContent(self):
def get_content(self):
return self.xval('content:encoded')
def setContent(self, value):
def set_content(self, value):
if not value:
return self.xdel('content:encoded')
table = {'rdf:rdf': 'content:encoded',
'channel': 'content:encoded'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getId(self):
def get_id(self):
return self.xval('rssfake:guid|guid')
def setId(self, value):
def set_id(self, value):
if not value:
return self.xdel('rssfake:guid|guid')
table = {'rdf:rdf': 'rssfake:guid',
'channel': 'guid'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getIsPermaLink(self):
def get_is_permalink(self):
return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
def setIsPermaLink(self, value):
def set_is_permalink(self, value):
table = {'rdf:rdf': 'rssfake:guid',
'channel': 'guid'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.attrib['isPermaLink'] = value
def getTime(self):
def get_time(self):
return self.xval('rssfake:pubDate|pubDate')
def setTime(self, value):
def set_time(self, value):
if not value:
return self.xdel('rssfake:pubDate|pubDate')
table = {'rdf:rdf': 'rssfake:pubDate',
'channel': 'pubDate'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
class FeedItemAtom(FeedItem):
timeFormat = '%Y-%m-%dT%H:%M:%SZ'
base = {'atom:feed': 'atom:entry',
'atom03:feed': 'atom03:entry'}
def getTitle(self):
def get_title(self):
return self.xval('atom:title|atom03:title')
def setTitle(self, value):
def set_title(self, value):
if not value:
return self.xdel('atom:title|atom03:title')
table = {'atom:feed': 'atom:title',
'atom03:feed': 'atom03:title'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getLink(self):
def get_link(self):
return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
def setLink(self, value):
def set_link(self, value):
table = {'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.attrib['href'] = value
def getDesc(self):
def get_desc(self):
# default "type" is "text"
element = self.xget('atom:summary|atom03:summary')
if element is not None:
return innerHTML(element)
return inner_html(element)
else:
return ""
def setDesc(self, value):
def set_desc(self, value):
if not value:
return self.xdel('atom:summary|atom03:summary')
table = {'atom:feed': 'atom:summary',
'atom03:feed': 'atom03:summary'}
element = self.xgetCreate(table)
element = self.xget_create(table)
if element.attrib.get('type', '') == 'xhtml':
cleanNode(element)
clean_node(element)
element.attrib['type'] = 'html'
element.text = value
def getContent(self):
def get_content(self):
element = self.xget('atom:content|atom03:content')
if element is not None:
return innerHTML(element)
return inner_html(element)
else:
return ""
def setContent(self, value):
def set_content(self, value):
if not value:
return self.xdel('atom:content|atom03:content')
table = {'atom:feed': 'atom:content',
'atom03:feed': 'atom03:content'}
element = self.xgetCreate(table)
element = self.xget_create(table)
if element.attrib.get('type', '') == 'xhtml':
cleanNode(element)
clean_node(element)
element.attrib['type'] = 'html'
element.text = value
def getId(self):
def get_id(self):
return self.xval('atom:id|atom03:id')
def setId(self, value):
def set_id(self, value):
if not value:
return self.xdel('atom:id|atom03:id')
table = {'atom:feed': 'atom:id',
'atom03:feed': 'atom03:id'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getTime(self):
def get_time(self):
return self.xval('atom:published|atom03:published')
def setTime(self, value):
def set_time(self, value):
if not value:
return self.xdel('atom:published|atom03:published')
table = {'atom:feed': 'atom:published',
'atom03:feed': 'atom03:published'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value
def getUpdated(self):
def get_updated(self):
return self.xval('atom:updated|atom03:updated')
def setUpdated(self, value):
def set_updated(self, value):
if not value:
return self.xdel('atom:updated|atom03:updated')
table = {'atom:feed': 'atom:updated',
'atom03:feed': 'atom03:updated'}
element = self.xgetCreate(table)
element = self.xget_create(table)
element.text = value

View File: morss.py

@ -44,7 +44,8 @@ DEBUG = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
FBAPPID = "<insert yours>"
@ -57,11 +58,14 @@ if 'SCRIPT_NAME' in os.environ:
httplib.HTTPConnection.debuglevel = 1
import cgitb
cgitb.enable()
class MorssException(Exception):
pass
def log(txt, force=False):
if DEBUG or force:
if 'REQUEST_URI' in os.environ:
@ -70,18 +74,19 @@ def log(txt, force=False):
print repr(txt)
def lenHTML(txt):
def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
def countWord(txt):
def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
else:
return 0
class Options:
def __init__(self, options=None):
self.options = options or []
@ -95,8 +100,10 @@ class Options:
def __contains__(self, key):
return key in self.options
class Cache:
""" Light, error-prone caching system. """
def __init__(self, folder=None, key='cache', lifespan=10 * 24 * 3600):
self._key = key
self._dir = folder
@ -178,13 +185,16 @@ class Cache:
else:
return self
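
Cache stores per-URL key/value pairs under the given folder; Init() below creates one keyed on the feed URL, and the rest of the code talks to it through get()/set()/save(). A minimal sketch, assuming a writable cache directory:

import morss
cache = morss.Cache('/tmp/morss-cache', 'http://example.org/feed')
cache.set('etag', '"abc123"')
print cache.get('etag')
cache.save()  # persist to disk, as Fetch() does once the feed is built
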
class SimpleDownload(urllib2.HTTPCookieProcessor):
"""
Custom urllib2 handler to download a page, using etag/last-modified headers,
to save bandwidth. The given headers are added back into the header on error
304 for easier use.
"""
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
accept=None, strict=False):
urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
self.cache = cache
self.etag = etag
@ -214,7 +224,7 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
out = {}
rank = 1.1
for group in self.accept:
rank = rank - 0.1
rank -= 0.1
if isinstance(group, basestring):
if group in MIMETYPE:
@ -259,20 +269,20 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
if resp.info().type in MIMETYPE['html']:
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
newurl = match.groups()[0]
log('redirect: %s' % newurl)
new_url = match.groups()[0]
log('redirect: %s' % new_url)
newheaders = dict((k,v) for k,v in req.headers.items()
new_headers = dict((k, v) for k, v in req.headers.items()
if k.lower() not in ('content-length', 'content-type'))
new = urllib2.Request(newurl,
headers=newheaders,
new = urllib2.Request(new_url,
headers=new_headers,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
return self.parent.open(new, timeout=req.timeout)
# encoding
enc = detEncoding(data, resp)
enc = detect_encoding(data, resp)
if enc:
data = data.decode(enc, 'replace')
@ -290,7 +300,8 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
https_response = http_response
https_request = http_request
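
SimpleDownload slots into urllib2's handler chain; Fetch() below builds an opener from it to get the etag/last-modified handling and charset decoding described in the docstring. A sketch mirroring that call site:

import urllib2
import morss
handler = morss.SimpleDownload(accept=('xml', 'html'))
con = urllib2.build_opener(handler).open('http://example.org/feed', timeout=10)
data = con.read()  # already decoded via detect_encoding() when decode=True (the default)
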
def detEncoding(data, con=None):
def detect_encoding(data, con=None):
if con is not None and con.headers.getparam('charset'):
log('header')
return con.headers.getparam('charset')
@ -306,6 +317,7 @@ def detEncoding(data, con=None):
return None
def Fix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
@ -358,7 +370,8 @@ def Fix(item, feedurl='/'):
match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
if match:
url = match.groups()[0].split('0')
t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'H': ',', 'I': '_', 'L': 'http://', 'S': 'www.',
'N': '.com', 'O': '.co.uk'}
item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
log(item.link)
@ -371,6 +384,7 @@ def Fix(item, feedurl='/'):
return item
def Fill(item, cache, feedurl='/', fast=False):
""" Returns True when it has done its best """
@ -381,8 +395,8 @@ def Fill(item, cache, feedurl='/', fast=False):
log(item.link)
# content already provided?
count_content = countWord(item.content)
count_desc = countWord(item.desc)
count_content = count_words(item.content)
count_desc = count_words(item.desc)
if max(count_content, count_desc) > 500:
if count_desc > count_content:
@ -432,7 +446,7 @@ def Fill(item, cache, feedurl='/', fast=False):
log('old error')
else:
log('cached')
item.pushContent(cache.get(link))
item.push_content(cache.get(link))
return True
# super-fast mode
@ -457,8 +471,8 @@ def Fill(item, cache, feedurl='/', fast=False):
out = readability.Document(data, url=con.url).summary(True)
if countWord(out) > max(count_content, count_desc) > 0:
item.pushContent(out)
if count_words(out) > max(count_content, count_desc) > 0:
item.push_content(out)
cache.set(link, out)
else:
log('not bigger enough')
@ -467,7 +481,8 @@ def Fill(item, cache, feedurl='/', fast=False):
return True
def Init(url, cachePath, options):
def Init(url, cache_path, options):
# url clean up
log(url)
@ -481,14 +496,15 @@ def Init(url, cachePath, options):
url = url.replace(' ', '%20')
# cache
cache = Cache(cachePath, url)
cache = Cache(cache_path, url)
log(cache._hash)
return (url, cache)
def Fetch(url, cache, options):
# do some useful facebook work
feedify.PreWorker(url, cache)
feedify.pre_worker(url, cache)
if 'redirect' in cache:
url = cache.get('redirect')
@ -502,7 +518,8 @@ def Fetch(url, cache, options):
style = cache.get('style')
else:
try:
opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
accept=('xml', 'html'))
con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
xml = con.read()
except (IOError, httplib.HTTPException):
@ -540,7 +557,8 @@ def Fetch(url, cache, options):
feed.build()
rss = feed.feed
elif style == 'html':
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
match = lxml.html.fromstring(xml).xpath(
"//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
if len(match):
link = urlparse.urljoin(url, match[0])
log('rss redirect: %s' % link)
@ -552,13 +570,13 @@ def Fetch(url, cache, options):
log('random page')
raise MorssException('Link provided is not a valid feed')
cache.save()
return rss
def Gather(rss, url, cache, options):
size = len(rss.items)
startTime = time.time()
start_time = time.time()
# custom settings
lim_item = LIM_ITEM
@ -580,14 +598,14 @@ def Gather(rss, url, cache, options):
queue.task_done()
def worker(i, item):
if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
return
item = Fix(item, url)
if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if Fill(item, cache, url, True) is False:
item.remove()
@ -617,10 +635,11 @@ def Gather(rss, url, cache, options):
new.time = "5 Oct 2013 22:42"
log(len(rss.items))
log(time.time() - startTime)
log(time.time() - start_time)
return rss
def After(rss, options):
for i, item in enumerate(rss.items):
@ -662,8 +681,9 @@ def After(rss, options):
else:
return rss.tostring(xml_declaration=True, encoding='UTF-8')
def process(url, cache=None, options=None):
if options == None:
if not options:
options = []
options = Options(options)
@ -673,6 +693,7 @@ def process(url, cache=None, options=None):
return After(rss, options)
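
process() is the library entry point chaining Init, Fetch, Gather and After; options takes the same strings as the CLI flags ('proxy' below is one checked in worker() above). A hedged example:

import morss
print morss.process('http://example.org/feed')
print morss.process('http://example.org/feed', options=['proxy'])
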
def cgi_app(environ, start_response):
# get options
if 'REQUEST_URI' in environ:
@ -696,7 +717,8 @@ def cgi_app(environ, start_response):
DEBUG = options.debug
if 'HTTP_IF_NONE_MATCH' in environ:
if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
if not options.force and not options.facebook and time.time() - int(
environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
headers['status'] = '304 Not Modified'
start_response(headers['status'], headers.items())
log(url)
@ -722,25 +744,26 @@ def cgi_app(environ, start_response):
url, cache = Init(url, os.getcwd() + '/cache', options)
if options.facebook:
doFacebook(url, environ, headers, options, cache)
do_facebook(url, environ, headers, options, cache)
start_response(headers['status'], headers.items())
return
# get the work done
RSS = Fetch(url, cache, options)
rss = Fetch(url, cache, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = RSS.mimetype
headers['content-type'] = rss.mimetype
start_response(headers['status'], headers.items())
RSS = Gather(RSS, url, cache, options)
rss = Gather(rss, url, cache, options)
if not DEBUG and not options.silent:
return After(RSS, options)
return After(rss, options)
log('done')
def cgi_wrapper(environ, start_response):
# simple http server for html and css
files = {
@ -774,13 +797,12 @@ def cgi_wrapper(environ, start_response):
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {}
headers['status'] = '500 Oops'
headers['content-type'] = 'text/plain'
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
start_response(headers['status'], headers.items(), sys.exc_info())
log('ERROR: %s' % e.message, force=True)
return 'An error happened'
def cli_app():
options = Options(sys.argv[1:-1])
url = sys.argv[-1]
@ -789,15 +811,16 @@ def cli_app():
DEBUG = options.debug
url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
RSS = Fetch(url, cache, options)
RSS = Gather(RSS, url, cache, options)
rss = Fetch(url, cache, options)
rss = Gather(rss, url, cache, options)
if not DEBUG and not options.silent:
print After(RSS, options)
print After(rss, options)
log('done')
def doFacebook(url, environ, headers, options, cache):
def do_facebook(url, environ, headers, options, cache):
log('fb stuff')
query = urlparse.urlparse(url).query
@ -805,11 +828,13 @@ def doFacebook(url, environ, headers, options, cache):
if 'code' in query:
# get real token from code
code = urlparse.parse_qs(query)['code'][0]
eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(
app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri=environ['SCRIPT_URI'])
token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
# get long-lived access token
eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(
app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
ltoken = values['access_token'][0]
@ -824,6 +849,7 @@ def doFacebook(url, environ, headers, options, cache):
log('fb done')
return
def main():
if 'REQUEST_URI' in os.environ:
wsgiref.handlers.CGIHandler().run(cgi_wrapper)

View File: setup.py

@ -1,7 +1,8 @@
from setuptools import setup, find_packages
package_name = 'morss'
setup( name=package_name,
setup(
name=package_name,
description='Get full-text RSS feeds',
author='pictuga',
author_email='contact at author name dot com',