First attempt at decent folder structure

Use setup.py, subfolder for code.
2014-01-11 17:11:57 +01:00
parent 4038033336
commit 5feb061bf7
6 changed files with 15 additions and 0 deletions
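
The setup.py mentioned in the commit message is not expanded in this view. A minimal sketch consistent with the new morss/ package layout (version number and dependency list are assumptions, inferred from the imports below) might look like:

# setup.py sketch (illustrative only)
from setuptools import setup

setup(
    name='morss',
    version='0.1',
    packages=['morss'],                       # the new code subfolder
    package_data={'morss': ['feedify.ini']},  # ship the scraping rules alongside the code
    install_requires=['lxml', 'python-dateutil', 'chardet', 'readability-lxml'],
)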

morss/__init__.py (new file, empty)

morss/feedify.ini (new file, 70 lines)

@@ -0,0 +1,70 @@
[twitter]
mode=xpath
path=
http://twitter.com/*
https://twitter.com/*
http://www.twitter.com/*
https://www.twitter.com/*
title= //head/title/text()
items= //div[class=tweet]
item_title= ./@data-name " (@" ./@data-screen-name ")"
item_link= .//a[class=details]/@href
item_content= .//p[class=tweet-text]
item_time= .//span/@data-time
[google]
mode=xpath
path=
http://google.com/search?q=*
http://www.google.com/search?q=*
title= //head/title/text()
items= //li[class=g]
item_title= .//h3//text()
item_link= .//a/@href
item_content= .//span[class=st]
[ddg.gg]
mode=xpath
path=
http://duckduckgo.com/html/?q=*
https://duckduckgo.com/html/?q=*
title= //head/title/text()
items= //div[class=results_links][not(contains(@class,'sponsored'))]
item_title= .//a[class=large]//text()
item_link= .//a[class=large]/@href
item_content= .//div[class=snippet]
[facebook home]
mode=json
path=
https://graph.facebook.com/*/home*
https://graph.facebook.com/*/feed*
title= "Facebook"
items= data
item_title= from.name {" > " to.data.name<", ">}
item_link= actions.link[0]
item_content= message story{"<br/><br/><a href='" link "'><img src='" picture "' /></a>"}{"<blockquote><a href='" link "'>" name "</a><br/>" description "</blockquote>"}{"<br/><br/> @ " place.name}
item_time= created_time
item_id= id
[facebook message/post]
mode=json
path=
https://graph.facebook.com/*
https://graph.facebook.com/*
title= "Facebook"
items= comments.data
item_title= from.name
item_content= message
item_time= created_time
item_id= id
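
Each section above is one scraping rule: the glob patterns under path decide which URLs the rule applies to, mode selects XPath or JSON extraction, and the item_* keys describe how each feed entry is built. A rough sketch of the lookup performed by getRule() in feedify.py below:

# simplified version of feedify.getRule()
from ConfigParser import ConfigParser
from fnmatch import fnmatch

config = ConfigParser()
config.read('feedify.ini')
link = 'https://twitter.com/github'
for section in config.sections():
    rule = dict(config.items(section))
    patterns = rule['path'].split('\n')[1:]     # the indented continuation lines of "path="
    if any(fnmatch(link, pattern) for pattern in patterns):
        print section, rule['mode']             # -> twitter xpath
        break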

morss/feedify.py (new file, 243 lines)

@@ -0,0 +1,243 @@
#!/usr/bin/env python
from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import morss
import re
import urllib2
import lxml.html
import json
import urlparse
import time
def toclass(query):
pattern = r'\[class=([^\]]+)\]'
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
return re.sub(pattern, repl, query)
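# Illustration (not part of the module): toclass() expands the [class=...]
# shorthand used in feedify.ini into a real XPath class test, e.g.
#   >>> toclass('//div[class=tweet]')
#   '//div[@class and contains(concat(" ", normalize-space(@class), " "), " tweet ")]'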
def getRule(link):
config = ConfigParser()
config.read('feedify.ini')
for section in config.sections():
values = dict(config.items(section))
values['path'] = values['path'].split('\n')[1:]
for path in values['path']:
if fnmatch(link, path):
return values
return False
def supported(link):
return getRule(link) is not False
def formatString(string, getter, error=False):
out = ""
char = string[0]
follow = string[1:]
if char == '"':
match = follow.partition('"')
out = match[0]
if len(match) >= 2:
next = match[2]
else:
next = None
elif char == '{':
match = follow.partition('}')
next = match[2]
try:
out = formatString(match[0], getter, True)
except (ValueError, KeyError):
pass
elif char == ' ':
next = follow
elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
rawValue = getter(match[0])
if not isinstance(rawValue, basestring):
if match[1] is not None:
out = match[1].join(rawValue)
else:
out = ''.join(rawValue)
if not out and error:
raise ValueError
next = match[2]
else:
raise ValueError('bogus string')
if next is not None and len(next):
return out + formatString(next, getter, error)
else:
return out
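# Illustration (not part of the module): formatString() implements the small
# template language used in feedify.ini: bare keys are resolved through the
# getter, "..." pieces are literals, {...} groups disappear when their value is
# missing, and <"..."> sets the separator used to join list values.
#   >>> getter = lambda key: {'from.name': ['Alice'], 'to.data.name': ['Bob', 'Carol']}.get(key, [])
#   >>> formatString('from.name {" > " to.data.name<", ">}', getter)
#   'Alice > Bob, Carol'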
def PreWorker(url, cache):
if urlparse.urlparse(url).netloc == 'graph.facebook.com':
facebook = cache.new('facebook', persistent=True, dic=True)
token = urlparse.parse_qs(urlparse.urlparse(url).query)['access_token'][0]
if token not in facebook['token']:
# this token ain't known, look for info about it
eurl = "https://graph.facebook.com/debug_token?input_token={token}&access_token={app_token}".format(token=token, app_token=morss.FBAPPTOKEN)
data = json.loads(urllib2.urlopen(eurl).read())['data']
app_id = str(data['app_id'])
user_id = str(data['user_id'])
expires = int(data['expires_at'])
short = 'issued_at' not in data
facebook['token'][token] = {'user': user_id, 'expires': expires}
# do some voodoo to know whether we already have something better
good = False
if user_id not in facebook['user']:
# grab a new one anyway, new user
facebook['user'][user_id] = {'original': token}
good = True
else:
# maybe it's a better one
last = facebook['user'][user_id]['token']
last_expires = facebook['token'][last]['expires']
if expires > last_expires:
# new is better
good = True
if good and short and app_id == morss.FBAPPID:
eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=morss.FBAPPID, app_secret=morss.FBSECRET, short_lived_token=token)
values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
token = values['access_token'][0]
expires = int(time.time() + int(values['expires'][0]))
facebook['token'][token] = {'user': user_id, 'expires': expires}
facebook['user'][user_id]['token'] = token
# hey look for a newer token and use it
token = urlparse.parse_qs(urlparse.urlparse(url).query)['access_token'][0]
user_id = facebook['token'][token]['user']
last = facebook['user'][user_id]['token']
original = facebook['user'][user_id]['original']
nurl = url.replace(token, last)
ncache = url.replace(token, original)
cache.set('redirect', nurl)
cache.set('cache', ncache)
class Builder(object):
def __init__(self, link, data=None, cache=False):
self.link = link
self.cache = cache
if data is None:
data = urllib2.urlopen(link).read()
self.data = data
self.rule = getRule(link)
if self.rule['mode'] == 'xpath':
if not isinstance(self.data, unicode):
self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
self.doc = lxml.html.fromstring(self.data)
elif self.rule['mode'] == 'json':
self.doc = json.loads(data)
self.feed = feeds.FeedParserAtom()
def raw(self, html, expr):
if self.rule['mode'] == 'xpath':
return html.xpath(toclass(expr))
elif self.rule['mode'] == 'json':
a = [html]
b = []
for x in expr.strip(".").split("."):
match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
for elem in a:
if isinstance(elem, dict):
kids = elem.get(match[0])
if kids is None:
pass
elif isinstance(kids, list):
[b.append(i) for i in kids]
elif isinstance(kids, basestring):
b.append(kids.replace('\n', '<br/>'))
else:
b.append(kids)
if match[1] is None:
a = b
else:
if len(b)-1 >= int(match[1]):
a = [b[int(match[1])]]
else:
a = []
b = []
return a
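# Illustration (not part of the module): in json mode, raw() walks a dotted
# path through the decoded document, flattening lists on the way and honouring
# an optional [index]. Against a Graph-API-like payload:
#   >>> doc = {'actions': [{'link': 'http://a'}, {'link': 'http://b'}]}
#   >>> builder.raw(doc, 'actions.link[0]')        # builder being in json mode
#   ['http://a']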
def strings(self, html, expr):
if self.rule['mode'] == 'xpath':
out = []
for match in self.raw(html, expr):
if isinstance(match, basestring):
out.append(match)
elif isinstance(match, lxml.html.HtmlElement):
out.append(lxml.html.tostring(match))
return out
elif self.rule['mode'] == 'json':
return self.raw(html, expr)
def string(self, html, expr):
getter = lambda x: self.strings(html, x)
return formatString(self.rule[expr], getter)
def build(self):
if 'title' in self.rule:
self.feed.title = self.string(self.doc, 'title')
if 'items' in self.rule:
matches = self.raw(self.doc, self.rule['items'])
if matches and len(matches):
for item in matches:
feedItem = {}
if 'item_title' in self.rule:
feedItem['title'] = self.string(item, 'item_title')
if 'item_link' in self.rule:
url = self.string(item, 'item_link')
url = urlparse.urljoin(self.link, url)
feedItem['link'] = url
if 'item_desc' in self.rule:
feedItem['desc'] = self.string(item, 'item_desc')
if 'item_content' in self.rule:
feedItem['content'] = self.string(item, 'item_content')
if 'item_time' in self.rule:
feedItem['updated'] = self.string(item, 'item_time')
if 'item_id' in self.rule:
feedItem['id'] = self.string(item, 'item_id')
feedItem['isPermaLink'] = False
self.feed.items.append(feedItem)
if urlparse.urlparse(self.link).netloc == 'graph.facebook.com':
if self.cache:
facebook = self.cache.new('facebook', True)
token = urlparse.parse_qs(urlparse.urlparse(self.link).query)['access_token'][0]
expires = facebook['token'][token]['expires']
lifespan = expires - time.time()
if lifespan < 5*24*3600:
new = self.feed.items.append()
new.title = "APP AUTHORISATION RENEWAL NEEDED"
new.link = "https://www.facebook.com/dialog/oauth?client_id={app_id}&redirect_uri=http://test.morss.it/:facebook/".format(app_id=morss.FBAPPID)
new.desc = "Please renew your Facebook app token for this app to keep working for this feed.<br/><a href='{}'>Go!</a>".format(new.link)
new.time = expires

morss/feeds.py (new file, 711 lines)

@@ -0,0 +1,711 @@
#!/usr/bin/env python
from lxml import etree
from datetime import datetime
import dateutil.parser
from dateutil import tz
import re
Element = etree.Element
NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
'atom03': 'http://purl.org/atom/ns#',
'media': 'http://search.yahoo.com/mrss/',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'slash': 'http://purl.org/rss/1.0/modules/slash/',
'dc': 'http://purl.org/dc/elements/1.1/',
'content': 'http://purl.org/rss/1.0/modules/content/',
'rssfake': 'http://purl.org/rss/1.0/'}
def load(url):
import urllib2
d = urllib2.urlopen(url).read()
return parse(d)
def tagNS(tag, nsmap=NSMAP):
match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
if match:
match = match.groups()
for (key, url) in nsmap.iteritems():
if url == match[0]:
return "%s:%s" % (key, match[1].lower())
else:
match = re.search(r'^([^:]+):([^:]+)$', tag)
if match:
match = match.groups()
if match[0] in nsmap:
return "{%s}%s" % (nsmap[match[0]], match[1].lower())
return tag
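# Illustration (not part of the module): tagNS() translates between lxml's
# Clark notation and the prefixes declared in NSMAP, in both directions:
#   >>> tagNS('{http://www.w3.org/2005/Atom}title')
#   'atom:title'
#   >>> tagNS('atom:title')
#   '{http://www.w3.org/2005/Atom}title'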
def innerHTML(xml):
return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])
def cleanNode(xml):
[xml.remove(child) for child in xml.iterchildren()]
class FeedException(Exception):
pass
def parse(data):
# encoding
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
if match:
enc = match.groups()[0].lower()
if not isinstance(data, unicode):
data = data.decode(enc, 'ignore')
data = data.encode(enc)
# parse
parser = etree.XMLParser(recover=True)
doc = etree.fromstring(data, parser)
# rss
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
if len(match):
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
match = match[0]
tag = tagNS(match.tag)
if tag in mtable:
return mtable[tag](doc, tag)
raise FeedException('unknown feed type')
class FeedBase(object):
"""
Base for xml-related classes, which provides simple wrappers around xpath
selection and item creation
"""
def __getitem__(self, item):
return getattr(self, item)
def __setitem__(self, item, value):
setattr(self, item, value)
def __delitem__(self, item):
delattr(self, item)
def __iter__(self):
for element in self.dic:
value = self[element]
if isinstance(value, FeedList):
value = [dict(x) for x in value]
elif isinstance(value, datetime):
value = value.isoformat()
yield element, value
def xpath(self, path):
""" Test xpath rule on xml tree """
return self.root.xpath(path, namespaces=NSMAP)
def xget(self, path):
""" Returns the 1st xpath match """
match = self.xpath(path)
if len(match):
return match[0]
else:
return None
def xval(self, path):
""" Returns the .text of the 1st match """
match = self.xget(path)
if match is not None:
return match.text or ""
else:
return ""
def xgetCreate(self, table):
""" Returns an element, and creates it when not present """
value = table[self.tag]
if not isinstance(value, tuple):
value = (value, value)
new, xpath = value
match = self.xget(xpath)
if match is not None:
return match
else:
element = etree.Element(tagNS(new))
self.root.append(element)
return element
def xdel(self, path):
match = self.xget(path)
if match is not None:
return match.getparent().remove(match)
def tostring(self, **k):
""" Returns string using lxml. Arguments passed to tostring """
return etree.tostring(self.xml, pretty_print=True, **k)
class FeedDescriptor(object):
"""
Descriptor which dispatches attribute access to the matching "self.getName" /
"self.setName" getter and setter methods. Reads far better, and avoids duplication
"""
def __init__(self, name):
self.name = name
self.nname = name[0].upper() + name[1:]
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
return getter()
def __set__(self, instance, value):
setter = getattr(instance, 'set%s' % self.nname)
return setter(value)
def __delete__(self, instance):
deleter = getattr(instance, 'del%s' % self.nname)
return deleter()
class FeedTime(FeedDescriptor):
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
raw = getter()
try:
time = parseTime(raw)
return time
except ValueError:
return None
def __set__(self, instance, value):
try:
time = parseTime(value)
raw = time.strftime(instance.timeFormat)
setter = getattr(instance, 'set%s' % self.nname)
return setter(raw)
except ValueError:
pass
class FeedBool(FeedDescriptor):
def __get__(self, instance, owner):
getter = getattr(instance, 'get%s' % self.nname)
raw = getter()
return (raw or '').lower() != 'false'
def __set__(self, instance, value):
raw = 'true' if value else 'false'
setter = getattr(instance, 'set%s' % self.nname)
return setter(raw)
def parseTime(value):
if isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.tzutc())
else:
return dateutil.parser.parse(value, tzinfos=tz.tzutc)
elif isinstance(value, int):
return datetime.fromtimestamp(value, tz.tzutc())
elif isinstance(value, datetime):
return value
else:
return False
class FeedList(object):
"""
Class to map a list of xml elements against a list of matching objects,
while avoiding recreating the same matching object over and over again. To
avoid confusion, the list's elements are called "children" here rather than
"items", which is already in use in the RSS/Atom-related code.
Comes with its very own descriptor.
"""
def __init__(self, parent, getter, tag, childClass):
self.parent = parent
self.getter = getter
self.childClass = childClass
self.tag = tag
self._children = {} # id(xml) => FeedItem
def getChildren(self):
children = self.getter()
out = []
for child in children:
if id(child) in self._children:
out.append(self._children[id(child)])
else:
new = self.childClass(child, self.tag)
self._children[id(child)] = new
out.append(new)
return out
def append(self, cousin=None):
new = self.childClass(tag=self.tag)
self.parent.root.append(new.xml)
self._children[id(new.xml)] = new
if cousin is None:
return new
for key in self.childClass.__dict__:
if key[:3] == 'set':
attr = key[3:].lower()
if hasattr(cousin, attr):
setattr(new, attr, getattr(cousin, attr))
elif attr in cousin:
setattr(new, attr, cousin[attr])
return new
def __getitem__(self, key):
return self.getChildren()[key]
def __delitem__(self, key):
child = self.getter()[key]
if id(child) in self._children:
self._children[id(child)].remove()
del self._children[id(child)]
else:
child.getparent().remove(child)
def __len__(self):
return len(self.getter())
class FeedListDescriptor(object):
"""
Descriptor for FeedList
"""
def __init__(self, name):
self.name = name
self.items = {} # id(instance) => FeedList
def __get__(self, instance, owner=None):
key = id(instance)
if key in self.items:
return self.items[key]
else:
getter = getattr(instance, 'get%s' % self.name.title())
className = globals()[getattr(instance, '%sClass' % self.name)]
self.items[key] = FeedList(instance, getter, instance.tag, className)
return self.items[key]
def __set__(self, instance, value):
feedlist = self.__get__(instance)
[x.remove() for x in [x for x in feedlist]]
[feedlist.append(x) for x in value]
class FeedParser(FeedBase):
itemsClass = 'FeedItem'
mimetype = 'application/xml'
base = '<?xml?>'
dic = ('title', 'desc', 'items')
def __init__(self, xml=None, tag='atom:feed'):
if xml is None:
xml = etree.fromstring(self.base[tag])
self.xml = xml
self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
self.tag = tag
def getTitle(self):
return ""
def setTitle(self, value):
pass
def delTitle(self):
self.title = ""
def getDesc(self):
pass
def setDesc(self, value):
pass
def delDesc(self):
self.desc = ""
def getItems(self):
return []
title = FeedDescriptor('title')
description = desc = FeedDescriptor('desc')
items = FeedListDescriptor('items')
def tostring(self, **k):
return etree.tostring(self.xml.getroottree(), pretty_print=True, **k)
class FeedParserRSS(FeedParser):
"""
RSS Parser
"""
itemsClass = 'FeedItemRSS'
mimetype = 'application/rss+xml'
base = { 'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
'channel': '<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel></channel></rss>'}
def getTitle(self):
return self.xval('rssfake:title|title')
def setTitle(self, value):
if not value:
return self.xdel('rssfake:title|title')
table = { 'rdf:rdf': 'rssfake:title',
'channel': 'title'}
element = self.xgetCreate(table)
element.text = value
def getDesc(self):
return self.xval('rssfake:description|description')
def setDesc(self, value):
if not value:
return self.xdel('rssfake:description|description')
table = { 'rdf:rdf': 'rssfake:description',
'channel': 'description'}
element = self.xgetCreate(table)
element.text = value
def getItems(self):
return self.xpath('rssfake:item|item')
class FeedParserAtom(FeedParser):
"""
Atom Parser
"""
itemsClass = 'FeedItemAtom'
mimetype = 'application/atom+xml'
base = { 'atom:feed': '<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>',
'atom03:feed': '<?xml version="1.0" encoding="utf-8"?><feed version="0.3" xmlns="http://purl.org/atom/ns#"></feed>'}
def getTitle(self):
return self.xval('atom:title|atom03:title')
def setTitle(self, value):
if not value:
return self.xdel('atom:title|atom03:title')
table = { 'atom:feed': 'atom:title',
'atom03:feed': 'atom03:title'}
element = self.xgetCreate(table)
element.text = value
def getDesc(self):
return self.xval('atom:subtitle|atom03:subtitle')
def setDesc(self, value):
if not value:
return self.xdel('atom:subtitle|atom03:subtitle')
table = { 'atom:feed': 'atom:subtitle',
'atom03:feed': 'atom03:subtitle'}
element = self.xgetCreate(table)
element.text = value
def getItems(self):
return self.xpath('atom:entry|atom03:entry')
class FeedItem(FeedBase):
timeFormat = ''
dic = ('title', 'link', 'isPermaLink', 'desc', 'content', 'id', 'time', 'updated')
def __init__(self, xml=None, tag='atom:feed'):
if xml is None:
xml = Element(tagNS(self.base[tag]))
self.root = self.xml = xml
self.tag = tag
def getTitle(self):
return ""
def setTitle(self, value):
pass
def delTitle(self):
self.title = ""
def getLink(self):
return ""
def setLink(self, value):
pass
def delLink(self):
pass
def getIsPermaLink(self):
return ""
def setIsPermaLink(self, value):
pass
def getDesc(self):
return ""
def setDesc(self, value):
pass
def delDesc(self):
self.desc = ""
def getContent(self):
return ""
def setContent(self, value):
pass
def delContent(self):
self.content = ""
def getId(self):
return ""
def setId(self, value):
pass
def delId(self):
self.id = ""
def getTime(self):
return None
def setTime(self, value):
pass
def delTime(self):
self.time = None
def getUpdated(self):
return None
def setUpdated(self, value):
pass
def delUpdated(self):
self.updated = None
title = FeedDescriptor('title')
link = FeedDescriptor('link')
description = desc = FeedDescriptor('desc')
content = FeedDescriptor('content')
id = FeedDescriptor('id')
isPermaLink = FeedBool('isPermaLink')
time = FeedTime('time')
updated = FeedTime('updated')
def pushContent(self, value):
if not self.desc and self.content:
self.desc = self.content
self.content = value
def remove(self):
self.xml.getparent().remove(self.xml)
class FeedItemRSS(FeedItem):
timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
base = { 'rdf:rdf': 'rssfake:item',
'channel': 'item'}
def getTitle(self):
return self.xval('rssfake:title|title')
def setTitle(self, value):
if not value:
return self.xdel('rssfake:title|title')
table = { 'rdf:rdf': 'rssfake:title',
'channel': 'title'}
element = self.xgetCreate(table)
element.text = value
def getLink(self):
return self.xval('rssfake:link|link')
def setLink(self, value):
if self.isPermaLink and self.id == self.link != value:
self.isPermaLink = False
table = { 'rdf:rdf': 'rssfake:link',
'channel': 'link'}
element = self.xgetCreate(table)
element.text = value
def getDesc(self):
return self.xval('rssfake:description|description')
def setDesc(self, value):
if not value:
return self.xdel('rssfake:description|description')
table = { 'rdf:rdf': 'rssfake:description',
'channel': 'description'}
element = self.xgetCreate(table)
element.text = value
def getContent(self):
return self.xval('content:encoded')
def setContent(self, value):
if not value:
return self.xdel('content:encoded')
table = { 'rdf:rdf': 'content:encoded',
'channel': 'content:encoded'}
element = self.xgetCreate(table)
element.text = value
def getId(self):
return self.xval('rssfake:guid|guid')
def setId(self, value):
if not value:
return self.xdel('rssfake:guid|guid')
table = { 'rdf:rdf': 'rssfake:guid',
'channel': 'guid'}
element = self.xgetCreate(table)
element.text = value
def getIsPermaLink(self):
return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')
def setIsPermaLink(self, value):
table = { 'rdf:rdf': 'rssfake:guid',
'channel': 'guid'}
element = self.xgetCreate(table)
element.attrib['isPermaLink'] = value
def getTime(self):
return self.xval('rssfake:pubDate|pubDate')
def setTime(self, value):
if not value:
return self.xdel('rssfake:pubDate|pubDate')
table = { 'rdf:rdf': 'rssfake:pubDate',
'channel': 'pubDate'}
element = self.xgetCreate(table)
element.text = value
class FeedItemAtom(FeedItem):
timeFormat = '%Y-%m-%dT%H:%M:%SZ'
base = { 'atom:feed': 'atom:entry',
'atom03:feed': 'atom03:entry'}
def getTitle(self):
return self.xval('atom:title|atom03:title')
def setTitle(self, value):
if not value:
return self.xdel('atom:title|atom03:title')
table = { 'atom:feed': 'atom:title',
'atom03:feed': 'atom03:title'}
element = self.xgetCreate(table)
element.text = value
def getLink(self):
return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')
def setLink(self, value):
table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
element = self.xgetCreate(table)
element.attrib['href'] = value
def getDesc(self):
# default "type" is "text"
element = self.xget('atom:summary|atom03:summary')
if element is not None:
return innerHTML(element)
else:
return ""
def setDesc(self, value):
if not value:
return self.xdel('atom:summary|atom03:summary')
table = { 'atom:feed': 'atom:summary',
'atom03:feed': 'atom03:summary'}
element = self.xgetCreate(table)
if element.attrib.get('type', '') == 'xhtml':
cleanNode(element)
element.attrib['type'] = 'html'
element.text = value
def getContent(self):
element = self.xget('atom:content|atom03:content')
if element is not None:
return innerHTML(element)
else:
return ""
def setContent(self, value):
if not value:
return self.xdel('atom:content|atom03:content')
table = { 'atom:feed': 'atom:content',
'atom03:feed': 'atom03:content'}
element = self.xgetCreate(table)
if element.attrib.get('type', '') == 'xhtml':
cleanNode(element)
element.attrib['type'] = 'html'
element.text = value
def getId(self):
return self.xval('atom:id|atom03:id')
def setId(self, value):
if not value:
return self.xdel('atom:id|atom03:id')
table = { 'atom:feed': 'atom:id',
'atom03:feed': 'atom03:id'}
element = self.xgetCreate(table)
element.text = value
def getTime(self):
return self.xval('atom:published|atom03:published')
def setTime(self, value):
if not value:
return self.xdel('atom:published|atom03:published')
table = { 'atom:feed': 'atom:published',
'atom03:feed': 'atom03:published'}
element = self.xgetCreate(table)
element.text = value
def getUpdated(self):
return self.xval('atom:updated|atom03:updated')
def setUpdated(self, value):
if not value:
return self.xdel('atom:updated|atom03:updated')
table = { 'atom:feed': 'atom:updated',
'atom03:feed': 'atom03:updated'}
element = self.xgetCreate(table)
element.text = value
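
Taken together, feeds.py exposes a small read/write feed API. A short usage sketch (assumes the module is importable as feeds; no network access involved):

import feeds

rss = feeds.parse('<rss version="2.0"><channel><title>Demo</title></channel></rss>')
print rss.title                  # 'Demo'

item = rss.items.append()        # creates an empty <item> under <channel>
item.title = 'Hello'
item.link = 'http://example.org/hello'
print rss.tostring()             # serialised RSS including the new item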

morss/morss.py (new file, 799 lines)

@@ -0,0 +1,799 @@
#!/usr/bin/env python
import sys
import os
import os.path
import time
import Queue
import threading
from fnmatch import fnmatch
from base64 import b64encode, b64decode
import re
import string
import json
import lxml.html
import lxml.html.clean
import lxml.builder
import feeds
import feedify
import httplib
import urllib
import urllib2
import chardet
import urlparse
import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
from gzip import GzipFile
from StringIO import StringIO
from readability import readability
LIM_ITEM = 100 # deletes what's beyond
LIM_TIME = 7 # deletes what's after (in sec)
MAX_ITEM = 50 # cache-only beyond
MAX_TIME = 7 # cache-only after (in sec)
DELAY = 10*60 # xml cache & ETag cache (in sec)
TIMEOUT = 2 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
HOLD = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
FBAPPID = "<insert yours>"
FBSECRET = "<insert yours>"
FBAPPTOKEN = FBAPPID + '|' + FBSECRET
PROTOCOL = ['http', 'https', 'ftp']
if 'SCRIPT_NAME' in os.environ:
httplib.HTTPConnection.debuglevel = 1
import cgitb
cgitb.enable()
class MorssException(Exception):
pass
def log(txt, force=False):
if DEBUG or force:
if 'REQUEST_URI' in os.environ:
open('morss.log', 'a').write("%s\n" % repr(txt))
else:
print repr(txt)
def lenHTML(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
def countWord(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
else:
return 0
class ParseOptions:
def __init__(self, environ=False):
self.url = ''
self.options = {}
roptions = []
if environ:
if 'REQUEST_URI' in environ:
self.url = environ['REQUEST_URI'][1:]
else:
self.url = environ['PATH_INFO'][1:]
if self.url.startswith('/morss.py'):
self.url = self.url[10:]
elif self.url.startswith('morss.py'):
self.url = self.url[9:]
if self.url.startswith(':'):
roptions = self.url.split('/')[0].split(':')[1:]
self.url = self.url.split('/', 1)[1]
else:
if len(sys.argv) <= 1:
return
roptions = sys.argv[1:-1]
self.url = sys.argv[-1]
for option in roptions:
split = option.split('=', 1)
if len(split) > 1:
if split[1].lower() == 'true':
self.options[split[0]] = True
elif split[1].lower() == 'false':
self.options[split[0]] = False
else:
self.options[split[0]] = split[1]
else:
self.options[split[0]] = True
def __getattr__(self, key):
if key in self.options:
return self.options[key]
else:
return False
def __contains__(self, key):
return self.options.__contains__(key)
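# Illustration (not part of the module): options come either from the command
# line (morss.py OPTION ... URL) or from the request path when served over
# HTTP. A request for
#   /:html:al=3/http://example.org/feed
# yields options.html == True, options.al == '3' and options.url ==
# 'http://example.org/feed'; any option that was not given reads as False.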
class Cache:
""" Light, error-prone caching system. """
def __init__(self, folder, key, persistent=False, dic=False):
self._key = key
self._dir = folder
self._dic = dic
maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1
self._hash = urllib.quote_plus(self._key)[:maxsize]
self._file = self._dir + '/' + self._hash
self._cached = {} # what *was* cached
self._cache = {} # new things to put in cache
if os.path.isfile(self._file):
data = open(self._file).read()
if data:
self._cached = json.loads(data)
if persistent:
self._cache = self._cached
def __del__(self):
self.save()
def __contains__(self, key):
return key in self._cache or key in self._cached
def get(self, key):
if key in self._cache:
return self._cache[key]
elif key in self._cached:
self._cache[key] = self._cached[key]
return self._cached[key]
else:
if self._dic:
self._cache[key] = {}
return self._cache[key]
else:
return None
def set(self, key, content):
self._cache[key] = content
__getitem__ = get
__setitem__ = set
def save(self):
if len(self._cache) == 0:
return
if not os.path.exists(self._dir):
os.makedirs(self._dir)
out = json.dumps(self._cache, indent=4)
with open(self._file, 'w+') as file:
file.write(out)
def isYoungerThan(self, sec):
if not os.path.exists(self._file):
return False
return time.time() - os.path.getmtime(self._file) < sec
def new(self, key, persistent=False, dic=False):
""" Returns a Cache object in the same directory """
if key != self._key:
return Cache(self._dir, key, persistent, dic)
else:
return self
def redirect(self, key, persistent=False):
return self.__init__(self._dir, key, persistent)
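# Illustration (not part of the module): each Cache instance maps to one JSON
# file in the cache folder, named after the url-quoted key. A rough sketch:
#   cache = Cache('./cache', 'http://example.org/feed')
#   cache.set('etag', '"abc123"')
#   cache.save()         # writes ./cache/http%3A%2F%2Fexample.org%2Ffeed
#   cache.get('etag')    # -> '"abc123"'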
class SimpleDownload(urllib2.HTTPCookieProcessor):
"""
Custom urllib2 handler to download a page, sending etag/last-modified headers
to save bandwidth. On a 304 response the given headers are added back to the
reply so the cached copy can be reused transparently.
"""
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
self.cache = cache
self.etag = etag
self.lastmodified = lastmodified
self.useragent = useragent
self.decode = decode
self.accept = accept
self.strict = strict
def http_request(self, req):
urllib2.HTTPCookieProcessor.http_request(self, req)
req.add_unredirected_header('Accept-Encoding', 'gzip')
req.add_unredirected_header('User-Agent', self.useragent)
if req.get_host() != 'feeds.feedburner.com':
req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
if self.cache:
if self.etag:
req.add_unredirected_header('If-None-Match', self.etag)
if self.lastmodified:
req.add_unredirected_header('If-Modified-Since', self.lastmodified)
if self.accept is not None:
# req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
out = {}
rank = 1.1
for group in self.accept:
rank = rank - 0.1
if isinstance(group, basestring):
if group in MIMETYPE:
group = MIMETYPE[group]
else:
out[group] = rank
continue
for mime in group:
if mime not in out:
out[mime] = rank
if not self.strict:
out['*/*'] = rank-0.1
string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
req.add_unredirected_header('Accept', string)
return req
def http_error_304(self, req, fp, code, msg, headers):
log('http cached')
if self.etag:
headers.addheader('etag', self.etag)
if self.lastmodified:
headers.addheader('last-modified', self.lastmodified)
resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
return resp
def http_response(self, req, resp):
urllib2.HTTPCookieProcessor.http_response(self, req, resp)
odata = data = resp.read()
if 200 <= resp.code < 300:
# gzip
if resp.headers.get('Content-Encoding') == 'gzip':
log('un-gzip')
data = GzipFile(fileobj=StringIO(data), mode='r').read()
if 200 <= resp.code < 300 and resp.info().maintype == 'text':
# <meta> redirect
if resp.info().type in MIMETYPE['html']:
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
newurl = match.groups()[0]
log('redirect: %s' % newurl)
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ('content-length', 'content-type'))
new = urllib2.Request(newurl,
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
return self.parent.open(new, timeout=req.timeout)
# encoding
enc = detEncoding(data, resp)
if enc:
data = data.decode(enc, 'replace')
if not self.decode:
data = data.encode(enc)
fp = StringIO(data)
old_resp = resp
resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
https_request = http_request
def detEncoding(data, con=None):
if con is not None and con.headers.getparam('charset'):
log('header')
return con.headers.getparam('charset')
match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
if match:
log('meta.re')
return match.groups()[0]
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
if match:
return match.groups()[0].lower()
return None
def Fix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper():
item.title = item.title.title()
# check if it includes link
if not item.link:
log('no link')
return item
# check relative urls
item.link = urlparse.urljoin(feedurl, item.link)
# google
if fnmatch(item.link, 'http://www.google.com/url?q=*'):
item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
log(item.link)
# facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
log(item.link)
# feedburner
feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
match = item.xval('feedburner:origLink')
if match:
item.link = match
# feedsportal
match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
if match:
url = match.groups()[0].split('0')
t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
log(item.link)
# reddit
if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
if len(match):
item.link = match[0]
log(item.link)
return item
def Fill(item, cache, feedurl='/', fast=False):
""" Returns True when it has done its best """
if not item.link:
log('no link')
return item
log(item.link)
# content already provided?
count_content = countWord(item.content)
count_desc = countWord(item.desc)
if max(count_content, count_desc) > 500:
if count_desc > count_content:
item.content = item.desc
del item.desc
log('reversed sizes')
log('long enough')
return True
if count_content > 5*count_desc > 0 and count_content > 50:
log('content bigger enough')
return True
link = item.link
# twitter
if urlparse.urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# check cache and previous errors
if link in cache:
content = cache.get(link)
match = re.search(r'^error-([a-z]{2,10})$', content)
if match:
if cache.isYoungerThan(DELAY):
log('cached error: %s' % match.groups()[0])
return True
else:
log('old error')
else:
log('cached')
item.pushContent(cache.get(link))
return True
# super-fast mode
if fast:
log('skipped')
return False
# download
try:
url = link.encode('utf-8')
con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
data = con.read()
except (IOError, httplib.HTTPException) as e:
log('http error: %s' % e.message)
cache.set(link, 'error-http')
return True
if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
log('non-text page')
cache.set(link, 'error-type')
return True
out = readability.Document(data, url=con.url).summary(True)
if countWord(out) > max(count_content, count_desc) > 0:
item.pushContent(out)
cache.set(link, out)
else:
log('not bigger enough')
cache.set(link, 'error-length')
return True
return True
def Init(url, cachePath, options):
# url clean up
log(url)
if url is None:
raise MorssException('No url provided')
if urlparse.urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
# cache
cache = Cache(cachePath, url, options.proxy)
log(cache._hash)
return (url, cache)
def Fetch(url, cache, options):
# do some useful facebook work
feedify.PreWorker(url, cache)
if 'redirect' in cache:
url = cache.get('redirect')
log('url redirect')
log(url)
if 'cache' in cache:
cache.redirect(cache.get('cache'))
log('cache redirect')
# fetch feed
if cache.isYoungerThan(DELAY) and not options.theforce and 'xml' in cache and 'style' in cache:
log('xml cached')
xml = cache.get('xml')
style = cache.get('style')
else:
try:
opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
xml = con.read()
except (IOError, httplib.HTTPException):
raise MorssException('Error downloading feed')
cache.set('xml', xml)
cache.set('etag', con.headers.getheader('etag'))
cache.set('lastmodified', con.headers.getheader('last-modified'))
if xml.startswith('<?xml') or con.info().type in MIMETYPE['xml']:
style = 'normal'
elif feedify.supported(url):
style = 'feedify'
elif con.info().type in MIMETYPE['html']:
style = 'html'
else:
style = 'none'
log(con.info().type)
cache.set('style', style)
log(style)
if style == 'normal':
rss = feeds.parse(xml)
elif style == 'feedify':
feed = feedify.Builder(url, xml, cache)
feed.build()
rss = feed.feed
elif style == 'html':
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
if len(match):
link = urlparse.urljoin(url, match[0])
return Fetch(link, cache, options)
else:
log('no-link html')
raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
else:
log('random page')
raise MorssException('Link provided is not a valid feed')
cache.save()
return rss
def Gather(rss, url, cache, options):
log('YEAH')
size = len(rss.items)
startTime = time.time()
# custom settings
global LIM_ITEM
global LIM_TIME
global MAX_ITEM
global MAX_TIME
if options.progress:
MAX_TIME = -1
LIM_TIME = 15
MAX_ITEM = -1
LIM_ITEM = -1
if options.cache:
MAX_TIME = 0
if options.OFCOURSENOT:
log('welcome home')
LIM_ITEM = -1
LIM_TIME = -1
MAX_ITEM = -1
MAX_TIME = -1
# set
def runner(queue):
while True:
value = queue.get()
try:
worker(*value)
except Exception as e:
log('Thread Error: %s' % e.message)
queue.task_done()
def worker(i, item):
if time.time() - startTime > LIM_TIME >= 0 or i+1 > LIM_ITEM >= 0:
log('dropped')
item.remove()
return
item = Fix(item, url)
if time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM >= 0:
if not options.proxy:
if Fill(item, cache, url, True) is False:
item.remove()
return
else:
if not options.proxy:
Fill(item, cache, url)
if 'al' in options:
if i+1 > int(options.al):
item.remove()
return
if item.desc and item.content:
if options.clip:
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc
if not options.keep:
del item.desc
queue = Queue.Queue()
for i in range(THREADS):
t = threading.Thread(target=runner, args=(queue,))
t.daemon = True
t.start()
for i, item in enumerate(rss.items):
queue.put([i, item])
queue.join()
cache.save()
log(len(rss.items))
log(time.time() - startTime)
return rss
def cgi_app(environ, start_response):
options = ParseOptions(environ)
url = options.url
headers = {}
global DEBUG
DEBUG = options.debug
if 'HTTP_IF_NONE_MATCH' in environ:
if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
headers['status'] = '304 Not Modified'
start_response(headers['status'], headers.items())
log(url)
log('etag good')
return []
headers['status'] = '200 OK'
headers['etag'] = '"%s"' % int(time.time())
if options.html:
headers['content-type'] = 'text/html'
elif options.debug or options.txt:
headers['content-type'] = 'text/plain'
elif options.json:
headers['content-type'] = 'application/json'
else:
headers['content-type'] = 'text/xml'
url, cache = Init(url, os.getcwd() + '/cache', options)
RSS = Fetch(url, cache, options)
RSS = Gather(RSS, url, cache, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = RSS.mimetype
start_response(headers['status'], headers.items())
if not DEBUG and not options.silent:
if options.json:
if options.indent:
return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
else:
return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
else:
return RSS.tostring(xml_declaration=True, encoding='UTF-8')
log('done')
def cgi_wrapper(environ, start_response):
try:
return cgi_app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except MorssException as e:
headers = {}
headers['status'] = '500 Oops'
headers['content-type'] = 'text/plain'
start_response(headers['status'], headers.items(), sys.exc_info())
return 'Internal Error: %s' % e.message
except Exception as e:
headers = {}
headers['status'] = '500 Oops'
headers['content-type'] = 'text/plain'
start_response(headers['status'], headers.items(), sys.exc_info())
return 'Unknown Error: %s' % e.message
def cli_app():
options = ParseOptions()
url = options.url
global DEBUG
DEBUG = options.debug
url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
RSS = Fetch(url, cache, options)
RSS = Gather(RSS, url, cache, options)
if not DEBUG and not options.silent:
if options.json:
if options.indent:
print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
else:
print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
else:
print RSS.tostring(xml_declaration=True, encoding='UTF-8')
log('done')
if options.facebook:
facebook = Cache(os.path.expanduser('~/.cache/morss'), 'facebook', persistent=True, dic=True)
# get real token from code
code = urlparse.parse_qs(urlparse.urlparse(url).query)['code'][0]
eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri="http://test.morss.it/:facebook/")
token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]
# get long-lived access token
eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())
ltoken = values['access_token'][0]
expires = int(time.time() + int(values['expires'][0]))
# get user id
iurl = "https://graph.facebook.com/me?fields=id&access_token={token}".format(ltoken)
user_id = json.loads(urllib2.urlopen(iurl).read())['id']
# do sth out of it
if user_id not in facebook['user']:
facebook['user'][user_id] = {'original': ltoken}
facebook['token'][ltoken] = {'user': user_id, 'expires': expires}
facebook['user'][user_id]['token'] = ltoken
facebook.save()
if 'REQUEST_URI' in os.environ:
print 'Status: 200'
print 'Content-Type: text/plain'
print ''
print "token updated"
sys.exit(0)
def main():
if 'REQUEST_URI' in os.environ:
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
elif len(sys.argv) <= 1:
httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
httpd.serve_forever()
else:
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except MorssException as e:
print 'Internal Error: %s' % e.message
except Exception as e:
print 'Unknown Error: %s' % e.message
if __name__ == '__main__':
main()
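
For reference, the same pipeline can be driven programmatically rather than through the CLI or CGI entry points; a minimal sketch (performs network access; the cache path is an arbitrary choice):

import morss

options = morss.ParseOptions()    # falls back to sys.argv when arguments are present
url, cache = morss.Init('http://example.org/feed.xml', '/tmp/morss-cache', options)
rss = morss.Fetch(url, cache, options)       # download the feed, or reuse the cached copy
rss = morss.Gather(rss, url, cache, options) # fix links and pull in full article content
print rss.tostring(xml_declaration=True, encoding='UTF-8')

From the command line, python morss/morss.py http://example.org/feed.xml does the equivalent, and running it with no argument starts the built-in WSGI server on port 8080.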