First attempt at decent folder structure
Use setup.py, subfolder for code.
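(No setup.py appears in this diff itself; a minimal packaging sketch for this layout could look like the following. The metadata and dependency names are assumptions, not part of the commit.)

# setup.py -- hypothetical sketch, not included in this commit
from setuptools import setup

setup(
    name='morss',
    packages=['morss'],
    package_data={'morss': ['feedify.ini']},
    install_requires=['lxml', 'python-dateutil', 'chardet'],  # modules imported by the code below
)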
morss/__init__.py — 0 lines (new file)

morss/feedify.ini — 70 lines (new file)
@@ -0,0 +1,70 @@
[twitter]
mode=xpath
path=
    http://twitter.com/*
    https://twitter.com/*
    http://www.twitter.com/*
    https://www.twitter.com/*

title= //head/title/text()
items= //div[class=tweet]

item_title= ./@data-name " (@" ./@data-screen-name ")"
item_link= .//a[class=details]/@href
item_content= .//p[class=tweet-text]
item_time= .//span/@data-time

[google]
mode=xpath
path=
    http://google.com/search?q=*
    http://www.google.com/search?q=*

title= //head/title/text()
items= //li[class=g]

item_title= .//h3//text()
item_link= .//a/@href
item_content= .//span[class=st]

[ddg.gg]
mode=xpath
path=
    http://duckduckgo.com/html/?q=*
    https://duckduckgo.com/html/?q=*

title= //head/title/text()
items= //div[class=results_links][not(contains(@class,'sponsored'))]

item_title= .//a[class=large]//text()
item_link= .//a[class=large]/@href
item_content= .//div[class=snippet]

[facebook home]
mode=json
path=
    https://graph.facebook.com/*/home*
    https://graph.facebook.com/*/feed*

title= "Facebook"
items= data

item_title= from.name {" > " to.data.name<", ">}
item_link= actions.link[0]
item_content= message story{"<br/><br/><a href='" link "'><img src='" picture "' /></a>"}{"<blockquote><a href='" link "'>" name "</a><br/>" description "</blockquote>"}{"<br/><br/> – @ " place.name}
item_time= created_time
item_id= id

[facebook message/post]
mode=json
path=
    https://graph.facebook.com/*
    https://graph.facebook.com/*

title= "Facebook"
items= comments.data

item_title= from.name
item_content= message
item_time= created_time
item_id= id
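(The sections above are plain ConfigParser rules. A small sketch of how a URL gets matched against the [twitter] path globs, mirroring what getRule() in feedify.py below does; the sample URL is made up.)

# Sketch: match a URL against the [twitter] rule, same approach as getRule()
from ConfigParser import ConfigParser
from fnmatch import fnmatch

config = ConfigParser()
config.read('feedify.ini')

rule = dict(config.items('twitter'))
paths = rule['path'].split('\n')[1:]  # continuation lines hold the URL globs
print any(fnmatch('https://twitter.com/morss', path) for path in paths)  # True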
morss/feedify.py — 243 lines (new file)
@@ -0,0 +1,243 @@
#!/usr/bin/env python

from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import morss
import re

import urllib2
import lxml.html
import json
import urlparse

import time

def toclass(query):
    pattern = r'\[class=([^\]]+)\]'
    repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
    return re.sub(pattern, repl, query)

def getRule(link):
    config = ConfigParser()
    config.read('feedify.ini')

    for section in config.sections():
        values = dict(config.items(section))
        values['path'] = values['path'].split('\n')[1:]
        for path in values['path']:
            if fnmatch(link, path):
                return values
    return False

def supported(link):
    return getRule(link) is not False

def formatString(string, getter, error=False):
    out = ""
    char = string[0]

    follow = string[1:]

    if char == '"':
        match = follow.partition('"')
        out = match[0]
        if len(match) >= 2:
            next = match[2]
        else:
            next = None
    elif char == '{':
        match = follow.partition('}')
        try:
            test = formatString(match[0], getter, True)
        except ValueError, KeyError:
            pass
        else:
            out = test

        next = match[2]
    elif char == ' ':
        next = follow
    elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
        match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
        rawValue = getter(match[0])
        if not isinstance(rawValue, basestring):
            if match[1] is not None:
                out = match[1].join(rawValue)
            else:
                out = ''.join(rawValue)
        if not out and error:
            raise ValueError
        next = match[2]
    else:
        raise ValueError('bogus string')

    if next is not None and len(next):
        return out + formatString(next, getter, error)
    else:
        return out

def PreWorker(url, cache):
    if urlparse.urlparse(url).netloc == 'graph.facebook.com':
        facebook = cache.new('facebook', persistent=True, dic=True)
        token = urlparse.parse_qs(urlparse.urlparse(url).query)['access_token'][0]

        if token not in facebook['token']:
            # this token ain't known, look for info about it
            eurl = "https://graph.facebook.com/debug_token?input_token={token}&access_token={app_token}".format(token=token, app_token=morss.FBAPPTOKEN)
            data = json.loads(urllib2.urlopen(eurl).read())['data']

            app_id = str(data['app_id'])
            user_id = str(data['user_id'])
            expires = int(data['expires_at'])
            short = 'issued_at' not in data

            facebook['token'][token] = {'user': user_id, 'expires': expires}

            # do some woodoo to know if we already have sth better

            if user_id not in facebook['user']:
                # grab a new one anyway, new user
                facebook['user'][user_id] = {'original': token}
                good = True
            else:
                # maybe it's a better one
                last = facebook['user'][user_id]['token']
                last_expires = facebook['token'][last]['expires']

                if expires > last_expires:
                    # new is better
                    good = True

            if good and short and app_id == morss.FBAPPID:
                eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=morss.FBAPPID, app_secret=morss.FBSECRET, short_lived_token=token)
                values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())

                token = values['access_token'][0]
                expires = int(time.time() + int(values['expires'][0]))

                facebook['token'][token] = {'user': user_id, 'expires': expires}

                facebook['user'][user_id]['token'] = token

        # hey look for a newer token and use it
        token = urlparse.parse_qs(urlparse.urlparse(url).query)['access_token'][0]
        user_id = facebook['token'][token]['user']
        last = facebook['user'][user_id]['token']
        original = facebook['user'][user_id]['original']

        nurl = url.replace(token, last)
        ncache = url.replace(token, original)
        cache.set('redirect', nurl)
        cache.set('cache', ncache)

class Builder(object):
    def __init__(self, link, data=None, cache=False):
        self.link = link
        self.cache = cache

        if data is None:
            data = urllib2.urlopen(link).read()
        self.data = data

        self.rule = getRule(link)

        if self.rule['mode'] == 'xpath':
            if not isinstance(self.data, unicode):
                self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
            self.doc = lxml.html.fromstring(self.data)
        elif self.rule['mode'] == 'json':
            self.doc = json.loads(data)

        self.feed = feeds.FeedParserAtom()

    def raw(self, html, expr):
        if self.rule['mode'] == 'xpath':
            return html.xpath(toclass(expr))

        elif self.rule['mode'] == 'json':
            a = [html]
            b = []
            for x in expr.strip(".").split("."):
                match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
                for elem in a:
                    if isinstance(elem, dict):
                        kids = elem.get(match[0])
                        if kids is None:
                            pass
                        elif isinstance(kids, list):
                            [b.append(i) for i in kids]
                        elif isinstance(kids, basestring):
                            b.append(kids.replace('\n', '<br/>'))
                        else:
                            b.append(kids)

                if match[1] is None:
                    a = b
                else:
                    if len(b)-1 >= int(match[1]):
                        a = [b[int(match[1])]]
                    else:
                        a = []
                b = []
            return a

    def strings(self, html, expr):
        if self.rule['mode'] == 'xpath':
            out = []
            for match in self.raw(html, expr):
                if isinstance(match, basestring):
                    out.append(match)
                elif isinstance(match, lxml.html.HtmlElement):
                    out.append(lxml.html.tostring(match))
            return out

        elif self.rule['mode'] == 'json':
            return self.raw(html, expr)

    def string(self, html, expr):
        getter = lambda x: self.strings(html, x)
        return formatString(self.rule[expr], getter)

    def build(self):
        if 'title' in self.rule:
            self.feed.title = self.string(self.doc, 'title')

        if 'items' in self.rule:
            matches = self.raw(self.doc, self.rule['items'])
            if matches and len(matches):
                for item in matches:
                    feedItem = {}

                    if 'item_title' in self.rule:
                        feedItem['title'] = self.string(item, 'item_title')
                    if 'item_link' in self.rule:
                        url = self.string(item, 'item_link')
                        url = urlparse.urljoin(self.link, url)
                        feedItem['link'] = url
                    if 'item_desc' in self.rule:
                        feedItem['desc'] = self.string(item, 'item_desc')
                    if 'item_content' in self.rule:
                        feedItem['content'] = self.string(item, 'item_content')
                    if 'item_time' in self.rule:
                        feedItem['updated'] = self.string(item, 'item_time')
                    if 'item_id' in self.rule:
                        feedItem['id'] = self.string(item, 'item_id')
                        feedItem['isPermaLink'] = False

                    self.feed.items.append(feedItem)


        if urlparse.urlparse(self.link).netloc == 'graph.facebook.com':
            if self.cache:
                facebook = self.cache.new('facebook', True)
                token = urlparse.parse_qs(urlparse.urlparse(self.link).query)['access_token'][0]
                expires = facebook['token'][token]['expires']
                lifespan = expires - time.time()

                if lifespan < 5*24*3600:
                    new = self.feed.items.append()
                    new.title = "APP AUTHORISATION RENEWAL NEEDED"
                    new.link = "https://www.facebook.com/dialog/oauth?client_id={app_id}&redirect_uri=http://test.morss.it/:facebook/".format(app_id=morss.FBAPPID)
                    new.desc = "Please renew your Facebook app token for this app to keep working for this feed.<br/><a href='{}'>Go!</a>".format(new.link)
                    new.time = expires
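(A small usage sketch of the string mini-language handled by formatString() above, with a plain dictionary standing in for Builder.strings(); the sample data is invented.)

# Sketch: formatString() with a toy getter, assuming feedify.py and its imports are available
import feedify

entry = {'from.name': ['Alice'], 'place.name': []}
getter = lambda key: entry.get(key, [])

print feedify.formatString('from.name " wrote this"', getter)
# -> 'Alice wrote this'
print feedify.formatString('"hi" {" - @ " place.name}', getter)
# -> 'hi'  (the {...} block is dropped because place.name is empty)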
morss/feeds.py — 711 lines (new file)
@@ -0,0 +1,711 @@
#!/usr/bin/env python

from lxml import etree
from datetime import datetime
import dateutil.parser
from dateutil import tz
import re

Element = etree.Element

NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
    'atom03': 'http://purl.org/atom/ns#',
    'media': 'http://search.yahoo.com/mrss/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'slash': 'http://purl.org/rss/1.0/modules/slash/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'rssfake': 'http://purl.org/rss/1.0/'}

def load(url):
    import urllib2
    d = urllib2.urlopen(url).read()
    return parse(d)

def tagNS(tag, nsmap=NSMAP):
    match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
    if match:
        match = match.groups()
        for (key, url) in nsmap.iteritems():
            if url == match[0]:
                return "%s:%s" % (key, match[1].lower())
    else:
        match = re.search(r'^([^:]+):([^:]+)$', tag)
        if match:
            match = match.groups()
            if match[0] in nsmap:
                return "{%s}%s" % (nsmap[match[0]], match[1].lower())
    return tag

def innerHTML(xml):
    return (xml.text or '') + ''.join([etree.tostring(child) for child in xml.iterchildren()])

def cleanNode(xml):
    [xml.remove(child) for child in xml.iterchildren()]

class FeedException(Exception):
    pass

def parse(data):
    # encoding
    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        enc = match.groups()[0].lower()
        if not isinstance(data, unicode):
            data = data.decode(enc, 'ignore')
        data = data.encode(enc)

    # parse
    parser = etree.XMLParser(recover=True)
    doc = etree.fromstring(data, parser)

    # rss
    match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
    if len(match):
        mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
            'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom }
        match = match[0]
        tag = tagNS(match.tag)
        if tag in mtable:
            return mtable[tag](doc, tag)

    raise FeedException('unknow feed type')

class FeedBase(object):
    """
    Base for xml-related classes, which provides simple wrappers around xpath
    selection and item creation
    """

    def __getitem__(self, item):
        return getattr(self, item)

    def __setitem__(self, item, value):
        setattr(self, item, value)

    def __delitem__(self, item):
        delattr(self, item)

    def __iter__(self):
        for element in self.dic:
            value = self[element]

            if isinstance(value, FeedList):
                value = [dict(x) for x in value]
            elif isinstance(value, datetime):
                value = value.isoformat()

            yield element, value

    def xpath(self, path):
        """ Test xpath rule on xml tree """
        return self.root.xpath(path, namespaces=NSMAP)

    def xget(self, path):
        """ Returns the 1st xpath match """
        match = self.xpath(path)
        if len(match):
            return match[0]
        else:
            return None

    def xval(self, path):
        """ Returns the .text of the 1st match """
        match = self.xget(path)
        if match is not None:
            return match.text or ""
        else:
            return ""

    def xgetCreate(self, table):
        """ Returns an element, and creates it when not present """
        value = table[self.tag]
        if not isinstance(value, tuple):
            value = (value, value)
        new, xpath = value
        match = self.xget(xpath)
        if match is not None:
            return match
        else:
            element = etree.Element(tagNS(new))
            self.root.append(element)
            return element

    def xdel(self, path):
        match = self.xget(path)
        if match is not None:
            return match.getparent().remove(match)

    def tostring(self, **k):
        """ Returns string using lxml. Arguments passed to tostring """
        return etree.tostring(self.xml, pretty_print=True, **k)

class FeedDescriptor(object):
    """
    Descriptor which gives off elements based on "self.getName" and
    "self.setName" as getter/setters. Looks far better, and avoids duplicates
    """
    def __init__(self, name):
        self.name = name
        self.nname = name[0].upper() + name[1:]

    def __get__(self, instance, owner):
        getter = getattr(instance, 'get%s' % self.nname)
        return getter()

    def __set__(self, instance, value):
        setter = getattr(instance, 'set%s' % self.nname)
        return setter(value)

    def __delete__(self, instance):
        deleter = getattr(instance, 'del%s' % self.nname)
        return deleter()

class FeedTime(FeedDescriptor):
    def __get__(self, instance, owner):
        getter = getattr(instance, 'get%s' % self.nname)
        raw = getter()
        try:
            time = parseTime(raw)
            return time
        except ValueError:
            return None

    def __set__(self, instance, value):
        try:
            time = parseTime(value)
            raw = time.strftime(instance.timeFormat)
            setter = getattr(instance, 'set%s' % self.nname)
            return setter(raw)
        except ValueError:
            pass

class FeedBool(FeedDescriptor):
    def __get__(self, instance, owner):
        getter = getattr(instance, 'get%s' % self.nname)
        raw = getter()
        return (raw or '').lower() != 'false'

    def __set__(self, instance, value):
        raw = 'true' if value else 'false'
        setter = getattr(instance, 'set%s' % self.nname)
        return setter(raw)

def parseTime(value):
    if isinstance(value, basestring):
        if re.match(r'^[0-9]+$', value):
            return datetime.fromtimestamp(int(value), tz.tzutc())
        else:
            return dateutil.parser.parse(value, tzinfos=tz.tzutc)
    elif isinstance(value, int):
        return datetime.fromtimestamp(value, tz.tzutc())
    elif isinstance(value, datetime):
        return value
    else:
        return False

class FeedList(object):
    """
    Class to map a list of xml elements against a list of matching objects,
    while avoiding to recreate the same matching object over and over again. So
    as to avoid extra confusion, list's elements are called "children" here, so
    as not to use "items", which is already in use in RSS/Atom related code.

    Comes with its very own descriptor.
    """
    def __init__(self, parent, getter, tag, childClass):
        self.parent = parent
        self.getter = getter
        self.childClass = childClass
        self.tag = tag
        self._children = {} # id(xml) => FeedItem

    def getChildren(self):
        children = self.getter()
        out = []
        for child in children:
            if id(child) in self._children:
                out.append(self._children[id(child)])
            else:
                new = self.childClass(child, self.tag)
                self._children[id(child)] = new
                out.append(new)
        return out

    def append(self, cousin=None):
        new = self.childClass(tag=self.tag)
        self.parent.root.append(new.xml)
        self._children[id(new.xml)] = new

        if cousin is None:
            return new

        for key in self.childClass.__dict__:
            if key[:3] == 'set':
                attr = key[3:].lower()
                if hasattr(cousin, attr):
                    setattr(new, attr, getattr(cousin, attr))
                elif attr in cousin:
                    setattr(new, attr, cousin[attr])

        return new

    def __getitem__(self, key):
        return self.getChildren()[key]

    def __delitem__(self, key):
        child = self.getter()[key]
        if id(child) in self._children:
            self._children[id(child)].remove()
            del self._children[id(child)]
        else:
            child.getparent().remove(child)

    def __len__(self):
        return len(self.getter())

class FeedListDescriptor(object):
    """
    Descriptor for FeedList
    """
    def __init__(self, name):
        self.name = name
        self.items = {} # id(instance) => FeedList

    def __get__(self, instance, owner=None):
        key = id(instance)
        if key in self.items:
            return self.items[key]
        else:
            getter = getattr(instance, 'get%s' % self.name.title())
            className = globals()[getattr(instance, '%sClass' % self.name)]
            self.items[key] = FeedList(instance, getter, instance.tag, className)
            return self.items[key]

    def __set__(self, instance, value):
        feedlist = self.__get__(instance)
        [x.remove() for x in [x for x in f.items]]
        [feedlist.append(x) for x in value]

class FeedParser(FeedBase):
    itemsClass = 'FeedItem'
    mimetype = 'application/xml'
    base = '<?xml?>'
    dic = ('title', 'desc', 'items')

    def __init__(self, xml=None, tag='atom:feed'):
        if xml is None:
            xml = etree.fromstring(self.base[tag])
        self.xml = xml
        self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
        self.tag = tag

    def getTitle(self):
        return ""

    def setTitle(self, value):
        pass

    def delTitle(self):
        self.title = ""


    def getDesc(self):
        pass

    def setDesc(self, value):
        pass

    def delDesc(self):
        self.desc = ""


    def getItems(self):
        return []

    title = FeedDescriptor('title')
    description = desc = FeedDescriptor('desc')
    items = FeedListDescriptor('items')

    def tostring(self, **k):
        return etree.tostring(self.xml.getroottree(), pretty_print=True, **k)

class FeedParserRSS(FeedParser):
    """
    RSS Parser
    """
    itemsClass = 'FeedItemRSS'
    mimetype = 'application/rss+xml'
    base = { 'rdf:rdf': '<?xml version="1.0" encoding="utf-8"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/"><channel rdf:about="http://example.org/rss.rdf"></channel></rdf:RDF>',
        'channel': '<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel></channel></rss>'}

    def getTitle(self):
        return self.xval('rssfake:title|title')

    def setTitle(self, value):
        if not value:
            return self.xdel('rssfake:title|title')

        table = { 'rdf:rdf': 'rssfake:title',
            'channel': 'title'}
        element = self.xgetCreate(table)
        element.text = value


    def getDesc(self):
        return self.xval('rssfake:description|description')

    def setDesc(self, value):
        if not value:
            return self.xdel('rssfake:description|description')

        table = { 'rdf:rdf': 'rssfake:description',
            'channel': 'description'}
        element = self.xgetCreate(table)
        element.text = value


    def getItems(self):
        return self.xpath('rssfake:item|item')

class FeedParserAtom(FeedParser):
    """
    Atom Parser
    """
    itemsClass = 'FeedItemAtom'
    mimetype = 'application/atom+xml'
    base = { 'atom:feed': '<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>',
        'atom03:feed': '<?xml version="1.0" encoding="utf-8"?><feed version="0.3" xmlns="http://purl.org/atom/ns#"></feed>'}

    def getTitle(self):
        return self.xval('atom:title|atom03:title')

    def setTitle(self, value):
        if not value:
            return self.xval('atom:title|atom03:title')

        table = { 'atom:feed': 'atom:title',
            'atom03:feed': 'atom03:title'}
        element = self.xgetCreate(table)
        element.text = value


    def getDesc(self):
        return self.xval('atom:subtitle|atom03:subtitle')

    def setDesc(self, value):
        if not value:
            return self.xdel('atom:subtitle|atom03:subtitle')

        table = { 'atom:feed': 'atom:subtitle',
            'atom03:feed': 'atom03:subtitle'}
        element = self.xgetCreate(table)
        element.text = value


    def getItems(self):
        return self.xpath('atom:entry|atom03:entry')

class FeedItem(FeedBase):
    timeFormat = ''
    dic = ('title', 'link', 'isPermaLink', 'desc', 'content', 'id', 'time', 'updated')

    def __init__(self, xml=None, tag='atom:feed'):
        if xml is None:
            xml = Element(tagNS(self.base[tag]))

        self.root = self.xml = xml
        self.tag = tag

    def getTitle(self):
        return ""

    def setTitle(self):
        pass

    def delTitle(self):
        self.title = ""


    def getLink(self):
        return ""

    def setLink(self, value):
        pass

    def delLink(self):
        pass


    def getIsPermaLink(self):
        return ""

    def setIsPermaLink(self, value):
        pass


    def getDesc(self):
        return ""

    def setDesc(self, value):
        pass

    def delDesc(self):
        self.desc = ""


    def getContent(self):
        return ""

    def setContent(self, value):
        pass

    def delContent(self):
        self.content = ""


    def getId(self):
        return ""

    def setId(self, value):
        pass

    def delId(self):
        self.id = ""


    def getTime(self):
        return None

    def setTime(self, value):
        pass

    def delTime(self):
        self.time = None


    def getUpdated(self):
        return None

    def setUpdated(self, value):
        pass

    def delUpdated(self):
        self.updated = None

    title = FeedDescriptor('title')
    link = FeedDescriptor('link')
    description = desc = FeedDescriptor('desc')
    content = FeedDescriptor('content')
    id = FeedDescriptor('id')
    isPermaLink = FeedBool('isPermaLink')
    time = FeedTime('time')
    updated = FeedTime('updated')

    def pushContent(self, value):
        if not self.desc and self.content:
            self.desc = self.content

        self.content = value

    def remove(self):
        self.xml.getparent().remove(self.xml)

class FeedItemRSS(FeedItem):
    timeFormat = '%a, %d %b %Y %H:%M:%S %Z'
    base = { 'rdf:rdf': 'rssfake:item',
        'channel': 'item'}

    def getTitle(self):
        return self.xval('rssfake:title|title')

    def setTitle(self, value):
        if not value:
            return self.xdel('rssfake:title|title')

        table = { 'rdf:rdf': 'rssfake:title',
            'channel': 'title'}
        element = self.xgetCreate(table)
        element.text = value


    def getLink(self):
        return self.xval('rssfake:link|link')

    def setLink(self, value):
        if self.isPermaLink and self.id == self.link != value:
            self.isPermaLink = False

        table = { 'rdf:rdf': 'rssfake:link',
            'channel': 'link'}
        element = self.xgetCreate(table)
        element.text = value


    def getDesc(self):
        return self.xval('rssfake:description|description')

    def setDesc(self, value):
        if not value:
            return self.xdel('rssfake:description|description')

        table = { 'rdf:rdf': 'rssfake:description',
            'channel': 'description'}
        element = self.xgetCreate(table)
        element.text = value


    def getContent(self):
        return self.xval('content:encoded')

    def setContent(self, value):
        if not value:
            return self.xdel('content:encoded')

        table = { 'rdf:rdf': 'content:encoded',
            'channel': 'content:encoded'}
        element = self.xgetCreate(table)
        element.text = value


    def getId(self):
        return self.xval('rssfake:guid|guid')

    def setId(self, value):
        if not value:
            return self.xdel('rssfake:guid|guid')

        table = { 'rdf:rdf': 'rssfake:guid',
            'channel': 'guid'}
        element = self.xgetCreate(table)
        element.text = value


    def getIsPermaLink(self):
        return self.xget('rssfake:guid/@isPermaLink|guid/@isPermaLink')

    def setIsPermaLink(self, value):
        table = { 'rdf:rdf': 'rssfake:guid',
            'channel': 'guid'}
        element = self.xgetCreate(table)
        element.attrib['isPermaLink'] = value


    def getTime(self):
        return self.xval('rssfake:pubDate|pubDate')

    def setTime(self, value):
        if not value:
            return self.xdel('rssfake:pubDate|pubDate')

        table = { 'rdf:rdf': 'rssfake:pubDate',
            'channel': 'pubDate'}
        element = self.xgetCreate(table)
        element.text = value

class FeedItemAtom(FeedItem):
    timeFormat = '%Y-%m-%dT%H:%M:%SZ'
    base = { 'atom:feed': 'atom:entry',
        'atom03:feed': 'atom03:entry'}

    def getTitle(self):
        return self.xval('atom:title|atom03:title')

    def setTitle(self, value):
        if not value:
            return self.xdel('atom:title|atom03:title')

        table = { 'atom:feed': 'atom:title',
            'atom03:feed': 'atom03:title'}
        element = self.xgetCreate(table)
        element.text = value


    def getLink(self):
        return self.xget('(atom:link|atom03:link)[@rel="alternate" or not(@rel)]/@href')

    def setLink(self, value):
        table = { 'atom:feed': ('atom:link', 'atom:link[@rel="alternate" or not(@rel)]'),
            'atom03:feed': ('atom03:link', 'atom03:link[@rel="alternate" or not(@rel)]')}
        element = self.xgetCreate(table)
        element.attrib['href'] = value


    def getDesc(self):
        # default "type" is "text"
        element = self.xget('atom:summary|atom03:summary')
        if element is not None:
            return innerHTML(element)
        else:
            return ""

    def setDesc(self, value):
        if not value:
            return self.xdel('atom:summary|atom03:summary')

        table = { 'atom:feed': 'atom:summary',
            'atom03:feed': 'atom03:summary'}
        element = self.xgetCreate(table)
        if element.attrib.get('type', '') == 'xhtml':
            cleanNode(element)
        element.attrib['type'] = 'html'
        element.text = value


    def getContent(self):
        element = self.xget('atom:content|atom03:content')
        if element is not None:
            return innerHTML(element)
        else:
            return ""

    def setContent(self, value):
        if not value:
            return self.xdel('atom:content|atom03:content')

        table = { 'atom:feed': 'atom:content',
            'atom03:feed': 'atom03:content'}
        element = self.xgetCreate(table)
        if element.attrib.get('type', '') == 'xhtml':
            cleanNode(element)
        element.attrib['type'] = 'html'
        element.text = value


    def getId(self):
        return self.xval('atom:id|atom03:id')

    def setId(self, value):
        if not value:
            return self.xdel('atom:id|atom03:id')

        table = { 'atom:feed': 'atom:id',
            'atom03:feed': 'atom03:id'}
        element = self.xgetCreate(table)
        element.text = value


    def getTime(self):
        return self.xval('atom:published|atom03:published')

    def setTime(self, value):
        if not value:
            return self.xdel('atom:published|atom03:published')

        table = { 'atom:feed': 'atom:published',
            'atom03:feed': 'atom03:published'}
        element = self.xgetCreate(table)
        element.text = value


    def getUpdated(self):
        return self.xval('atom:updated|atom03:updated')

    def setUpdated(self, value):
        if not value:
            return self.xdel('atom:updated|atom03:updated')

        table = { 'atom:feed': 'atom:updated',
            'atom03:feed': 'atom03:updated'}
        element = self.xgetCreate(table)
        element.text = value
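(A short sketch of how the classes above can be driven to build an Atom feed from scratch; the values are placeholders.)

# Sketch: build a minimal Atom feed with feeds.py
import feeds

feed = feeds.FeedParserAtom()
feed.title = 'Example feed'

item = feed.items.append()
item.title = 'First entry'
item.link = 'http://example.org/1'
item.time = 1388530800  # unix timestamp, converted by the FeedTime descriptor via parseTime()

print feed.tostring(xml_declaration=True, encoding='UTF-8')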
morss/morss.py — 799 lines (new file)
@@ -0,0 +1,799 @@
#!/usr/bin/env python
import sys
import os
import os.path
import time

import Queue
import threading

from fnmatch import fnmatch
from base64 import b64encode, b64decode
import re
import string
import json

import lxml.html
import lxml.html.clean
import lxml.builder

import feeds
import feedify

import httplib
import urllib
import urllib2
import chardet
import urlparse

import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers

from gzip import GzipFile
from StringIO import StringIO

from readability import readability

LIM_ITEM = 100 # deletes what's beyond
LIM_TIME = 7 # deletes what's after
MAX_ITEM = 50 # cache-only beyond
MAX_TIME = 7 # cache-only after (in sec)
DELAY = 10*60 # xml cache & ETag cache (in sec)
TIMEOUT = 2 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)

DEBUG = False
HOLD = False

UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'

MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}

FBAPPID = "<insert yours>"
FBSECRET = "<insert yours>"
FBAPPTOKEN = FBAPPID + '|' + FBSECRET

PROTOCOL = ['http', 'https', 'ftp']

if 'SCRIPT_NAME' in os.environ:
    httplib.HTTPConnection.debuglevel = 1

    import cgitb
    cgitb.enable()

class MorssException(Exception):
    pass

def log(txt, force=False):
    if DEBUG or force:
        if 'REQUEST_URI' in os.environ:
            open('morss.log', 'a').write("%s\n" % repr(txt))
        else:
            print repr(txt)


def lenHTML(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content())
    else:
        return 0

def countWord(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content().split())
    else:
        return 0

class ParseOptions:
    def __init__(self, environ=False):
        self.url = ''
        self.options = {}
        roptions = []

        if environ:
            if 'REQUEST_URI' in environ:
                self.url = environ['REQUEST_URI'][1:]
            else:
                self.url = environ['PATH_INFO'][1:]

            if self.url.startswith('/morss.py'):
                self.url = self.url[10:]
            elif self.url.startswith('morss.py'):
                self.url = self.url[9:]

            if self.url.startswith(':'):
                roptions = self.url.split('/')[0].split(':')[1:]
                self.url = self.url.split('/', 1)[1]
        else:
            if len(sys.argv) <= 1:
                return

            roptions = sys.argv[1:-1]
            self.url = sys.argv[-1]

        for option in roptions:
            split = option.split('=', 1)
            if len(split) > 1:
                if split[0].lower() == 'true':
                    self.options[split[0]] = True
                elif split[0].lower() == 'false':
                    self.options[split[0]] = False
                else:
                    self.options[split[0]] = split[1]
            else:
                self.options[split[0]] = True

    def __getattr__(self, key):
        if key in self.options:
            return self.options[key]
        else:
            return False

    def __contains__(self, key):
        return self.options.__contains__(key)

class Cache:
    """ Light, error-prone caching system. """
    def __init__(self, folder, key, persistent=False, dic=False):
        self._key = key
        self._dir = folder
        self._dic = dic

        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1
        self._hash = urllib.quote_plus(self._key)[:maxsize]

        self._file = self._dir + '/' + self._hash

        self._cached = {} # what *was* cached
        self._cache = {} # new things to put in cache

        if os.path.isfile(self._file):
            data = open(self._file).read()
            if data:
                self._cached = json.loads(data)

        if persistent:
            self._cache = self._cached

    def __del__(self):
        self.save()

    def __contains__(self, key):
        return key in self._cache or key in self._cached

    def get(self, key):
        if key in self._cache:
            return self._cache[key]
        elif key in self._cached:
            self._cache[key] = self._cached[key]
            return self._cached[key]
        else:
            if self._dic:
                self._cache[key] = {}
                return self._cache[key]
            else:
                return None

    def set(self, key, content):
        self._cache[key] = content

    __getitem__ = get
    __setitem__ = set

    def save(self):
        if len(self._cache) == 0:
            return

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        out = json.dumps(self._cache, indent=4)

        with open(self._file, 'w+') as file:
            file.write(out)

    def isYoungerThan(self, sec):
        if not os.path.exists(self._file):
            return False

        return time.time() - os.path.getmtime(self._file) < sec

    def new(self, key, persistent=False, dic=False):
        """ Returns a Cache object in the same directory """
        if key != self._key:
            return Cache(self._dir, key, persistent, dic)
        else:
            return self

    def redirect(self, key, persistent=False):
        return self.__init__(self._dir, key, persistent)

class SimpleDownload(urllib2.HTTPCookieProcessor):
    """
    Custom urllib2 handler to download a page, using etag/last-modified headers,
    to save bandwidth. The given headers are added back into the header on error
    304 for easier use.
    """
    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified
        self.useragent = useragent
        self.decode = decode
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        urllib2.HTTPCookieProcessor.http_request(self, req)
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        req.add_unredirected_header('User-Agent', self.useragent)
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())

        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        if self.accept is not None:
            # req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            out = {}
            rank = 1.1
            for group in self.accept:
                rank = rank - 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank-0.1

            string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        log('http cached')
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)
        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    def http_response(self, req, resp):
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        odata = data = resp.read()

        if 200 <= resp.code < 300:
            # gzip
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            # <meta> redirect
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    newurl = match.groups()[0]
                    log('redirect: %s' % newurl)

                    newheaders = dict((k,v) for k,v in req.headers.items()
                        if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(newurl,
                        headers=newheaders,
                        origin_req_host=req.get_origin_req_host(),
                        unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)

            # encoding
            enc = detEncoding(data, resp)

            if enc:
                data = data.decode(enc, 'replace')

                if not self.decode:
                    data = data.encode(enc)

        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request

def detEncoding(data, con=None):
    if con is not None and con.headers.getparam('charset'):
        log('header')
        return con.headers.getparam('charset')

    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        log('meta.re')
        return match.groups()[0]

    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None

def Fix(item, feedurl='/'):
    """ Improves feed items (absolute links, resolve feedburner links, etc) """

    # check unwanted uppercase title
    if len(item.title) > 20 and item.title.isupper():
        item.title = item.title.title()

    # check if it includes link
    if not item.link:
        log('no link')
        return item

    # check relative urls
    item.link = urlparse.urljoin(feedurl, item.link)

    # google
    if fnmatch(item.link, 'http://www.google.com/url?q=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
        log(item.link)

    # facebook
    if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
        log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    match = item.xval('feedburner:origLink')
    if match:
        item.link = match

    # feedsportal
    match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
    if match:
        url = match.groups()[0].split('0')
        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
        item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
        log(item.link)

    # reddit
    if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    return item

def Fill(item, cache, feedurl='/', fast=False):
    """ Returns True when it has done its best """

    if not item.link:
        log('no link')
        return item

    log(item.link)

    # content already provided?
    count_content = countWord(item.content)
    count_desc = countWord(item.desc)

    if max(count_content, count_desc) > 500:
        if count_desc > count_content:
            item.content = item.desc
            del item.desc
            log('reversed sizes')
        log('long enough')
        return True

    if count_content > 5*count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    link = item.link

    # twitter
    if urlparse.urlparse(feedurl).netloc == 'twitter.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
        if len(match):
            link = match[0]
            log(link)
        else:
            link = None

    # facebook
    if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@href')
        if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)
        else:
            link = None

    if link is None:
        log('no used link')
        return True

    # check cache and previous errors
    if link in cache:
        content = cache.get(link)
        match = re.search(r'^error-([a-z]{2,10})$', content)
        if match:
            if cache.isYoungerThan(DELAY):
                log('cached error: %s' % match.groups()[0])
                return True
            else:
                log('old error')
        else:
            log('cached')
            item.pushContent(cache.get(link))
            return True

    # super-fast mode
    if fast:
        log('skipped')
        return False

    # download
    try:
        url = link.encode('utf-8')
        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
        data = con.read()
    except (IOError, httplib.HTTPException) as e:
        log('http error: %s' % e.message)
        cache.set(link, 'error-http')
        return True

    if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
        log('non-text page')
        cache.set(link, 'error-type')
        return True

    out = readability.Document(data, url=con.url).summary(True)

    if countWord(out) > max(count_content, count_desc) > 0:
        item.pushContent(out)
        cache.set(link, out)
    else:
        log('not bigger enough')
        cache.set(link, 'error-length')
        return True

    return True

def Init(url, cachePath, options):
    # url clean up
    log(url)

    if url is None:
        raise MorssException('No url provided')

    if urlparse.urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        log(url)

    url = url.replace(' ', '%20')

    # cache
    cache = Cache(cachePath, url, options.proxy)
    log(cache._hash)

    return (url, cache)

def Fetch(url, cache, options):
    # do some useful facebook work
    feedify.PreWorker(url, cache)

    if 'redirect' in cache:
        url = cache.get('redirect')
        log('url redirect')
        log(url)

    if 'cache' in cache:
        cache.redirect(cache.get('cache'))
        log('cache redirect')

    # fetch feed
    if cache.isYoungerThan(DELAY) and not options.theforce and 'xml' in cache and 'style' in cache:
        log('xml cached')
        xml = cache.get('xml')
        style = cache.get('style')
    else:
        try:
            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
            xml = con.read()
        except (IOError, httplib.HTTPException):
            raise MorssException('Error downloading feed')

        cache.set('xml', xml)
        cache.set('etag', con.headers.getheader('etag'))
        cache.set('lastmodified', con.headers.getheader('last-modified'))

        if xml.startswith('<?xml') or con.info().type in MIMETYPE['xml']:
            style = 'normal'
        elif feedify.supported(url):
            style = 'feedify'
        elif con.info().type in MIMETYPE['html']:
            style = 'html'
        else:
            style = 'none'
            log(con.info().type)

        cache.set('style', style)

    log(style)

    if style == 'normal':
        rss = feeds.parse(xml)
    elif style == 'feedify':
        feed = feedify.Builder(url, xml, cache)
        feed.build()
        rss = feed.feed
    elif style == 'html':
        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
        if len(match):
            link = urlparse.urljoin(url, match[0])
            return Fetch(link, cachePath, options)
        else:
            log('no-link html')
            raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
    else:
        log('random page')
        raise MorssException('Link provided is not a valid feed')


    cache.save()
    return rss

def Gather(rss, url, cache, options):
    log('YEAH')

    size = len(rss.items)
    startTime = time.time()

    # custom settings
    global LIM_ITEM
    global LIM_TIME
    global MAX_ITEM
    global MAX_TIME

    if options.progress:
        MAX_TIME = -1
        LIM_TIME = 15
        MAX_ITEM = -1
        LIM_ITEM = -1
    if options.cache:
        MAX_TIME = 0
    if options.OFCOURSENOT:
        log('welcome home')
        LIM_ITEM = -1
        LIM_TIME = -1
        MAX_ITEM = -1
        MAX_TIME = -1

    # set
    def runner(queue):
        while True:
            value = queue.get()
            try:
                worker(*value)
            except Exception as e:
                log('Thread Error: %s' % e.message)
            queue.task_done()

    def worker(i, item):
        if time.time() - startTime > LIM_TIME >= 0 or i+1 > LIM_ITEM >= 0:
            log('dropped')
            item.remove()
            return

        item = Fix(item, url)

        if time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM >= 0:
            if not options.proxy:
                if Fill(item, cache, url, True) is False:
                    item.remove()
                    return
        else:
            if not options.proxy:
                Fill(item, cache, url)

        if 'al' in options:
            if i+1 > int(options.al):
                item.remove()
                return

        if item.desc and item.content:
            if options.clip:
                item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
                del item.desc
            if not options.keep:
                del item.desc

    queue = Queue.Queue()

    for i in range(THREADS):
        t = threading.Thread(target=runner, args=(queue,))
        t.daemon = True
        t.start()

    for i, item in enumerate(rss.items):
        queue.put([i, item])

    queue.join()
    cache.save()

    log(len(rss.items))
    log(time.time() - startTime)

    return rss

def cgi_app(environ, start_response):
    options = ParseOptions(environ)
    url = options.url
    headers = {}

    global DEBUG
    DEBUG = options.debug

    if 'HTTP_IF_NONE_MATCH' in environ:
        if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
            headers['status'] = '304 Not Modified'
            start_response(headers['status'], headers.items())
            log(url)
            log('etag good')
            return []

    headers['status'] = '200 OK'
    headers['etag'] = '"%s"' % int(time.time())

    if options.html:
        headers['content-type'] = 'text/html'
    elif options.debug or options.txt:
        headers['content-type'] = 'text/plain'
    elif options.json:
        headers['content-type'] = 'application/json'
    else:
        headers['content-type'] = 'text/xml'

    url, cache = Init(url, os.getcwd() + '/cache', options)
    RSS = Fetch(url, cache, options)
    RSS = Gather(RSS, url, cache, options)

    if headers['content-type'] == 'text/xml':
        headers['content-type'] = RSS.mimetype

    start_response(headers['status'], headers.items())

    if not DEBUG and not options.silent:
        if options.json:
            if options.indent:
                return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
            else:
                return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
        else:
            return RSS.tostring(xml_declaration=True, encoding='UTF-8')

    log('done')

def cgi_wrapper(environ, start_response):
    try:
        return cgi_app(environ, start_response)
    except (KeyboardInterrupt, SystemExit):
        raise
    except MorssException as e:
        headers = {}
        headers['status'] = '500 Oops'
        headers['content-type'] = 'text/plain'
        start_response(headers['status'], headers.items(), sys.exc_info())
        return 'Internal Error: %s' % e.message
    except Exception as e:
        headers = {}
        headers['status'] = '500 Oops'
        headers['content-type'] = 'text/plain'
        start_response(headers['status'], headers.items(), sys.exc_info())
        return 'Unknown Error: %s' % e.message

def cli_app():
    options = ParseOptions()
    url = options.url

    global DEBUG
    DEBUG = options.debug

    url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
    RSS = Fetch(url, cache, options)
    RSS = Gather(RSS, url, cache, options)

    if not DEBUG and not options.silent:
        if options.json:
            if options.indent:
                print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
            else:
                print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
        else:
            print RSS.tostring(xml_declaration=True, encoding='UTF-8')

    log('done')

    if options.facebook:
        facebook = Cache(cachePath, 'facebook', persistent=True, dic=True)

        # get real token from code
        code = urlparse.parse_qs(urlparse.urlparse(url).query)['code'][0]
        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri="http://test.morss.it/:facebook/")
        token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]

        # get long-lived access token
        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
        values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())

        ltoken = values['access_token'][0]
        expires = int(time.time() + int(values['expires'][0]))

        # get user id
        iurl = "https://graph.facebook.com/me?fields=id&access_token={token}".format(ltoken)
        user_id = json.loads(urllib2.urlopen(iurl).read())['id']

        # do sth out of it
        if user_id not in facebook['user']:
            facebook['user'][user_id] = {'original': ltoken}

        facebook['token'][ltoken] = {'user': user_id, 'expires': expires}
        facebook['user'][user_id]['token'] = ltoken

        facebook.save()

        if 'REQUEST_URI' in os.environ:
            print 'Status: 200'
            print 'Content-Type: text/plain'
            print ''

        print "token updated"

        sys.exit(0)

def main():
    if 'REQUEST_URI' in os.environ:
        wsgiref.handlers.CGIHandler().run(cgi_wrapper)

    elif len(sys.argv) <= 1:
        httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
        httpd.serve_forever()

    else:
        try:
            cli_app()
        except (KeyboardInterrupt, SystemExit):
            raise
        except MorssException as e:
            print 'Internal Error: %s' % e.message
        except Exception as e:
            print 'Unknown Error: %s' % e.message

if __name__ == '__main__':
    main()
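(For orientation, the module above can be run as a CGI/WSGI app or from the command line, e.g. "python morss.py http://example.org/feed.xml"; a rough programmatic sketch of the same pipeline follows, with a placeholder URL and cache path.)

# Sketch: driving morss.py programmatically, mirroring cli_app()
import morss

options = morss.ParseOptions()  # empty options; unknown flags default to False
url, cache = morss.Init('http://example.org/feed.xml', '/tmp/morss-cache', options)
rss = morss.Fetch(url, cache, options)
rss = morss.Gather(rss, url, cache, options)
print rss.tostring(xml_declaration=True, encoding='UTF-8')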