morss/morss.py

800 lines
20 KiB
Python
Raw Normal View History

2013-04-04 15:56:37 +00:00
#!/usr/bin/env python
2013-02-25 14:50:32 +00:00
import sys
2013-02-25 17:01:59 +00:00
import os
import os.path
import time
2013-04-22 22:04:44 +00:00
import Queue
import threading
2013-10-01 17:49:53 +00:00
from fnmatch import fnmatch
2013-04-22 22:04:44 +00:00
from base64 import b64encode, b64decode
import re
import string
import json
2013-04-22 22:04:44 +00:00
import lxml.html
import lxml.html.clean
import lxml.builder
2013-04-22 22:04:44 +00:00
import feeds
2013-09-25 10:36:21 +00:00
import feedify
import httplib
import urllib
2013-02-25 14:50:32 +00:00
import urllib2
import chardet
import urlparse
import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
from gzip import GzipFile
from StringIO import StringIO
from readability import readability
# --- item/time budgets used while gathering a feed ---
LIM_ITEM = 100   # deletes what's beyond
LIM_TIME = 7     # deletes what's after
MAX_ITEM = 50    # cache-only beyond
MAX_TIME = 7     # cache-only after (in sec)

# --- network / caching ---
DELAY = 10 * 60  # xml cache & ETag cache (in sec)
TIMEOUT = 2      # http timeout (in sec)
THREADS = 10     # number of threads (1 for single-threaded)

DEBUG = False
HOLD = False

# User-Agent strings sent when downloading
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'

# named groups of mime types, usable as shortcuts in Accept headers
MIMETYPE = {
    'xml': [
        'text/xml',
        'application/xml',
        'application/rss+xml',
        'application/rdf+xml',
        'application/atom+xml',
    ],
    'html': [
        'text/html',
        'application/xhtml+xml',
        'application/xml',
    ],
}

# facebook application credentials
FBAPPID = "<insert yours>"
FBSECRET = "<insert yours>"
FBAPPTOKEN = '|'.join((FBAPPID, FBSECRET))

# url schemes accepted as-is; anything else gets 'http://' prepended
PROTOCOL = ['http', 'https', 'ftp']

# when a SCRIPT_NAME is present in the environment, turn on verbose
# http debugging and html tracebacks
if 'SCRIPT_NAME' in os.environ:
    httplib.HTTPConnection.debuglevel = 1

    import cgitb
    cgitb.enable()
2013-02-25 17:01:59 +00:00
class MorssException(Exception):
    """ Error raised by morss itself, as opposed to an unexpected failure. """
def log(txt, force=False):
    """
    Writes the repr() of `txt` to the debug output.

    Nothing is written unless the module-level DEBUG flag is set or `force`
    is true. When 'REQUEST_URI' is in the process environment the line is
    appended to 'morss.log' in the current directory, otherwise it is printed
    to stdout.
    """
    if not (force or DEBUG):
        return

    line = repr(txt)

    if 'REQUEST_URI' in os.environ:
        # use a context manager so the log file is closed after each message
        with open('morss.log', 'a') as logfile:
            logfile.write("%s\n" % line)
    else:
        print(line)
def lenHTML(txt):
    """
    Returns the number of characters of text contained in the HTML `txt`
    (markup excluded). Returns 0 for None, empty or whitespace-only input.
    """
    # lxml.html.fromstring() rejects an empty document, so anything blank is
    # answered here without parsing
    if not txt or not txt.strip():
        return 0

    return len(lxml.html.fromstring(txt).text_content())
def countWord(txt):
    """
    Returns the number of whitespace-separated words of text contained in the
    HTML `txt` (markup excluded). Returns 0 for None, empty or whitespace-only
    input.
    """
    # lxml.html.fromstring() rejects an empty document, so anything blank is
    # answered here without parsing
    if not txt or not txt.strip():
        return 0

    return len(lxml.html.fromstring(txt).text_content().split())
class ParseOptions:
    """
    Extracts the feed url and the options, either from a WSGI/CGI `environ`
    or, when no environ is given, from sys.argv.

    From an environ, the request looks like '/:opt1:opt2=value/<url>' (an
    optional leading '/morss.py' is stripped). On the command line, every
    argument but the last one is an option and the last one is the url.

    Options end up in `self.options`: a bare 'name' is stored as True,
    'name=true' / 'name=false' (any case) as booleans, and any other
    'name=value' as the string value. Options can also be read as attributes;
    unknown options read as False.
    """

    def __init__(self, environ=False):
        self.url = ''
        self.options = {}
        roptions = []

        if environ:
            if 'REQUEST_URI' in environ:
                self.url = environ['REQUEST_URI'][1:]
            else:
                self.url = environ['PATH_INFO'][1:]

            # drop the script name and the slash that follows it
            if self.url.startswith('/morss.py'):
                self.url = self.url[10:]
            elif self.url.startswith('morss.py'):
                self.url = self.url[9:]

            if self.url.startswith(':'):
                # partition() tolerates a request holding only options (no
                # '/'), which then simply yields an empty url
                head, _, rest = self.url.partition('/')
                roptions = head.split(':')[1:]
                self.url = rest
        else:
            if len(sys.argv) <= 1:
                return

            roptions = sys.argv[1:-1]
            self.url = sys.argv[-1]

        for option in roptions:
            split = option.split('=', 1)

            if len(split) > 1:
                name, value = split
                # the boolean keywords are looked for in the *value*
                if value.lower() == 'true':
                    self.options[name] = True
                elif value.lower() == 'false':
                    self.options[name] = False
                else:
                    self.options[name] = value
            else:
                self.options[split[0]] = True

    def __getattr__(self, key):
        # only called for names which are not real attributes
        if key in self.options:
            return self.options[key]
        else:
            return False

    def __contains__(self, key):
        return self.options.__contains__(key)
class Cache:
    """
    Light, error-prone caching system.

    Each cache is a JSON file named after the url-quoted `key`, stored in
    `folder`. Two dicts are kept: `_cached` holds what was read from disk and
    `_cache` holds what will be written back by save(). Only entries which
    were set or read during this run are written back, unless the cache is
    `persistent`, in which case both dicts are the same object.

    With `dic` set, reading a missing key creates, stores and returns an
    empty dict instead of returning None.
    """

    def __init__(self, folder, key, persistent=False, dic=False):
        self._key = key
        self._dir = folder
        self._dic = dic

        # keep the file name within the limit reported by the file system
        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1
        self._hash = urllib.quote_plus(self._key)[:maxsize]

        self._file = self._dir + '/' + self._hash

        self._cached = {}  # what *was* cached
        self._cache = {}  # new things to put in cache

        if os.path.isfile(self._file):
            # context manager: do not leave the file handle open
            with open(self._file) as handle:
                data = handle.read()
            if data:
                self._cached = json.loads(data)

        if persistent:
            self._cache = self._cached

    def __del__(self):
        self.save()

    def __contains__(self, key):
        return key in self._cache or key in self._cached

    def get(self, key):
        if key in self._cache:
            return self._cache[key]
        elif key in self._cached:
            # reading an old entry marks it as worth keeping
            self._cache[key] = self._cached[key]
            return self._cached[key]
        else:
            if self._dic:
                self._cache[key] = {}
                return self._cache[key]
            else:
                return None

    def set(self, key, content):
        self._cache[key] = content

    __getitem__ = get
    __setitem__ = set

    def save(self):
        """ Writes the entries to keep to disk; does nothing if there are none """
        if len(self._cache) == 0:
            return

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        out = json.dumps(self._cache, indent=4)

        with open(self._file, 'w+') as file:
            file.write(out)

    def isYoungerThan(self, sec):
        """ True when the cache file exists and was modified less than `sec` seconds ago """
        if not os.path.exists(self._file):
            return False

        return time.time() - os.path.getmtime(self._file) < sec

    def new(self, key, persistent=False, dic=False):
        """ Returns a Cache object in the same directory """
        if key != self._key:
            return Cache(self._dir, key, persistent, dic)
        else:
            return self

    def redirect(self, key, persistent=False):
        """ Re-initialises this very object for another key (`dic` is reset to False) """
        return self.__init__(self._dir, key, persistent)
class SimpleDownload(urllib2.HTTPCookieProcessor):
    """
    Custom urllib2 handler to download a page, using etag/last-modified headers,
    to save bandwidth. The given headers are added back into the header on error
    304 for easier use.

    cache: body of a previous download, served again on a 304 answer. The
        conditional headers are only sent when it is non-empty
    etag, lastmodified: validators of that previous download
    useragent: value of the User-Agent header
    decode: when true, text bodies are returned decoded (unicode); otherwise
        they are re-encoded with the detected charset
    cookiejar: passed as-is to urllib2.HTTPCookieProcessor
    accept: mime type, MIMETYPE group name, or a sequence of those (or of
        sequences of mime types), by decreasing order of preference
    strict: when false, '*/*' is appended to the Accept header with the lowest
        priority
    """

    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified
        self.useragent = useragent
        self.decode = decode
        self.accept = accept
        self.strict = strict

    def _accept_header(self):
        """ Builds the value of the Accept header out of self.accept / self.strict """
        # work on a local copy, so that self.accept is left as the caller gave it
        groups = self.accept
        if isinstance(groups, basestring):
            groups = (groups,)

        out = {}
        rank = 1.1
        for group in groups:
            # each group is worth 0.1 less than the previous one
            rank = rank - 0.1

            if isinstance(group, basestring):
                if group in MIMETYPE:
                    group = MIMETYPE[group]
                else:
                    out[group] = rank
                    continue

            for mime in group:
                # first (ie. best) rank wins
                if mime not in out:
                    out[mime] = rank

        if not self.strict:
            out['*/*'] = rank - 0.1

        # q=1 is implicit, so it is left out
        return ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])

    def http_request(self, req):
        urllib2.HTTPCookieProcessor.http_request(self, req)
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        req.add_unredirected_header('User-Agent', self.useragent)
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())

        # conditional request, only if there is something to serve on 304
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        if self.accept is not None:
            req.add_unredirected_header('Accept', self._accept_header())

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        # not modified: answer with the cached body, as a regular 200
        log('http cached')
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)
        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    def http_response(self, req, resp):
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        data = resp.read()
        success = 200 <= resp.code < 300

        if success:
            # gzip
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if success and resp.info().maintype == 'text':
            # <meta> redirect
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    newurl = match.groups()[0]
                    log('redirect: %s' % newurl)

                    newheaders = dict((k, v) for k, v in req.headers.items()
                        if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(newurl,
                        headers=newheaders,
                        origin_req_host=req.get_origin_req_host(),
                        unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)

            # encoding
            enc = detEncoding(data, resp)

            if enc:
                try:
                    data = data.decode(enc, 'replace')
                except LookupError:
                    # the announced charset is unknown to python: hand the
                    # raw bytes over rather than failing the whole download
                    log('unknown encoding: %s' % enc)
                else:
                    if not self.decode:
                        # 'replace' as the replacement characters introduced
                        # above may not exist in the target charset
                        data = data.encode(enc, 'replace')

        # the original body was consumed by read(), build a fresh response
        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request
def detEncoding(data, con=None):
    """
    Guesses the charset of `data`. Looks, in this order, at the 'charset'
    parameter of the headers of `con` (if given), at a 'charset=' declaration
    within the first 1000 characters, and at an 'encoding=' declaration
    within the first 100 characters (that last one is returned lowercased).
    Returns None when nothing is found.
    """
    from_header = con.headers.getparam('charset') if con is not None else None
    if from_header:
        log('header')
        return from_header

    found = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if found is not None:
        log('meta.re')
        return found.group(1)

    found = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if found is None:
        return None

    return found.group(1).lower()
def Fix(item, feedurl='/'):
    """ Cleans up a feed item (title case, absolute link, tracking redirects undone) and returns it """

    # check unwanted uppercase title
    title = item.title
    if len(title) > 20 and title.isupper():
        item.title = title.title()

    # check if it includes link
    if not item.link:
        log('no link')
        return item

    # check relative urls
    item.link = urlparse.urljoin(feedurl, item.link)

    # google & facebook: the real link is a parameter of the redirect url
    for pattern, param in (('http://www.google.com/url?q=*', 'q'),
                           ('https://www.facebook.com/l.php?u=*', 'u')):
        if fnmatch(item.link, pattern):
            query = urlparse.urlparse(item.link).query
            item.link = urlparse.parse_qs(query)[param][0]
            log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    original = item.xval('feedburner:origLink')
    if original:
        item.link = original

    # feedsportal: the link is made of '0'-separated chunks, whose first
    # letter is a code for a piece of url
    portal = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
    if portal:
        chunks = portal.group(1).split('0')
        codes = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
        item.link = ''.join([codes.get(chunk[0], '=') + chunk[1:] for chunk in chunks[1:]])
        log(item.link)

    # reddit: the link is the "[link]" anchor of the description
    if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
        hrefs = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(hrefs):
            item.link = hrefs[0]
            log(item.link)

    return item
def Fill(item, cache, feedurl='/', fast=False):
    """
    Tries to replace the content of `item` with the full article.

    Returns True when it has done its best (content kept as is, taken from
    `cache`, or downloaded), and False only in `fast` mode when the article
    is not in cache and would have had to be downloaded.

    Downloads and their failures are recorded in `cache` under the link;
    failures are stored as 'error-<reason>' strings.
    """

    if not item.link:
        log('no link')
        # nothing more can be done: that is a "done its best" case
        return True

    log(item.link)

    # content already provided?
    count_content = countWord(item.content)
    count_desc = countWord(item.desc)

    if max(count_content, count_desc) > 500:
        if count_desc > count_content:
            item.content = item.desc
            del item.desc
            log('reversed sizes')
        log('long enough')
        return True

    if count_content > 5*count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    link = item.link

    # twitter: only follow the link embedded in the tweet, if any
    if urlparse.urlparse(feedurl).netloc == 'twitter.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
        if len(match):
            link = match[0]
            log(link)
        else:
            link = None

    # facebook: only follow links leaving facebook
    if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@href')
        if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)
        else:
            link = None

    if link is None:
        log('no used link')
        return True

    # check cache and previous errors
    if link in cache:
        content = cache.get(link)
        match = re.search(r'^error-([a-z]{2,10})$', content)
        if match:
            if cache.isYoungerThan(DELAY):
                log('cached error: %s' % match.groups()[0])
                return True
            else:
                # old errors are given another chance below
                log('old error')
        else:
            log('cached')
            item.pushContent(cache.get(link))
            return True

    # super-fast mode
    if fast:
        log('skipped')
        return False

    # download
    try:
        url = link.encode('utf-8')
        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
        data = con.read()
    except (IOError, httplib.HTTPException) as e:
        log('http error: %s' % e.message)
        cache.set(link, 'error-http')
        return True

    if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
        log('non-text page')
        cache.set(link, 'error-type')
        return True

    out = readability.Document(data, url=con.url).summary(True)

    # only use the article if it beats what the feed (non-empty) provided
    if countWord(out) > max(count_content, count_desc) > 0:
        item.pushContent(out)
        cache.set(link, out)
    else:
        log('not bigger enough')
        cache.set(link, 'error-length')
        return True

    return True
def Init(url, cachePath, options):
    """
    Cleans up `url` and opens the matching cache in `cachePath`.

    'http://' is prepended when the scheme is not one of PROTOCOL, and spaces
    are percent-encoded. The cache is persistent when `options.proxy` is set.

    Returns the (url, cache) tuple.
    Raises MorssException when no url is given (None or empty).
    """
    # url clean up
    log(url)

    # ParseOptions yields '' (not None) when there is no url
    if not url:
        raise MorssException('No url provided')

    if urlparse.urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        log(url)

    url = url.replace(' ', '%20')

    # cache
    cache = Cache(cachePath, url, options.proxy)
    log(cache._hash)

    return (url, cache)
def Fetch(url, cache, options):
    """
    Downloads (or takes from `cache`) the document at `url` and returns it as
    a parsed feed.

    The cached copy is used when the cache file is younger than DELAY and
    holds both 'xml' and 'style', unless `options.theforce` is set. If the
    document is an HTML page advertising a feed with a <link rel="alternate">
    tag, that feed is fetched instead, with its own cache.

    Raises MorssException when the download fails or when no feed can be
    found.
    """
    # do some useful facebook work
    feedify.PreWorker(url, cache)

    if 'redirect' in cache:
        url = cache.get('redirect')
        log('url redirect')
        log(url)

    if 'cache' in cache:
        cache.redirect(cache.get('cache'))
        log('cache redirect')

    # fetch feed
    if cache.isYoungerThan(DELAY) and not options.theforce and 'xml' in cache and 'style' in cache:
        log('xml cached')
        xml = cache.get('xml')
        style = cache.get('style')
    else:
        try:
            # NOTE(review): nothing here stores the body under the `url` key
            # (it goes under 'xml'), so unless feedify.PreWorker does, the
            # etag/last-modified headers are never sent -- to be confirmed
            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
            xml = con.read()
        except (IOError, httplib.HTTPException):
            raise MorssException('Error downloading feed')

        cache.set('xml', xml)
        cache.set('etag', con.headers.getheader('etag'))
        cache.set('lastmodified', con.headers.getheader('last-modified'))

        if xml.startswith('<?xml') or con.info().type in MIMETYPE['xml']:
            style = 'normal'
        elif feedify.supported(url):
            style = 'feedify'
        elif con.info().type in MIMETYPE['html']:
            style = 'html'
        else:
            style = 'none'
            log(con.info().type)

        cache.set('style', style)

    log(style)

    if style == 'normal':
        rss = feeds.parse(xml)
    elif style == 'feedify':
        feed = feedify.Builder(url, xml, cache)
        feed.build()
        rss = feed.feed
    elif style == 'html':
        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
        if len(match):
            link = urlparse.urljoin(url, match[0])
            # the advertised feed gets its own cache, in the same directory
            return Fetch(link, cache.new(link), options)
        else:
            log('no-link html')
            raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
    else:
        log('random page')
        raise MorssException('Link provided is not a valid feed')

    cache.save()
    return rss
def Gather(rss, url, cache, options):
    """
    Fixes and fills every item of the feed `rss`, using THREADS worker
    threads, and returns the same `rss` object.

    Items beyond the LIM_ITEM-th, or reached after LIM_TIME seconds, are
    removed. Items beyond the MAX_ITEM-th, or reached after MAX_TIME seconds,
    are only filled from `cache` (and removed when not in there). A negative
    limit disables it. Those limits are tweaked, for this call only, by
    `options.progress`, `options.cache` and `options.OFCOURSENOT`.

    Other options read: `proxy` (items are not filled), `al` (max number of
    items kept), `clip` (description prepended to the content), `keep`
    (description kept besides the content).
    """
    log('YEAH')

    startTime = time.time()

    # custom settings: local copies, so that the options of one request do
    # not leak into the next ones when running as a server
    lim_item = LIM_ITEM
    lim_time = LIM_TIME
    max_item = MAX_ITEM
    max_time = MAX_TIME

    if options.progress:
        max_time = -1
        lim_time = 15
        max_item = -1
        lim_item = -1
    if options.cache:
        max_time = 0
    if options.OFCOURSENOT:
        log('welcome home')
        lim_item = -1
        lim_time = -1
        max_item = -1
        max_time = -1

    # set
    def runner(queue):
        while True:
            value = queue.get()
            try:
                worker(*value)
            except Exception as e:
                log('Thread Error: %s' % e.message)
            finally:
                # always acknowledge the job, or queue.join() hangs forever
                queue.task_done()

    def worker(i, item):
        if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
            log('dropped')
            item.remove()
            return

        item = Fix(item, url)

        if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
            # over budget: cache only
            if not options.proxy:
                if Fill(item, cache, url, True) is False:
                    item.remove()
                    return
        else:
            if not options.proxy:
                Fill(item, cache, url)

        if 'al' in options:
            if i+1 > int(options.al):
                item.remove()
                return

        if item.desc and item.content:
            if options.clip:
                item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
            # the description goes away once clipped, or when not asked to
            # be kept; it is deleted only once
            if options.clip or not options.keep:
                del item.desc

    queue = Queue.Queue()

    for i in range(THREADS):
        t = threading.Thread(target=runner, args=(queue,))
        t.daemon = True
        t.start()

    for i, item in enumerate(rss.items):
        queue.put([i, item])

    queue.join()
    cache.save()

    log(len(rss.items))
    log(time.time() - startTime)

    return rss
def cgi_app(environ, start_response):
    """
    WSGI application: parses the request, fetches and fills the feed, and
    returns it as the response body (a list of at most one string).

    The ETag sent out is the time of generation; a request coming back with
    an ETag younger than DELAY is answered with '304 Not Modified', unless
    the `force` or `facebook` option is set. Nothing is output in debug or
    `silent` mode.

    Errors are not handled here (see cgi_wrapper).
    """
    options = ParseOptions(environ)
    url = options.url
    headers = {}

    global DEBUG
    DEBUG = options.debug

    if 'HTTP_IF_NONE_MATCH' in environ:
        try:
            etag_time = int(environ['HTTP_IF_NONE_MATCH'][1:-1])
        except ValueError:
            # not one of our own ETags (eg. weak or foreign validator):
            # just serve the feed
            etag_time = None

        if etag_time is not None and not options.force and not options.facebook and time.time() - etag_time < DELAY:
            headers['status'] = '304 Not Modified'
            start_response(headers['status'], headers.items())
            log(url)
            log('etag good')
            return []

    headers['status'] = '200 OK'
    headers['etag'] = '"%s"' % int(time.time())

    if options.html:
        headers['content-type'] = 'text/html'
    elif options.debug or options.txt:
        headers['content-type'] = 'text/plain'
    elif options.json:
        headers['content-type'] = 'application/json'
    else:
        headers['content-type'] = 'text/xml'

    url, cache = Init(url, os.getcwd() + '/cache', options)
    RSS = Fetch(url, cache, options)
    RSS = Gather(RSS, url, cache, options)

    # be more specific than the default, now that the feed type is known
    if headers['content-type'] == 'text/xml':
        headers['content-type'] = RSS.mimetype

    start_response(headers['status'], headers.items())

    if DEBUG or options.silent:
        log('done')
        # a WSGI application must return an iterable, even if empty
        return []

    if options.json:
        body = json.dumps(RSS, sort_keys=True, indent=4 if options.indent else None, default=lambda x: dict(x))
    else:
        body = RSS.tostring(xml_declaration=True, encoding='UTF-8')

    # wrapped in a list, as a bare string gets sent one character at a time
    return [body]
def cgi_wrapper(environ, start_response):
    """
    WSGI application wrapping cgi_app, turning any error into a plain-text
    '500 Oops' response. KeyboardInterrupt and SystemExit are let through.
    """
    try:
        return cgi_app(environ, start_response)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        # errors raised on purpose are told apart from the unexpected ones
        if isinstance(e, MorssException):
            template = 'Internal Error: %s'
        else:
            template = 'Unknown Error: %s'

        headers = {}
        headers['status'] = '500 Oops'
        headers['content-type'] = 'text/plain'
        start_response(headers['status'], headers.items(), sys.exc_info())
        return template % e.message
def cli_app():
    """
    Command-line entry point: reads options and url from sys.argv, then
    prints the filled feed (as json with the `json` option, as xml
    otherwise), unless in debug or `silent` mode. The cache is kept in
    '~/.cache/morss'.

    With the `facebook` option, the 'code' parameter of the url is then
    exchanged for a long-lived facebook access token, which is stored in the
    'facebook' cache, and the process exits.
    """
    options = ParseOptions()
    url = options.url

    global DEBUG
    DEBUG = options.debug

    cache_path = os.path.expanduser('~/.cache/morss')

    url, cache = Init(url, cache_path, options)
    RSS = Fetch(url, cache, options)
    RSS = Gather(RSS, url, cache, options)

    if not DEBUG and not options.silent:
        if options.json:
            print(json.dumps(RSS, sort_keys=True, indent=4 if options.indent else None, default=lambda x: dict(x)))
        else:
            print(RSS.tostring(xml_declaration=True, encoding='UTF-8'))

    log('done')

    if options.facebook:
        # kept in the same directory as the feed caches
        facebook = Cache(cache_path, 'facebook', persistent=True, dic=True)

        # get real token from code
        code = urlparse.parse_qs(urlparse.urlparse(url).query)['code'][0]
        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri="http://test.morss.it/:facebook/")
        token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]

        # get long-lived access token
        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
        values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())

        ltoken = values['access_token'][0]
        expires = int(time.time() + int(values['expires'][0]))

        # get user id
        # '{token}' is a named field, so it has to be given by keyword
        iurl = "https://graph.facebook.com/me?fields=id&access_token={token}".format(token=ltoken)
        user_id = json.loads(urllib2.urlopen(iurl).read())['id']

        # do sth out of it
        if user_id not in facebook['user']:
            facebook['user'][user_id] = {'original': ltoken}

        facebook['token'][ltoken] = {'user': user_id, 'expires': expires}
        facebook['user'][user_id]['token'] = ltoken

        facebook.save()

        if 'REQUEST_URI' in os.environ:
            print('Status: 200')
            print('Content-Type: text/plain')
            print('')
            print("token updated")

        sys.exit(0)
def main():
    """
    Picks the way to run: as a CGI script when 'REQUEST_URI' is in the
    environment, as a standalone http server on port 8080 when called without
    argument, and as a command-line tool otherwise.
    """
    if 'REQUEST_URI' in os.environ:
        wsgiref.handlers.CGIHandler().run(cgi_wrapper)
        return

    if len(sys.argv) <= 1:
        httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
        httpd.serve_forever()
        return

    try:
        cli_app()
    except (KeyboardInterrupt, SystemExit):
        raise
    except MorssException as e:
        print('Internal Error: %s' % e.message)
    except Exception as e:
        print('Unknown Error: %s' % e.message)
# only run when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()