morss/morss.py

394 lines
9.2 KiB
Python
Raw Normal View History

2013-04-04 15:56:37 +00:00
#!/usr/bin/env python
2013-02-25 14:50:32 +00:00
import sys
2013-02-25 17:01:59 +00:00
import os
import os.path
import time
2013-04-22 22:04:44 +00:00
from base64 import b64encode, b64decode
import re
import string
import lxml.html
import lxml.html.clean
import lxml.builder
2013-04-22 22:04:44 +00:00
import feeds
2013-02-25 14:50:32 +00:00
import urllib2
import socket
from cookielib import CookieJar
import chardet
import urlparse
from readability import readability
# Tuning knobs read by Gather(), Fill() and EncDownload().
LIM_ITEM = 100 # items past this position are removed from the output feed (0 disables the limit)
MAX_ITEM = 50 # items past this position are filled from the cache only, never downloaded (0 disables the limit)
MAX_TIME = 7 # seconds of processing after which remaining items are filled from the cache only (negative disables)
DELAY = 10 # minutes during which the cached feed xml and cached errors are still trusted
TIMEOUT = 2 # seconds, http timeout used when downloading article pages
OPTIONS = ['progress', 'cache'] # option names parseOptions() accepts in front of the url
2013-05-01 15:54:17 +00:00
# User-Agent sent when fetching the feed itself (see Gather).
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
# User-Agent sent when downloading article pages (see EncDownload).
UA_HML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
# URL schemes accepted as-is; parseOptions() prefixes anything else with "http://".
PROTOCOL = ['http', 'https', 'ftp']
# Short item field name -> pair of qualified element names. The first entry
# of each pair lives in the Atom namespace, the second is an un-namespaced
# ("{}") or content-module element.
# NOTE(review): not referenced anywhere in the visible code of this file --
# presumably superseded by the `feeds` module; confirm before removing.
ITEM_MAP = {
'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
'desc': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
'description': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
'summary': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
'content': ('{http://www.w3.org/2005/Atom}content', '{http://purl.org/rss/1.0/modules/content/}encoded')
}
# Same idea as ITEM_MAP, for feed-level fields.
# NOTE(review): also unreferenced in the visible code -- confirm usage.
RSS_MAP = {
'desc': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
'description': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
'subtitle': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
'item': ('{http://www.w3.org/2005/Atom}entry', '{}item'),
'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item')
}
# Served through CGI (REQUEST_URI is set in the environment): switch on
# httplib's debug output and cgitb's traceback reports.
if 'REQUEST_URI' in os.environ:
    import cgitb
    import httplib

    cgitb.enable()
    httplib.HTTPConnection.debuglevel = 1
2013-02-25 17:01:59 +00:00
def log(txt):
    """Record the repr() of *txt* for debugging.

    Under CGI (REQUEST_URI is set) the entry is appended to ``morss.log``.
    On the command line it is printed, and only when the DEBUG environment
    variable holds a non-empty value.
    """
    if 'REQUEST_URI' in os.environ:
        with open('morss.log', 'a') as logfile:
            logfile.write(repr(txt).encode('utf-8') + "\n")
        return

    if os.getenv('DEBUG', False):
        print(repr(txt))
# Control characters that XML 1.0 does not allow in a document: everything
# below U+0020 except tab (0x09), line feed (0x0A) and carriage return (0x0D).
_XML_FORBIDDEN_CTRL = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def cleanXML(xml):
    """Return *xml* without forbidden control characters or leading whitespace.

    Tabs and line breaks are legal XML whitespace and are kept: deleting
    them glues together words (and attributes) that were only separated by
    a line break. Leading whitespace is stripped so that the document
    starts right at its first markup character.
    """
    return _XML_FORBIDDEN_CTRL.sub('', xml).lstrip()
def lenHTML(txt):
    """Return the number of characters of text contained in the HTML *txt*.

    An empty *txt* counts as 0 without being handed to the parser.
    """
    if len(txt) == 0:
        return 0

    document = lxml.html.fromstring(txt)
    return len(document.text_content())
def countWord(txt):
    """Return the number of whitespace-separated words in the HTML *txt*.

    Markup is ignored: only the text content is counted. An empty *txt*
    counts as 0 without being handed to the parser.
    """
    if len(txt) == 0:
        return 0

    text = lxml.html.fromstring(txt).text_content()
    return len(text.split())
def makeDesc(txt, length, suffix='...'):
    """Build a short description from the first words of *txt*.

    Returns the first *length* whitespace-separated words of *txt*, joined
    by single spaces and followed by *suffix*.

    The original computed this string but never returned it, so callers
    always received None.
    """
    words = txt.split()[:length]
    return ' '.join(words) + suffix
def setContent(item, txt):
    """Store *txt* as the content of *item*, keeping a description around.

    When the item has no description yet, one is derived first: the old
    content is moved into the description if there is any, otherwise a
    30-word excerpt of *txt* is used. An existing description is left
    untouched.
    """
    if not item.desc:
        if item.content:
            log('content alone')
            item.desc = item.content
        else:
            log('empty')
            item.desc = makeDesc(txt, 30)

    item.content = txt
2013-05-01 15:57:09 +00:00
def parseOptions(available):
    """Work out the feed url and the optional mode of this run.

    Under CGI (REQUEST_URI is set) both are taken from the request path,
    which looks like ``[option/]url``. On the command line they are taken
    from ``sys.argv``: either ``url`` alone or ``option url``.

    available -- list of option names that may precede the url.

    Returns ``(url, options)``. ``options`` is None when no known option
    was given. ``(None, None)`` is returned when the command line does not
    have one or two arguments. A url whose scheme is not listed in
    PROTOCOL is prefixed with ``http://``.
    """
    options = None

    if 'REQUEST_URI' in os.environ:
        request = os.environ['REQUEST_URI']

        if 'REDIRECT_URL' in os.environ:
            # only the leading "/" has to go
            url = request[1:]
        else:
            # skip the script name and the "/" that follows it
            url = request[len(os.environ['SCRIPT_NAME'])+1:]

        if urlparse.urlparse(url).scheme not in PROTOCOL:
            split = url.split('/', 1)
            # The first segment is an option only when something follows
            # it; a lone "progress" used to raise IndexError on split[1].
            if len(split) == 2 and split[0] in available:
                options, url = split
            url = "http://" + url
    else:
        argc = len(sys.argv)

        if argc == 3:
            # an unknown option is silently ignored
            if sys.argv[1] in available:
                options = sys.argv[1]
            url = sys.argv[2]
        elif argc == 2:
            url = sys.argv[1]
        else:
            return (None, None)

        if urlparse.urlparse(url).scheme not in PROTOCOL:
            url = "http://" + url

    return (url, options)
class Cache:
    """Light, error-prone caching system.

    One cache is one text file, named after hash(key) and stored in
    *folder*. Each line of the file is ``<entry key>\\t<base64 data>``.

    Only the entries that were read with get() or written with set() during
    the lifetime of the object are written back by save(); the other
    entries previously in the file are dropped at that point. Nothing is
    written when no entry was touched.
    """

    def __init__(self, folder, key):
        self._key = key
        self._hash = str(hash(self._key))
        self._dir = folder
        self._file = self._dir + "/" + self._hash

        self._cached = {} # what *was* cached
        self._cache = {} # new things to put in cache

        if os.path.isfile(self._file):
            # `with` closes the file, which the bare open() never did
            with open(self._file) as source:
                for line in source:
                    if "\t" in line:
                        key, bdata = line.split("\t", 1)
                        # Drop the line terminator: it is not part of the
                        # data and would be written back as a blank line.
                        self._cached[key] = bdata.rstrip("\r\n")

        log(self._hash)

    def __del__(self):
        # best-effort flush when the object is collected
        self.save()

    def __contains__(self, key):
        return key in self._cached

    def get(self, key):
        """Return the decoded data stored under *key*, or None if absent.

        Reading an entry also marks it to be kept by the next save().
        """
        if key in self._cached:
            self._cache[key] = self._cached[key]
            return b64decode(self._cached[key])
        else:
            return None

    def set(self, key, content):
        """Queue *content* to be stored under *key* by the next save()."""
        self._cache[key] = b64encode(content)

    def save(self):
        """Write the touched entries to the cache file, creating the folder if needed."""
        if len(self._cache) == 0:
            return

        lines = [str(key) + "\t" + bdata for (key, bdata) in self._cache.items()]

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        with open(self._file, 'w') as target:
            target.write("\n".join(lines))

    def isYoungerThan(self, sec):
        """Tell whether the cache file exists and was modified less than *sec* seconds ago."""
        if not os.path.exists(self._file):
            return False

        return time.time() - os.path.getmtime(self._file) < sec
def EncDownload(url, max_redirects=5):
    """Download *url* and return its body decoded to unicode.

    url -- address of the page to fetch.
    max_redirects -- how many ``<meta http-equiv=refresh>`` redirections
        may still be followed. The limit keeps two pages that refresh to
        each other from recursing forever.

    Returns ``(text, final_url)`` where *final_url* is the address reported
    by the connection, or False when the download fails or the redirection
    limit is exhausted.

    The encoding is taken, in order of preference, from the Content-Type
    header, from the first ``charset=`` found in the body, or from chardet.
    If that encoding is unknown to Python (or chardet found none), the body
    is decoded as utf-8 instead of raising.
    """
    try:
        cj = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-Agent', UA_HML)]
        con = opener.open(url, timeout=TIMEOUT)
        try:
            data = con.read()
            # read everything needed before the connection is closed
            charset = con.headers.getparam('charset')
            final_url = con.geturl()
        finally:
            con.close()
    except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
        log(error)
        return False

    # meta-redirect
    match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
    if match:
        new_url = match.groups()[0]
        if max_redirects <= 0:
            log('too many redirects: %s' % new_url)
            return False
        log('redirect: %s' % new_url)
        return EncDownload(new_url, max_redirects - 1)

    # encoding
    if charset:
        log('header')
        enc = charset
    else:
        match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
        if match:
            log('meta.re')
            enc = match.groups()[0]
        else:
            log('chardet')
            enc = chardet.detect(data)['encoding']

    log(enc)

    try:
        text = data.decode(enc, 'replace')
    except (LookupError, TypeError):
        # LookupError: charset name unknown to Python; TypeError: enc is None
        log('unusable encoding, falling back to utf-8')
        text = data.decode('utf-8', 'replace')

    return (text, final_url)
def Fill(item, cache, feedurl="/", fast=False):
    """Try to replace the content of *item* with the full article.

    item -- feed item; its link, title, desc and content are read and may
        be rewritten.
    cache -- Cache holding previously extracted articles and error markers
        (``error-<reason>``), keyed by item link.
    feedurl -- url of the feed, used to resolve relative item links.
    fast -- when true, never download: only the cache may be used.

    Returns True when it has done its best (whether or not the item was
    changed), and False only when *fast* is set and the item would have
    needed a download.
    """
    if not item.link:
        log('no link')
        return True

    log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    match = item.xval('feedburner:origLink')
    if match:
        item.link = match
        log(item.link)

    # feedsportal
    match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
    if match:
        url = match.groups()[0].split('0')
        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
        # s[:1] instead of s[0]: two consecutive "0" give an empty segment,
        # which must not raise IndexError
        item.link = "".join([t.get(s[:1], "=") + s[1:] for s in url[1:]])
        log(item.link)

    # reddit
    if urlparse.urlparse(item.link).netloc == 'www.reddit.com':
        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    # check relative urls
    # (equality, not identity: `is ''` only works by interning accident)
    if urlparse.urlparse(item.link).netloc == '':
        item.link = urlparse.urljoin(feedurl, item.link)

    # check unwanted uppercase title
    if len(item.title) > 20 and item.title.isupper():
        item.title = item.title.title()

    # content already provided?
    count_content = countWord(item.content)
    count_desc = countWord(item.desc)
    log('desc: %s words, content: %s words' % (count_desc, count_content))

    if max(count_content, count_desc) > 500:
        log('long enough')
        return True

    if count_content > 5*count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    # check cache and previous errors
    if item.link in cache:
        content = cache.get(item.link)
        match = re.search(r'^error-([a-z]{2,10})$', content)
        if match:
            if cache.isYoungerThan(DELAY*60):
                log('cached error: %s' % match.groups()[0])
                return True
            # an old error is retried below
            log('old error')
        else:
            log('cached')
            setContent(item, content)
            return True

    # super-fast mode
    if fast:
        log('skipped')
        return False

    # download
    ddl = EncDownload(item.link.encode('utf-8'))

    if ddl is False:
        log('http error')
        cache.set(item.link, 'error-http')
        return True

    data, url = ddl

    out = readability.Document(data, url=url).summary(True)

    # NOTE(review): the trailing "> 0" means an item that came with no text
    # at all is never filled, although setContent() has an 'empty' branch
    # for that case -- confirm whether this is intended.
    if countWord(out) > max(count_content, count_desc) > 0:
        setContent(item, out)
        cache.set(item.link, out)
    else:
        log('not bigger enough')
        cache.set(item.link, 'error-length')

    return True
2013-05-01 15:57:09 +00:00
def Gather(url, cachePath, mode='feed'):
    """Fetch the feed at *url* and fill its items with full articles.

    url -- address of the feed.
    cachePath -- folder holding the cache files.
    mode -- 'progress' prints an ``i/total`` line for every item; any other
        value prints nothing.

    The feed xml is taken from the cache when the cache file is less than
    DELAY minutes old, and downloaded otherwise. Items past LIM_ITEM are
    removed. Once MAX_TIME seconds have elapsed, or past MAX_ITEM items,
    items are filled from the cache only and removed when that is not
    possible.

    Returns the resulting feed serialised as UTF-8 xml, or False when the
    feed could not be downloaded.
    """
    cache = Cache(cachePath, url)

    # fetch feed
    if cache.isYoungerThan(DELAY*60) and url in cache:
        log('xml cached')
        xml = cache.get(url)
    else:
        try:
            req = urllib2.Request(url)
            req.add_unredirected_header('User-Agent', UA_RSS)
            con = urllib2.urlopen(req)
            try:
                xml = con.read()
            finally:
                # the response used to be left open
                con.close()
            cache.set(url, xml)
        except (urllib2.HTTPError, urllib2.URLError):
            return False

    xml = cleanXML(xml)
    rss = feeds.parse(xml)
    size = len(rss)

    # total shown in progress mode; it does not change during the loop
    total = size if MAX_ITEM == 0 else min(MAX_ITEM, size)

    # set
    startTime = time.time()
    for i, item in enumerate(rss.items):
        rank = i + 1

        if mode == 'progress':
            print("%s/%s" % (rank, total))
            sys.stdout.flush()

        if rank > LIM_ITEM > 0:
            item.remove()
        elif time.time() - startTime > MAX_TIME >= 0 or rank > MAX_ITEM > 0:
            # out of time or past the download quota: cache only
            if Fill(item, cache, url, True) is False:
                item.remove()
        else:
            Fill(item, cache, url)

    log(len(rss))

    return rss.tostring(xml_declaration=True, encoding='UTF-8')
if __name__ == "__main__":
    # Entry point, shared by the command line and by CGI.
    url, options = parseOptions(OPTIONS)
    is_cgi = 'REQUEST_URI' in os.environ
    progress = options == 'progress'

    if is_cgi:
        # http headers, closed by an empty line
        print('Status: 200')
        print('Content-Type: application/octet-stream' if progress else 'Content-Type: text/xml')
        print('')

        cache = os.getcwd() + '/cache'
        log(url)
    else:
        cache = os.path.expanduser('~') + '/.cache/morss'

    if url is None:
        print("Please provide url.")
        sys.exit(1)

    # 'progress' lifts the time limit, 'cache' makes it expire right away
    if progress:
        MAX_TIME = -1
    elif options == 'cache':
        MAX_TIME = 0

    RSS = Gather(url, cache, options)

    if not progress:
        if RSS is False:
            print("Error fetching feed.")
        elif is_cgi or not os.getenv('DEBUG', False):
            # on the command line the feed is hidden while debugging
            print(RSS)

    log('done')