#!/usr/bin/env python
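# morss: fill the items of an RSS feed with the full article text scraped
# from each item's link. Runs either as a CGI script (SERVER = True) or as
# a liferea conversion filter reading the feed from stdin.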
import sys
import os
from os.path import expanduser
from lxml import etree
import re
import string
import urllib2
from cookielib import CookieJar
import chardet

# True when run as a CGI script; set to False for command-line/liferea use.
SERVER = True

if SERVER:
    import httplib
    httplib.HTTPConnection.debuglevel = 1

    import cgitb
    cgitb.enable()

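# Print to stdout when run from the command line with DEBUG set, and append
# to morss.log when run as a CGI script.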
def log(txt):
    if not SERVER and os.getenv('DEBUG', False):
        print txt
    if SERVER:
        with open('morss.log', 'a') as file:
            # encode unicode before writing; anything else goes through str()
            if isinstance(txt, unicode):
                txt = txt.encode('utf-8')
            file.write(str(txt) + "\n")

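# A single feed item: fetches the linked page, extracts the article body
# and writes it back into the item's <description>.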
class Info:
    def __init__(self, item, feed):
        self.item = item
        self.feed = feed

        self.data = False
        self.page = False
        self.html = False
        self.con = False
        self.opener = False
        self.enc = False

        self.link = self.item.xpath('link')[0]
        self.desc = self.item.xpath('description')[0]

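    # Decode feedsportal.com redirect links: the last path component is a
    # '0'-separated list of chunks whose leading letter maps back to a URL
    # character ('A' -> '0', 'B' -> '.', 'C' -> '/', ...).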
    def checkURL(self):
        if self.link.text.startswith("http://rss.feedsportal.com"):
            log('feedsportal')
            url = re.search('/([0-9a-zA-Z]+)/[a-zA-Z0-9\.]+$', self.link.text).groups()[0].split('0')
            t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'L': 'ww', 'S': 'w.'}
            self.link.text = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
            log(self.link.text)

    def fetch(self):
        log(self.link.text)
        self.checkURL()
        if not self.findCache():
            self.download()
            # skip the rest when the download failed
            if self.data is not False:
                self.chardet()
                self.fetchDesc()
                self.save()
                log(self.enc)

    def parseHTML(self):
        if self.enc is False:
            self.page = etree.HTML(self.data)
        else:
            try:
                self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
            except ValueError:
                self.page = etree.HTML(self.data)

    def save(self):
        self.feed.save()

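    # Reuse the description stored in the cached copy of the feed, if any.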
    def findCache(self):
        if self.feed.cache is not False:
            xpath = "//link[text()='" + self.link.text + "']/../description/text()"
            match = self.feed.cache.xpath(xpath)
            if len(match):
                log('cached')
                self.desc.text = match[0]
                return True
        return False

    def fetchDesc(self):
        self.parseHTML()
        match = self.page.xpath(self.feed.rule)
        if len(match):
            self.html = match[0]
            self.deleteTags()
            # fall back to utf-8 when no encoding could be detected
            self.desc.text = etree.tostring(self.html).decode(self.enc or 'utf-8', 'ignore')
            log('ok txt')
        else:
            log('no match')

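    # Download the linked page, with cookie support (presumably needed for
    # sites that refuse cookie-less requests).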
    def download(self):
        try:
            cj = CookieJar()
            self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            self.con = self.opener.open(self.link.text.encode('utf-8'))
            self.data = self.con.read()
        except (urllib2.HTTPError, urllib2.URLError) as error:
            log(error)
            log('http error')

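    # Guess the page encoding: HTTP header first, then
    # <meta http-equiv='Content-Type'>, then <meta charset>, and finally
    # chardet as a last resort.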
    def chardet(self):
        if self.con.headers.getparam('charset'):
            log('header')
            self.enc = self.con.headers.getparam('charset')
            return

        page = etree.HTML(self.data)
        header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
        if len(header) and len(header[0].split("=")) > 1:
            log('meta')
            self.enc = header[0].split("=")[1]
            return

        header = page.xpath("//head/meta[@charset]/@charset")
        if len(header):
            log('meta2')
            self.enc = header[0]
            return

        log('chardet')
        self.enc = chardet.detect(self.data)['encoding']

    def deleteTags(self):
        for tag in self.feed.trash:
            for elem in self.html.xpath(tag):
                elem.getparent().remove(elem)

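# The whole feed: parses the RSS, manages the on-disk cache and fills every
# item with its full article text.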
class Feed:
    def __init__(self, impl, data, cachePath):
        self.rulePath = 'rules'
        self.rule = '//article|//h1/..'  # default extraction xpath

        self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
        self.max = 70  # process at most this many items

        self.cachePath = cachePath
        self.cacheFile = False
        self.cache = False
        self.impl = impl

        self.items = []
        self.rss = False
        self.out = False

        if self.impl == 'server':
            self.url = data
            self.xml = False
        else:
            self.url = False
            self.xml = data

    def save(self):
        self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
        with open(self.cacheFile, 'w') as cache:
            cache.write(self.out)

    def getData(self):
        if self.impl == 'server':
            req = urllib2.Request(self.url)
            # blank the User-Agent; some feeds reject the default urllib2 one
            req.add_unredirected_header('User-Agent', '')
            self.xml = urllib2.urlopen(req).read()
            self.cleanXml()

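    # Locate (and load, when present) the cached copy of this feed, keyed by
    # a hash of the channel title.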
    def setCache(self):
        if self.cache is not False:
            return

        self.parse()
        key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
        self.cacheFile = self.cachePath + "/" + key
        log(self.cacheFile)
        if not os.path.exists(self.cachePath):
            os.makedirs(self.cachePath)

        if os.path.exists(self.cacheFile):
            self.cache = etree.XML(open(self.cacheFile, 'r').read())

    def parse(self):
        if self.rss is not False:
            return

        self.rss = etree.XML(self.xml)

    def setItems(self):
        self.items = [Info(e, self) for e in self.rss.xpath('//item')]
        if self.max:
            self.items = self.items[:self.max]

    def fill(self):
        self.parseRules()
        log(self.rule)
        for item in self.items:
            item.fetch()

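    # Strip ASCII control characters (codes 0-31), which would make
    # etree.XML() fail on otherwise valid feeds.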
    def cleanXml(self):
        table = string.maketrans('', '')
        self.xml = self.xml.translate(table, table[:32]).lstrip()

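    # Pick the extraction xpath. In server mode, look the feed URL up in the
    # 'rules' file: blank-line-separated records whose second line is the
    # feed URL and whose third line is the xpath. On the command line, the
    # rule comes from argv[1].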
    def parseRules(self):
        if self.impl == 'server':
            rules = open(self.rulePath, "r").read().split("\n\n")
            rules = [r.split('\n') for r in rules]
            for rule in rules:
                # guard against short or malformed records
                if len(rule) >= 3 and rule[1] == self.url:
                    self.rule = rule[2]
                    return
        else:
            if len(sys.argv) > 1:
                self.rule = sys.argv[1]

if __name__ == "__main__":
    if SERVER:
        print 'Content-Type: text/html\n'
        url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
        url = 'http://' + url.replace(' ', '%20')
        log(url)
        RSS = Feed('server', url, os.getcwd() + '/cache')
    else:
        xml = sys.stdin.read()
        cache = expanduser('~') + '/.cache/morss'
        RSS = Feed('liferea', xml, cache)

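    # download, parse, cache, fill and output the feed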
    RSS.getData()
    RSS.parse()
    RSS.setCache()
    RSS.setItems()
    RSS.fill()
    RSS.save()

    if SERVER or not os.getenv('DEBUG', False):
        print RSS.out
    else:
        print 'done'