Move to OOP.
This is a huge commit: the entire codebase has been ported to object-oriented programming. This makes the code cleaner, which had become necessary to handle all the different cases, for example in encoding detection.

- Encoding detection now works better and uses three different methods: the HTTP Content-Type header, <meta> tags in the page, and the chardet library as a fallback.
- HTML pages with an XML declaration are now supported.
- Feed URLs with parameters (e.g. "index.php?option=par") are also supported.
- The cache is now smarter: it no longer grows indefinitely, since only pages still in use are kept. Caching is now mandatory.
- urllib (as opposed to urllib2) is no longer needed.
- Fixed a possible crash in the log function (when passed str data in a non-Unicode encoding).
- The README is also updated.
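For readers skimming the diff, the new encoding detection boils down to the following cascade. This is a minimal sketch only, assuming a urllib2 response `con` and its raw body `data`; `detect_encoding` is an illustrative name, and the real implementation is `Info.chardet()` in the diff below.

# Sketch only: mirrors the three detection steps of Info.chardet() below.
from lxml import etree
import chardet

def detect_encoding(con, data):
    # 1. charset parameter of the HTTP Content-Type header
    if con.headers.getparam('charset'):
        return con.headers.getparam('charset')

    # 2. <meta> declarations inside the page itself
    page = etree.HTML(data)
    meta = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
    if len(meta) and len(meta[0].split("=")) > 1:
        return meta[0].split("=")[1]
    meta = page.xpath("//head/meta[@charset]/@charset")
    if len(meta):
        return meta[0]

    # 3. last resort: statistical guess via the chardet library
    return chardet.detect(data)['encoding']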
morss · 256 changed lines · Executable file → Normal file
@@ -1,69 +1,227 @@
-#! /usr/bin/env python2.7
+#!/usr/bin/python
 import sys
 import os
 from os.path import expanduser
 from lxml import etree
 import string
 import urllib2
-import urllib
 from cookielib import CookieJar
+import chardet
 
+SERVER = True
+
+if SERVER:
+    import httplib
+    httplib.HTTPConnection.debuglevel = 1
+
+    import cgitb
+    cgitb.enable()
+
 def log(txt):
-    if os.getenv('DEBUG', False):
+    if not SERVER and os.getenv('DEBUG', False):
         print txt
+    if SERVER:
+        with open('morss.log', 'a') as file:
+            if isinstance(txt, str):
+                file.write(txt.encode('utf-8') + "\n")
 
-def xmlclean(xml):
-    table = string.maketrans('', '')
-    return xml.translate(table, table[:32])
+class Info:
+    def __init__(self, item, feed):
+        self.item = item
+        self.feed = feed
 
-node = sys.argv[1] if len(sys.argv) > 1 else "//h1/.."
+        self.data = False
+        self.page = False
+        self.html = False
+        self.con = False
+        self.opener = False
+        self.enc = False
 
-xml = xmlclean(sys.stdin.read())
-rss = etree.XML(xml)
-items = rss.xpath('//item')
+        self.link = self.item.findtext('link')
+        self.desc = self.item.xpath('description')[0]
 
-cache = expanduser("~") + "/.cache/morss"
-if not os.path.exists(cache):
-    os.makedirs(cache)
+    def fetch(self):
+        log(self.link)
+        if not self.findCache():
+            self.download()
+            self.chardet()
+            self.fetchDesc()
+            self.save()
+        log(self.enc)
 
-for item in items:
-    link = item.findtext('link').encode('utf-8')
-    desc = item.xpath('description')[0]
-
-    log(link)
-
-    cached = cache + "/" + str(hash(link))
-    log(cached)
-
-    if os.path.exists(cached):
-        log("cached")
-        desc.text = open(cached, 'r').read()
-    else:
+    def parseHTML(self):
+        if self.enc is False:
+            self.page = etree.HTML(self.data)
+        else:
+            try:
+                self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
+            except ValueError:
+                self.page = etree.HTML(self.data)
+
+    def save(self):
+        self.feed.save()
+
+    def findCache(self):
+        if self.feed.cache is not False:
+            xpath = "//link[text()='" + self.link + "']/../description/text()"
+            match = self.feed.cache.xpath(xpath)
+            if len(match):
+                log('cached')
+                self.desc.text = match[0]
+                return True
+        return False
+
+    def fetchDesc(self):
+        self.parseHTML()
+        match = self.page.xpath(self.feed.rule)
+        if len(match):
+            self.html = match[0]
+            self.deleteTags()
+            self.desc.text = etree.tostring(self.html).decode(self.enc, 'ignore')
+            log('ok txt')
+        else:
+            log('no match')
+
+    def download(self):
         try:
             cj = CookieJar()
-            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-            data = opener.open(link).read()
-            html = etree.HTML(data)
-            match = html.xpath(node)
-
-            if len(match):
-                try:
-                    text = etree.tostring(match[0])
-                    log("ok txt")
-                except etree.SerialisationError:
-                    log('serialisation')
-                    continue
-                try:
-                    desc.text = text
-                    open(cached, 'w').write(text)
-                except ValueError:
-                    log('xml error')
-            else:
-                log("no match")
+            self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
+            self.con = self.opener.open(self.link.encode('utf-8'))
+            self.data = self.con.read()
         except (urllib2.HTTPError, urllib2.URLError) as error:
             log(error)
-            log("http error")
+            log('http error')
 
-if not os.getenv('DEBUG', False):
-    print etree.tostring(rss)
+    def chardet(self):
+        if self.con.headers.getparam('charset'):
+            log('header')
+            self.enc = self.con.headers.getparam('charset')
+            return
+
+        page = etree.HTML(self.data)
+        header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
+        if len(header) and len(header[0].split("=")):
+            log('meta')
+            self.enc = header[0].split("=")[1]
+            return
+
+        header = page.xpath("//head/meta[@charset]/@charset")
+        if len(header):
+            log('meta2')
+            self.enc = header[0]
+            return
+
+        log('chardet')
+        self.enc = chardet.detect(self.data)['encoding']
+
+    def deleteTags(self):
+        for tag in self.feed.trash:
+            for elem in self.html.xpath(tag):
+                elem.getparent().remove(elem)
+
+class Feed:
+    def __init__(self, impl, data, cachePath):
+        self.rulePath = 'rules'
+        self.rule = '//article|//h1/..'
+
+        self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
+        self.max = 70
+
+        self.cachePath = cachePath
+        self.cacheFile = False
+        self.cache = False
+        self.impl = impl
+
+        self.items = []
+        self.rss = False
+        self.out = False
+
+        if self.impl == 'server':
+            self.url = data
+            self.xml = False
+        else:
+            self.url = False
+            self.xml = data
+
+    def save(self):
+        self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
+        open(self.cacheFile, 'w').write(self.out)
+
+    def getData(self):
+        if self.impl == 'server':
+            req = urllib2.Request(self.url)
+            req.add_unredirected_header('User-Agent', '')
+            self.xml = urllib2.urlopen(req).read()
+        self.cleanXml()
+
+    def setCache(self):
+        if self.cache is not False:
+            return
+
+        self.parse()
+        key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
+        self.cacheFile = self.cachePath + "/" + key
+        log(self.cacheFile)
+        if not os.path.exists(self.cachePath):
+            os.makedirs(self.cachePath)
+
+        if os.path.exists(self.cacheFile):
+            self.cache = etree.XML(open(self.cacheFile, 'r').read())
+
+    def parse(self):
+        if self.rss is not False:
+            return
+
+        self.rss = etree.XML(self.xml)
+
+    def setItems(self):
+        self.items = [Info(e, self) for e in self.rss.xpath('//item')]
+        if self.max:
+            self.items = self.items[:self.max]
+
+    def fill(self):
+        self.parseRules()
+        log(self.rule)
+        for item in self.items:
+            item.fetch()
+
+    def cleanXml(self):
+        table = string.maketrans('', '')
+        self.xml = self.xml.translate(table, table[:32]).lstrip()
+
+    def parseRules(self):
+        if self.impl == 'server':
+            rules = open(self.rulePath, "r").read().split("\n\n")
+            rules = [r.split('\n') for r in rules]
+            for rule in rules:
+                if rule[1] == self.url:
+                    self.rule = rule[2]
+                    return
+        else:
+            if len(sys.argv) > 1:
+                self.rule = sys.argv[1]
+
+if __name__ == "__main__":
+    if SERVER:
+        print 'Content-Type: text/html\n'
+        url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
+        url = 'http://' + url.replace(' ', '%20')
+        log(url)
+        RSS = Feed('server', url, os.getcwd() + '/cache')
+    else:
+        xml = sys.stdin.read()
+        cache = expanduser('~') + '/.cache/morss'
+        RSS = Feed('liferea', xml, os.getcwd() + '/cache')
+
+    RSS.getData()
+    RSS.parse()
+    RSS.setCache()
+    RSS.setItems()
+    RSS.fill()
+    RSS.save()
+
+    if SERVER or not os.getenv('DEBUG', False):
+        print RSS.out
+    else:
+        print 'done'