Add feedify, and use it in morss

2013-09-25 12:36:21 +02:00
parent 9bc4417be3
commit da14242bcf
2 changed files with 74 additions and 0 deletions
--- a/feedify.py
+++ b/feedify.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+from ConfigParser import ConfigParser
+from fnmatch import fnmatch
+import feeds
+import re
+
+import urllib2
+import lxml.html
+import urlparse
+
+def toclass(query):
+	pattern = r'\[class=([^\]]+)\]'
+	repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
+	return re.sub(pattern, repl, query)
+
+def getRule(link=URL):
+	config = ConfigParser()
+	config.read('feedify.ini')
+
+	for section in config.sections():
+		values = dict(config.items(section))
+		values['path'] = values['path'].split('\n')[1:]
+		for path in values['path']:
+			if fnmatch(link, path):
+				return values
+	return False
+
+def supported(link):
+	return getRule(link) is not False
+
+def getString(expr, html):
+	match = html.xpath(toclass(expr))
+	if len(match):
+		return match[0].text_content()
+	else:
+		return ''
+
+def build(link, data=None):
+	rule = getRule(link)
+	if rule is False:
+		return False
+
+	if data is None:
+		data = urllib2.urlopen(link).read()
+
+	html = lxml.html.fromstring(data)
+	feed = feeds.FeedParserAtom()
+
+	if 'title' in rule:
+		feed.title = html.xpath(toclass(rule['title']))[0]
+
+	if 'items' in rule:
+		for item in html.xpath(toclass(rule['items'])):
+			feedItem = {}
+
+			if 'item_title' in rule:
+				feedItem['title'] = item.xpath(toclass(rule['item_title']))[0]
+			if 'item_link' in rule:
+				url = item.xpath(toclass(rule['item_link']))[0]
+				url = urlparse.urljoin(link, url)
+				feedItem['link'] = url
+			if 'item_desc' in rule:
+				feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode')
+			if 'item_content' in rule:
+				feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0])
+
+			feed.items.append(feedItem)
+	return feed
--- a/morss.py
+++ b/morss.py
@@ -13,6 +13,7 @@ import lxml.html.clean
 import lxml.builder

 import feeds
+import feedify

 import httplib
 import urllib2
@@ -377,6 +378,8 @@ def Gather(url, cachePath, progress=False):

 		if xml[:5] == '<?xml' or con.info().type in MIMETYPE['xml']:
 			style = 'normal'
+		elif feedify.supported(url):
+			style = 'feedify'
 		elif con.info().type in MIMETYPE['html']:
 			style = 'html'
 		else:
@@ -389,6 +392,8 @@ def Gather(url, cachePath, progress=False):

 	if style == 'normal':
 		rss = feeds.parse(xml)
+	elif style == 'feedify':
+		xml = decodeHTML(xml)
 		rss = feedify.build(url, xml)
 	elif style == 'html':
 		match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")