parent
da857f8bb2
commit
da0a8feadd
274
morss/feedify.py
274
morss/feedify.py
|
@ -13,170 +13,170 @@ import urlparse
|
||||||
|
|
||||||
|
|
||||||
def toclass(query):
    """Expand CSS-like ``[class=name]`` predicates into XPath class tests.

    Every ``[class=name]`` occurrence in ``query`` is replaced by an XPath
    predicate that matches ``name`` as one whitespace-delimited token of the
    element's ``class`` attribute. Everything else in ``query`` is returned
    unchanged.
    """
    class_predicate = re.compile(r'\[class=([^\]]+)\]')
    # Padding both the attribute and the name with spaces makes the
    # ``contains`` test match whole class tokens rather than substrings.
    xpath_predicate = (
        r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
    )
    return class_predicate.sub(xpath_predicate, query)
|
||||||
|
|
||||||
def getRule(link):
    """Return the first ``feedify.ini`` rule whose path patterns match ``link``.

    ``feedify.ini`` is read from the current working directory. Each section
    is turned into a dict of its options, and its ``path`` option is replaced
    by the list of glob patterns it contains (one per line). Sections are
    tried in file order.

    Returns the matching section's dict, or ``False`` when no section matches.
    """
    config = ConfigParser()
    config.read('feedify.ini')

    for section in config.sections():
        values = dict(config.items(section))

        # One glob per line. Dropping blank lines (instead of blindly dropping
        # the first line) keeps the usual "path =" + indented-lines layout
        # working and also accepts a single pattern written inline. A section
        # with no ``path`` option simply never matches instead of raising
        # KeyError and aborting the lookup for every other section.
        values['path'] = [
            pattern
            for pattern in values.get('path', '').split('\n')
            if pattern.strip()
        ]

        if any(fnmatch(link, pattern) for pattern in values['path']):
            return values

    return False
|
||||||
|
|
||||||
def supported(link):
    """Tell whether some feedify rule applies to ``link``."""
    rule = getRule(link)
    # getRule signals "no rule" with the literal False, so compare by identity.
    return rule is not False
|
||||||
|
|
||||||
def formatString(string, getter, error=False):
    """Render a feedify format string.

    The string is consumed left to right, one token at a time:

    * ``"text"``        -- literal text, copied as is;
    * ``{...}``         -- optional group: rendered with ``error=True`` and
                           replaced by nothing if a lookup inside it comes
                           back empty or fails (groups do not nest, the group
                           ends at the first ``}``);
    * a single space    -- separator, produces no output;
    * ``name``          -- ``getter(name)``, items joined with no separator;
    * ``name<"sep">``   -- ``getter(name)``, items joined with ``sep``.

    string -- the format string to render (an empty string renders as "")
    getter -- callable taking a lookup name and returning the matching values
              (an iterable of strings, or a single string)
    error  -- when true, a lookup that yields nothing raises ``ValueError``
              instead of rendering as an empty string

    Raises ``ValueError`` for a token that fits none of the forms above.
    """
    if not string:
        # Reachable through an empty ``{}`` group or an empty rule value;
        # indexing string[0] used to raise IndexError here.
        return ""

    out = ""
    char = string[0]
    follow = string[1:]

    if char == '"':
        # partition always returns three items; an unterminated literal just
        # runs to the end of the string and leaves nothing to process.
        out, _, rest = follow.partition('"')
    elif char == '{':
        group, _, rest = follow.partition('}')
        try:
            out = formatString(group, getter, True)
        except (ValueError, KeyError):
            # The tuple is required: ``except ValueError, KeyError`` only
            # catches ValueError and rebinds the name KeyError to it.
            out = ""
    elif char == ' ':
        rest = follow
    else:
        found = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string)
        if found is None:
            raise ValueError('bogus string')

        name, separator, rest = found.groups()
        rawValue = getter(name)
        if isinstance(rawValue, basestring):
            # A lone string is used as is; joining it would interleave the
            # separator between its characters.
            out = rawValue
        else:
            out = (separator if separator is not None else '').join(rawValue)

        if not out and error:
            raise ValueError('no value for %r' % name)

    if rest:
        return out + formatString(rest, getter, error)
    return out
|
||||||
|
|
||||||
def PreWorker(url, cache):
    """Record a redirect to the iTunes lookup API for iTunes store URLs.

    When ``url`` is on ``itunes.apple.com`` and ends with ``/id<digits>``
    (optionally followed by a query string), the corresponding
    ``https://itunes.apple.com/lookup?id=<digits>`` URL is stored in
    ``cache`` under the ``'redirect'`` key. Any other URL is left alone.

    url   -- the URL about to be fetched
    cache -- object exposing ``set(key, value)``
    """
    if urlparse.urlparse(url).netloc != 'itunes.apple.com':
        return

    # Raw string: ``\?`` is not a valid escape in a plain string literal.
    match = re.search(r'/id([0-9]+)(\?.*)?$', url)
    if match:
        redirect = 'https://itunes.apple.com/lookup?id={id}'.format(id=match.group(1))
        cache.set('redirect', redirect)
|
||||||
|
|
||||||
class Builder(object):
    """Build a feed out of an HTML page or a JSON document.

    The feedify rule matching ``link`` (see ``getRule``) says how to read the
    document (``mode``: ``'xpath'`` or ``'json'``) and holds the format
    strings used to fill the feed and its items.
    """

    # (rule option, feed item key) pairs, in the order the item is filled.
    _ITEM_FIELDS = (
        ('item_title', 'title'),
        ('item_link', 'link'),
        ('item_desc', 'desc'),
        ('item_content', 'content'),
        ('item_time', 'updated'),
        ('item_id', 'id'),
    )

    def __init__(self, link, data=None, cache=False):
        """Fetch (if needed) and parse the document found at ``link``.

        link  -- URL of the document, also used to pick the rule
        data  -- document body; downloaded from ``link`` when ``None``
        cache -- stored on the instance as ``self.cache``

        Raises ``ValueError`` when no feedify rule matches ``link``.
        """
        self.link = link
        self.cache = cache

        if data is None:
            response = urllib2.urlopen(link)
            try:
                data = response.read()
            finally:
                # The connection used to be left open.
                response.close()
        self.data = data

        self.rule = getRule(link)
        if self.rule is False:
            # Fail with a clear message rather than the TypeError that
            # subscripting ``False`` below would produce.
            raise ValueError('no feedify rule matches %s' % link)

        if self.rule['mode'] == 'xpath':
            if not isinstance(self.data, unicode):
                self.data = self.data.decode(morss.detEncoding(self.data), 'replace')
            self.doc = lxml.html.fromstring(self.data)
        elif self.rule['mode'] == 'json':
            self.doc = json.loads(self.data)

        self.feed = feeds.FeedParserAtom()

    def raw(self, html, expr):
        """Evaluate ``expr`` against ``html`` and return the raw matches.

        In ``xpath`` mode ``expr`` is an XPath expression (``[class=x]``
        shortcuts are expanded by ``toclass``) evaluated on ``html``. In
        ``json`` mode it is a dotted path such as ``results[0].name`` walked
        through nested dicts and lists, and a list of matches is returned.
        """
        mode = self.rule['mode']
        if mode == 'xpath':
            return html.xpath(toclass(expr))
        elif mode == 'json':
            return self._json_path(html, expr)

    def _json_path(self, doc, expr):
        """Walk the dotted path ``expr`` through ``doc``; return the matches.

        Each segment is a key, optionally followed by ``[index]``. A list
        found under a key is flattened into the candidates for the next
        segment, and newlines in string values become ``<br/>``. An index
        past the end of the candidates yields no match.

        Raises ``ValueError`` for a segment that is not ``key`` or
        ``key[digits]``.
        """
        current = [doc]

        for segment in expr.strip(".").split("."):
            parsed = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', segment)
            if parsed is None:
                # Used to crash with AttributeError on ``None.groups()``.
                raise ValueError('invalid JSON path segment: %r' % segment)
            key, index = parsed.groups()

            found = []
            for elem in current:
                if not isinstance(elem, dict):
                    continue
                kids = elem.get(key)
                if kids is None:
                    continue
                if isinstance(kids, list):
                    found.extend(kids)
                elif isinstance(kids, basestring):
                    found.append(kids.replace('\n', '<br/>'))
                else:
                    found.append(kids)

            if index is None:
                current = found
            else:
                # One-element slice: empty when the index is out of range.
                position = int(index)
                current = found[position:position + 1]

        return current

    def strings(self, html, expr):
        """Return the matches of ``expr`` as a list of strings.

        In ``xpath`` mode string results are kept as is, HTML elements are
        serialised with ``lxml.html.tostring`` and anything else is skipped.
        In ``json`` mode the result of ``raw`` is returned unchanged.
        """
        if self.rule['mode'] == 'xpath':
            out = []
            for match in self.raw(html, expr):
                if isinstance(match, basestring):
                    out.append(match)
                elif isinstance(match, lxml.html.HtmlElement):
                    out.append(lxml.html.tostring(match))
            return out

        elif self.rule['mode'] == 'json':
            return self.raw(html, expr)

    def string(self, html, expr):
        """Render the format string stored in the rule under ``expr``.

        Lookups inside the format string are resolved against ``html``
        through ``strings``.
        """
        getter = lambda x: self.strings(html, x)
        return formatString(self.rule[expr], getter)

    def build(self):
        """Fill ``self.feed`` (title and items) from the parsed document."""
        if 'title' in self.rule:
            self.feed.title = self.string(self.doc, 'title')

        if 'items' not in self.rule:
            return

        matches = self.raw(self.doc, self.rule['items'])
        if not matches:
            return

        for item in matches:
            feedItem = {}

            for rule_key, item_key in self._ITEM_FIELDS:
                if rule_key not in self.rule:
                    continue

                value = self.string(item, rule_key)
                if item_key == 'link':
                    # Item links may be relative to the page they came from.
                    value = urlparse.urljoin(self.link, value)
                feedItem[item_key] = value

                if item_key == 'id':
                    feedItem['isPermaLink'] = False

            self.feed.items.append(feedItem)
|
||||||
|
|
||||||
|
|
1074
morss/feeds.py
1074
morss/feeds.py
File diff suppressed because it is too large
Load Diff
1220
morss/morss.py
1220
morss/morss.py
File diff suppressed because it is too large
Load Diff
20
setup.py
20
setup.py
|
@ -1,13 +1,13 @@
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
# Single source for the distribution name: it is also the package directory,
# the package_data key and the prefix of the test suite module.
package_name = 'morss'

setup(
    name=package_name,
    description='Get full-text RSS feeds',
    author='pictuga',
    author_email='contact at author name dot com',
    url='http://morss.it/',
    license='GPL 3+',
    package_dir={package_name: package_name},
    packages=find_packages(),
    # Ship the feedify rules file alongside the code.
    package_data={package_name: ['feedify.ini']},
    test_suite='%s.tests' % package_name,
)
|
||||||
|
|
Loading…
Reference in New Issue