# Source listing header: 708 lines, 19 KiB, Python
import sys
|
|
import os
|
|
import os.path
|
|
import time
|
|
|
|
import threading
|
|
|
|
from fnmatch import fnmatch
|
|
import re
|
|
import json
|
|
|
|
import lxml.etree
|
|
import lxml.html
|
|
|
|
from . import feeds
|
|
from . import feedify
|
|
from . import crawler
|
|
|
|
import wsgiref.simple_server
|
|
import wsgiref.handlers
|
|
|
|
from html2text import HTML2Text
|
|
|
|
try:
|
|
from Queue import Queue
|
|
from httplib import HTTPException
|
|
from urllib2 import build_opener
|
|
from urllib2 import HTTPError
|
|
from urllib import quote_plus
|
|
from urlparse import urlparse, urljoin, parse_qs
|
|
except ImportError:
|
|
from queue import Queue
|
|
from http.client import HTTPException
|
|
from urllib.request import build_opener
|
|
from urllib.error import HTTPError
|
|
from urllib.parse import quote_plus
|
|
from urllib.parse import urlparse, urljoin, parse_qs
|
|
|
|
# Per-feed processing limits (read by Gather)
LIM_ITEM = 100  # deletes what's beyond
LIM_TIME = 7  # deletes what's after
MAX_ITEM = 50  # cache-only beyond
MAX_TIME = 7  # cache-only after (in sec)

# Networking
DELAY = 10 * 60  # xml cache & ETag cache (in sec)
TIMEOUT = 4  # http timeout (in sec)
THREADS = 10  # number of threads (1 for single-threaded)

# Runtime switches
DEBUG = False
PORT = 8080

# User-Agent handed to crawler.UAHandler
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'

# Content-Type values accepted for each family of documents
MIMETYPE = {
    'xml': [
        'text/xml',
        'application/xml',
        'application/rss+xml',
        'application/rdf+xml',
        'application/atom+xml',
    ],
    'html': [
        'text/html',
        'application/xhtml+xml',
        'application/xml',
    ],
}

# url schemes left untouched by Fetch (anything else gets 'http://' prepended)
PROTOCOL = ['http', 'https', 'ftp']
|
|
|
|
|
|
def filterOptions(options):
    """ Hook to restrict which options callers may set; currently a pass-through

    Returns the very same mapping it was given. To whitelist options instead,
    the body can be replaced with something along those lines:

        allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
        filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
        return filtered
    """
    return options
|
|
|
|
|
|
class MorssException(Exception):
    """ Error raised by this module for problems meant to be reported to the user """
|
|
|
|
|
|
def log(txt, force=False):
    """ Writes the repr() of `txt` to the debug output

    Nothing happens unless the module-level DEBUG flag is set or `force` is
    True. When 'REQUEST_URI' is in the process environment, the line is
    appended to 'morss.log' in the current directory; otherwise it is printed
    to stdout.
    """
    if not (DEBUG or force):
        return

    line = repr(txt)

    if 'REQUEST_URI' in os.environ:
        # `with` closes the file right away instead of leaking the handle
        with open('morss.log', 'a') as logfile:
            logfile.write("%s\n" % line)
    else:
        print(line)
|
|
|
|
|
|
from . import readabilite
|
|
def readability(html, url):
    """ Returns whatever readabilite.get_article() extracts from `html`

    `url` is accepted but not used by this implementation.
    """
    article = readabilite.get_article(html)
    return article
|
|
|
|
|
|
|
|
def len_html(txt):
    """ Returns the length of the text contained in an html fragment, markup excluded

    Missing (None), empty and whitespace-only input all count as 0.
    """
    # lxml is expected to refuse such input ("Document is empty"), so it is
    # never handed over to the parser
    if not txt or not txt.strip():
        return 0

    return len(lxml.html.fromstring(txt).text_content())
|
|
|
|
|
|
def count_words(txt):
    """ Returns the number of whitespace-separated words in an html fragment, markup excluded

    Missing (None), empty and whitespace-only input all count as 0.
    """
    # lxml is expected to refuse such input ("Document is empty"), so it is
    # never handed over to the parser
    if not txt or not txt.strip():
        return 0

    return len(lxml.html.fromstring(txt).text_content().split())
|
|
|
|
|
|
class Options:
    """ Bag of options readable as attributes, where unknown options read as False

    Values are kept in the `options` mapping. They can be set with
    `opts[key] = value`, tested with `key in opts` and read with `opts.key`.
    """

    def __init__(self, options=None, **args):
        """ Builds the bag from a mapping and/or keyword arguments

        When keyword arguments are given, entries of `options` override them.
        Otherwise `options` itself is stored (not copied), or an empty dict
        when it is falsy.
        """
        if len(args):
            self.options = args
            self.options.update(options or {})
        else:
            self.options = options or {}

    def __getattr__(self, key):
        """ Returns the value of option `key`, or False when it isn't set

        Only called when regular attribute lookup fails.
        """
        # Special-method probes (copy, pickle, ...) must see a missing
        # attribute rather than a False "method"
        if key.startswith('__') and key.endswith('__'):
            raise AttributeError(key)

        # Go through __dict__: `self.options` would call __getattr__ again,
        # and recurse forever, on an instance whose __init__ hasn't run yet
        stored = self.__dict__.get('options')

        if stored is not None and key in stored:
            return stored[key]
        else:
            return False

    def __setitem__(self, key, value):
        """ Sets option `key` to `value` """
        self.options[key] = value

    def __contains__(self, key):
        """ Tells whether option `key` has been set (whatever its value) """
        return key in self.options
|
|
|
|
|
|
def parseOptions(options):
    """ Turns ['md=True'] into {'md':True}

    Each entry is either 'key' (stored as True) or 'key=value'. The values
    'true' and 'false' (any case) become booleans; any other value is kept
    as a string. Only the first '=' separates key from value.
    """
    out = {}

    for option in options:
        key, separator, value = option.partition('=')

        if not separator:
            # bare flag
            out[key] = True
        elif value.lower() == 'true':
            # the value, not the key, decides whether this is a boolean
            out[key] = True
        elif value.lower() == 'false':
            out[key] = False
        else:
            out[key] = value

    return out
|
|
|
|
|
|
# Handlers shared by every opener built by custom_handler()
default_handlers = [
    crawler.GZIPHandler(),
    crawler.UAHandler(DEFAULT_UA),
    crawler.AutoRefererHandler(),
    crawler.HTTPEquivHandler(),
    crawler.HTTPRefreshHandler(),
    crawler.EncodingFixHandler(),
]
|
|
|
|
def custom_handler(accept, delay=DELAY):
    """ Builds an url opener out of the default handlers plus two per-call ones

    `accept` is handed to crawler.ContentNegociationHandler and `delay` to
    crawler.SQliteCacheHandler. The shared default_handlers list itself is
    left untouched.
    """
    extra = [
        crawler.ContentNegociationHandler(accept),
        crawler.SQliteCacheHandler(delay),
    ]

    return build_opener(*(list(default_handlers) + extra))
|
|
|
|
|
|
# Redirect wrappers undone by Fix, tried in this order:
# (fnmatch pattern matched against the link, query parameter holding the real url)
_REDIRECT_WRAPPERS = (
    ('http://translate.google.*/translate*u=*', 'u'),  # google translate
    ('http://www.google.*/url?q=*', 'q'),  # google
    ('http://news.google.com/news/url*url=*', 'url'),  # google news
    ('https://www.facebook.com/l.php?u=*', 'u'),  # facebook
)

# feedsportal escapes: the letter following a '0' stands for these characters
_FEEDSPORTAL_TABLE = {
    'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'F': '=',
    'G': '&', 'H': ',', 'I': '_', 'J': '%', 'K': '+', 'L': 'http://',
    'M': 'https://', 'N': '.com', 'O': '.co.uk', 'P': ';', 'Q': '|',
    'R': ':', 'S': 'www.', 'T': '#', 'U': '$', 'V': '~', 'W': '!',
    'X': '(', 'Y': ')', 'Z': 'Z'}


def _decode_feedsportal(encoded):
    """ Rebuilds the url escaped in the path component of a feedsportal link """
    # whatever comes before the first '0' is discarded; s[:1] (rather than
    # s[0]) keeps empty segments, e.g. from a trailing '0', from crashing
    segments = encoded.split('0')[1:]
    return ''.join(_FEEDSPORTAL_TABLE.get(s[:1], s[:1]) + s[1:] for s in segments)


def Fix(item, feedurl='/'):
    """ Improves feed items (absolute links, resolve feedburner links, etc)

    `item` is modified in place and returned. `feedurl` is the url of the
    feed the item comes from; relative links are resolved against it and it
    selects the site-specific fixes (wikipedia, reddit).
    Items without a link only get their title fixed.
    """

    # check unwanted uppercase title
    title = item.title
    if title and len(title) > 20 and title.isupper():
        item.title = title.title()

    # check if it includes link
    if not item.link:
        log('no link')
        return item

    # wikipedia daily highlight
    if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom') and item.desc:
        match = lxml.html.fromstring(item.desc).xpath('//b/a/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    # check relative urls
    item.link = urljoin(feedurl, item.link)

    # google translate, google, google news, facebook
    for pattern, param in _REDIRECT_WRAPPERS:
        if fnmatch(item.link, pattern):
            # the pattern may match without the parameter actually being
            # there (e.g. '?menu=' for 'u='): leave the link alone then
            target = parse_qs(urlparse(item.link).query).get(param)
            if target:
                item.link = target[0]
                log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    match = item.xval('feedburner:origLink')
    if match:
        item.link = match

    # feedsportal
    match = re.search(r'/([0-9a-zA-Z]{20,})/story01\.htm$', item.link)
    if match:
        item.link = _decode_feedsportal(match.group(1))
        log(item.link)

    # reddit
    if urlparse(feedurl).netloc == 'www.reddit.com' and item.desc:
        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    return item
|
|
|
|
|
|
def Fill(item, options, feedurl='/', fast=False):
    """ Tries to replace the item's content with the article its link points to

    Returns True when it has done its best (content left as is or replaced),
    and False when the linked page could not be downloaded, in which case the
    caller may drop the item.

    With `options.hungry` the article is always downloaded and pushed, however
    long the content already provided is. `fast` downloads with delay -2
    ("super-fast mode") instead of -1; what those values mean is up to
    crawler.SQliteCacheHandler.
    """

    if not item.link:
        log('no link')
        # nothing more can be done: report it like every other "done" case
        return True

    log(item.link)

    # content already provided?
    count_content = count_words(item.content)
    count_desc = count_words(item.desc)

    if not options.hungry and max(count_content, count_desc) > 500:
        if count_desc > count_content:
            item.content = item.desc
            del item.desc
            log('reversed sizes')
        log('long enough')
        return True

    if not options.hungry and count_content > 5 * count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    link = item.link
    netloc = urlparse(feedurl).netloc

    # twitter
    if netloc == 'twitter.com':
        # lxml cannot parse an empty document: treat it as "no link found"
        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url') if item.content else []
        if len(match):
            link = match[0]
            log(link)
        else:
            link = None

    # facebook
    if netloc == 'graph.facebook.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@href') if item.content else []
        if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)
        else:
            link = None

    if link is None:
        log('no used link')
        return True

    # download
    delay = -1

    if fast:
        # super-fast mode
        delay = -2

    try:
        con = custom_handler(('html', 'text/*'), delay).open(link, timeout=TIMEOUT)
        data = con.read()

    except (IOError, HTTPException):
        log('http error')
        return False  # let's just delete errors stuff when in cache mode

    contenttype = con.info().get('Content-Type', '').split(';')[0]
    if contenttype not in MIMETYPE['html'] and contenttype != 'text/plain':
        log('non-text page')
        return True

    out = readability(data, con.url)

    if options.hungry or count_words(out) > max(count_content, count_desc):
        item.push_content(out)
    else:
        log('link not bigger enough')

    return True
|
|
|
|
|
|
def Fetch(url, options):
    """ Downloads `url` and returns the feed found there

    The document is parsed with feeds.parse() when it looks like xml, or
    built by feedify when it supports the url. Otherwise, an html page is
    searched for a link to an rss/atom feed, which is then fetched instead
    (the same goes for itunes lookup urls).

    With `options.theforce` the download uses a delay of 0 instead of DELAY.

    Raises MorssException when no url is given, when the download fails or
    when no feed can be found.
    """

    # basic url clean-up
    if not url:
        raise MorssException('No url provided')

    # decode first: the string operations below would fail on bytes
    if isinstance(url, bytes):
        url = url.decode()

    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        log(url)

    url = url.replace(' ', '%20')

    # do some useful facebook work
    pre = feedify.pre_worker(url)
    if pre:
        url = pre
        log('url redirect')
        log(url)

    # fetch feed
    delay = DELAY

    if options.theforce:
        delay = 0

    try:
        con = custom_handler(('xml', 'html'), delay).open(url, timeout=TIMEOUT * 2)
        xml = con.read()

    except HTTPError as e:
        raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)

    except (IOError, HTTPException):
        raise MorssException('Error downloading feed')

    contenttype = con.info().get('Content-Type', '').split(';')[0]

    if url.startswith('https://itunes.apple.com/lookup?id='):
        link = json.loads(xml.decode('utf-8', 'replace'))['results'][0]['feedUrl']
        log('itunes redirect: %s' % link)
        return Fetch(link, options)

    # '?' has to be escaped, otherwise the pattern means "optional '<', then
    # 'xml'" and never matches an actual '<?xml' declaration
    elif re.match(br'\s*<\?xml', xml) is not None or contenttype in MIMETYPE['xml']:
        rss = feeds.parse(xml)

    elif feedify.supported(url):
        feed = feedify.Builder(url, xml)
        feed.build()
        rss = feed.feed

    elif contenttype in MIMETYPE['html']:
        match = lxml.html.fromstring(xml).xpath(
            "//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
        if len(match):
            link = urljoin(url, match[0])
            log('rss redirect: %s' % link)
            return Fetch(link, options)
        else:
            log('no-link html')
            raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')

    else:
        log('random page')
        log(contenttype)
        raise MorssException('Link provided is not a valid feed')

    return rss
|
|
|
|
|
|
def Gather(rss, url, options):
    """ Runs Fix() and Fill() on every item of the feed, and returns the feed

    Items are handled by THREADS worker threads, or in the calling thread
    with `options.mono`. Based on their rank and on the time elapsed since
    the call started:
    - past LIM_ITEM / LIM_TIME, items are removed from the feed,
    - past MAX_ITEM / MAX_TIME, items are filled in fast mode and removed
      when that fails (`options.cache` sets that time limit to 0),
    - other items are filled normally.
    A negative limit is never exceeded. Fill() is skipped altogether with
    `options.proxy`. With `options.ad`, one extra item is appended.
    """
    start_time = time.time()

    # custom settings
    lim_item = LIM_ITEM
    lim_time = LIM_TIME
    max_item = MAX_ITEM
    max_time = MAX_TIME
    threads = THREADS

    if options.cache:
        max_time = 0

    if options.mono:
        threads = 1

    def worker(i, item):
        if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
            log('dropped')
            item.remove()
            return

        item = Fix(item, url)

        if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
            if not options.proxy:
                if Fill(item, options, url, True) is False:
                    item.remove()
                    return
        else:
            if not options.proxy:
                Fill(item, options, url)

    def runner(queue):
        while True:
            value = queue.get()
            try:
                if value is None:
                    # sentinel: no work left, let the thread end
                    return
                worker(*value)
            except Exception as e:
                # exceptions have no `.message` on python 3: format the
                # exception itself
                log('Thread Error: %s' % e)
            finally:
                # always acknowledge, or queue.join() would wait forever
                queue.task_done()

    entries = list(enumerate(list(rss.items)))

    if threads <= 1:
        # single-threaded: no need for a queue or for helper threads
        for i, item in entries:
            worker(i, item)

    else:
        queue = Queue()

        for _ in range(threads):
            t = threading.Thread(target=runner, args=(queue,))
            t.daemon = True
            t.start()

        for entry in entries:
            queue.put(entry)

        queue.join()

        # one sentinel per thread, so that none of them outlives this call
        for _ in range(threads):
            queue.put(None)

    if options.ad:
        new = rss.items.append()
        new.title = "Are you hungry?"
        new.desc = "Eat some Galler chocolate :)"
        new.link = "http://www.galler.com/"
        new.time = "5 Oct 2013 22:42"

    log(len(rss.items))
    log(time.time() - start_time)

    return rss
|
|
|
|
|
|
def Before(rss, options):
    """ Filters the feed before its items get processed, and returns it

    With `options.empty` every item is removed. Otherwise, with
    `options.search`, items whose title doesn't contain that text are removed.
    """
    # iterate over a copy, since items get removed on the way
    for entry in list(rss.items):
        if options.empty or (options.search and options.search not in entry.title):
            entry.remove()

    return rss
|
|
|
|
|
|
def After(rss, options):
    """ Post-processes every item of the feed according to the options, and returns the feed

    Applied in this order: `strip` deletes description and content, `clip`
    puts the description on top of the content, the description is deleted
    unless `keep` or `proxy` is set, `nolink` removes the <a> tags from the
    content (keeping their text), `noref` blanks the link and `md` converts
    description and content with HTML2Text.
    """
    divider = "<br/><br/><center>* * *</center><br/><br/>"

    for entry in list(rss.items):
        if options.strip:
            del entry.desc
            del entry.content

        if options.clip and entry.desc and entry.content:
            entry.content = entry.desc + divider + entry.content
            del entry.desc

        if not (options.keep or options.proxy):
            del entry.desc

        if options.nolink and entry.content:
            tree = lxml.html.fromstring(entry.content)
            for anchor in tree.xpath('//a'):
                log(anchor.text_content())
                anchor.drop_tag()
            entry.content = lxml.etree.tostring(tree)

        if options.noref:
            entry.link = ''

        if options.md:
            converter = HTML2Text(baseurl=entry.link)
            converter.unicode_snob = True

            if entry.desc:
                entry.desc = converter.handle(entry.desc)
            if entry.content:
                entry.content = converter.handle(entry.content)

    return rss
|
|
|
|
|
|
def Format(rss, options):
    """ Serializes the feed in the format asked for by the options

    By order of precedence: `callback` (json wrapped in a call to that
    function), `json`, `csv`, `reader` (html), and xml by default. `indent`
    applies to json (4 spaces) and to xml (pretty-printed).

    Raises MorssException when the callback name contains anything but
    letters, digits and dots.
    """
    callback = options.callback

    if callback:
        if re.match(r'^[a-zA-Z0-9\.]+$', callback) is None:
            raise MorssException('Invalid callback var name')
        return '%s(%s)' % (callback, rss.tojson())

    if options.json:
        return rss.tojson(indent=4) if options.indent else rss.tojson()

    if options.csv:
        return rss.tocsv()

    if options.reader:
        return rss.tohtml()

    # plain xml
    xml_args = {'xml_declaration': True, 'encoding': 'UTF-8'}
    if options.indent:
        xml_args['pretty_print'] = True

    return rss.tostring(**xml_args)
|
|
|
|
|
|
def process(url, cache=None, options=None):
    """ Fetches the feed at `url`, processes it and returns it serialized

    `options` is wrapped into an Options object. A truthy `cache` replaces
    crawler.sqlite_default before anything gets downloaded.
    """
    options = Options(options or [])

    if cache:
        crawler.sqlite_default = cache

    feed = Fetch(url, options)
    feed = Before(feed, options)
    feed = Gather(feed, url, options)
    feed = After(feed, options)

    return Format(feed, options)
|
|
|
|
|
|
def cgi_app(environ, start_response):
    """ WSGI application turning the requested path into a processed feed

    The path is read from REQUEST_URI (or PATH_INFO) and looks like
    ':option:key=value/feed-url', the options part being optional. The
    response headers are sent once the feed has been fetched. The serialized
    feed is returned, unless the `silent` option is set, in which case
    nothing (None) is returned.
    """
    global DEBUG

    # get options
    raw = environ['REQUEST_URI'] if 'REQUEST_URI' in environ else environ['PATH_INFO']
    url = re.sub(r'^/?(morss.py|main.py|cgi/main.py)/', '', raw[1:])

    flags = []
    if url.startswith(':'):
        head, _, url = url.partition('/')
        flags = head.split(':')[1:]

    # init
    options = Options(filterOptions(parseOptions(flags)))
    DEBUG = options.debug

    # headers
    headers = {
        'status': '200 OK',
        'cache-control': 'max-age=%s' % DELAY,
    }

    if options.cors:
        headers['access-control-allow-origin'] = '*'

    # the first set option wins, xml being the fallback
    content_types = (
        (('html', 'reader'), 'text/html'),
        (('txt',), 'text/plain'),
        (('json',), 'application/json'),
        (('callback',), 'application/javascript'),
        (('csv',), 'text/csv'),
    )

    headers['content-type'] = 'text/xml'
    for names, mimetype in content_types:
        if any(getattr(options, name) for name in names):
            headers['content-type'] = mimetype
            break

    if headers['content-type'] == 'text/csv':
        headers['content-disposition'] = 'attachment; filename="feed.csv"'

    crawler.sqlite_default = os.path.join(os.getcwd(), 'morss-cache.db')

    # get the work done
    rss = Fetch(url, options)

    if headers['content-type'] == 'text/xml':
        # use the feed's own mimetype rather than the generic one
        headers['content-type'] = rss.mimetype

    start_response(headers['status'], list(headers.items()))

    rss = After(Gather(Before(rss, options), url, options), options)
    out = Format(rss, options)

    if options.silent:
        log('done')
        return None

    return out
|
|
|
|
|
|
def _wsgi_body(payload):
    """ Turns a response payload into the iterable of bytes WSGI servers expect """
    if not payload:
        return []

    if isinstance(payload, bytes):
        return [payload]

    if isinstance(payload, type(u'')):
        return [payload.encode('utf-8')]

    # already some kind of iterable: hand it over as is
    return payload


def cgi_wrapper(environ, start_response):
    """ WSGI entry point: serves the index page, and hands anything else to cgi_app()

    '' and 'index.html' are served from index.html, looked up in the
    directory given as last command-line argument when '--root' is among the
    arguments, and in the current directory otherwise. A missing file gives
    a 404.

    Any exception raised by cgi_app() (except KeyboardInterrupt and
    SystemExit) is logged and reported as a "500 Oops" plain-text response.
    """
    # simple http server for html and css
    files = {
        '': 'text/html',
        'index.html': 'text/html'}

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]
    else:
        url = environ['PATH_INFO'][1:]

    if url in files:
        headers = {}

        if url == '':
            url = 'index.html'

        if '--root' in sys.argv[1:]:
            path = os.path.join(sys.argv[-1], url)
        else:
            path = url

        try:
            with open(path, 'rb') as static_file:
                body = static_file.read()

        except IOError:
            headers['status'] = '404 Not found'
            start_response(headers['status'], list(headers.items()))
            return _wsgi_body('Error %s' % headers['status'])

        headers['status'] = '200 OK'
        headers['content-type'] = files[url]
        start_response(headers['status'], list(headers.items()))
        # a list, so that the server sends the body in one go instead of
        # iterating over it byte by byte
        return [body]

    # actual morss use
    try:
        return _wsgi_body(cgi_app(environ, start_response))

    except (KeyboardInterrupt, SystemExit):
        raise

    except Exception as e:
        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
        start_response(headers['status'], list(headers.items()), sys.exc_info())
        # exceptions have no `.message` on python 3: format the exception itself
        log('ERROR <%s>: %s' % (url, e), force=True)
        return _wsgi_body('An error happened:\n%s' % e)
|
|
|
|
|
|
def cli_app():
    """ Command-line entry point: `[options...] url`

    The last argument is the url of the feed, the ones before it are options
    ('key' or 'key=value'). The serialized feed is printed, unless the
    `silent` option is set.
    """
    global DEBUG

    url = sys.argv[-1]
    options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
    DEBUG = options.debug

    crawler.sqlite_default = os.path.expanduser('~/.cache/morss-cache.db')

    feed = Fetch(url, options)
    feed = Before(feed, options)
    feed = Gather(feed, url, options)
    feed = After(feed, options)
    out = Format(feed, options)

    if not options.silent:
        text = out
        if isinstance(out, bytes):
            text = out.decode('utf-8', 'replace')
        print(text)

    log('done')
|
|
|
|
|
|
def isInt(string):
    """ Tells whether int() accepts `string`, i.e. doesn't raise ValueError on it """
    try:
        int(string)
    except ValueError:
        return False
    else:
        return True
|
|
|
|
|
|
def main():
    """ Picks how to run, based on the environment and on the command line

    - 'REQUEST_URI' in the environment: run as a CGI script,
    - no argument, a number as first argument, or '--root' among the
      arguments: start the built-in http server, on the port given as first
      argument or on PORT,
    - anything else: run as a command-line app, errors being printed rather
      than raised.

    Raises MorssException when the port given isn't positive.
    """
    if 'REQUEST_URI' in os.environ:
        # mod_cgi
        wsgiref.handlers.CGIHandler().run(cgi_wrapper)

    elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
        # start internal (basic) http server

        if len(sys.argv) > 1 and isInt(sys.argv[1]):
            argPort = int(sys.argv[1])
            if argPort > 0:
                port = argPort
            else:
                raise MorssException('Port must be positive integer')

        else:
            port = PORT

        print('Serving http://localhost:%s/' % port)
        httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
        httpd.serve_forever()

    else:
        # as a CLI app
        try:
            cli_app()

        except (KeyboardInterrupt, SystemExit):
            raise

        except Exception as e:
            # exceptions have no `.message` on python 3: format the exception itself
            print('ERROR: %s' % e)


if __name__ == '__main__':
    main()
|