Compare commits: v1.1...bf86c1e962 (23 commits)

Commit SHA1s: bf86c1e962, d20f6237bd, 8a4d68d72c, e6811138fd, 35b702fffd,
4a88886767, 1653394cf7, a8a90cf414, bdbaf0f8a7, d0e447a2a6, e6817e01b4,
7c3091d64c, 37b4e144a9, bd4b7b5bb2, 68d920d4b5, 758ff404a8, 463530f02c,
ec0a28a91d, 421acb439d, 42c5d09ccb, 056de12484, 961a31141f, a7b01ee85e
@@ -48,6 +48,7 @@ You do need:

 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
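The new bs4 dependency is used further down in this change (in the new cgi_page handler in morss/morss.py) to repair badly-formatted HTML before handing it to lxml. A minimal sketch of that pattern, with a made-up snippet of broken markup:

# Sketch only: repair messy HTML with BeautifulSoup, then parse with lxml.
# The input string is invented for illustration.
import lxml.html
from bs4 import BeautifulSoup

broken = "<p>unclosed paragraph<div>stray <b>tag</div>"
repaired = BeautifulSoup(broken, 'lxml').prettify()   # well-formed HTML as text
doc = lxml.html.fromstring(repaired)                  # now safe for lxml to parse
print(lxml.html.tostring(doc))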
@@ -27,13 +27,14 @@ except NameError:

 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
+    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
     'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


-def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []

     # as per urllib2 source code, these Handelers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     handlers.append(HTTPEquivHandler())
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(DEFAULT_UA))
+    handlers.append(AutoRefererHandler())
-    if not basic:
-        handlers.append(AutoRefererHandler())

     handlers.append(EncodingFixHandler(encoding))

-    if accept:
-        handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+    if follow:
+        handlers.append(AlternateHandler(MIMETYPE[follow]))

     handlers.append(CacheHandler(force_min=delay))

@@ -198,43 +197,28 @@ class UAHandler(BaseHandler):

 class AutoRefererHandler(BaseHandler):
     def http_request(self, req):
-        req.add_unredirected_header('Referer', 'http://%s' % req.host)
+        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
         return req

     https_request = http_request


-class ContentNegociationHandler(BaseHandler):
-    " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
+class AlternateHandler(BaseHandler):
+    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

-    def __init__(self, accept=None, strict=False):
-        self.accept = accept
-        self.strict = strict
+    def __init__(self, follow=None):
+        self.follow = follow or []

-    def http_request(self, req):
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-
-            string = ','.join(self.accept)
-
-            if self.strict:
-                string += ',*/*;q=0.9'
-
-            req.add_unredirected_header('Accept', string)
-
-        return req
-
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types

             data = resp.read()
             links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')

             for link in links:
                 if link.get('type', '') in self.accept:
+                if link.get('type', '') in self.follow:
                     resp.code = 302
                     resp.msg = 'Moved Temporarily'
                     resp.headers['location'] = link.get('href')
@@ -246,7 +230,6 @@ class ContentNegociationHandler(BaseHandler):

         return resp

-    https_request = http_request
     https_response = http_response

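A quick sketch of what the reworked AlternateHandler does: when a fetch opened with follow= lands on an HTML page instead of a feed, it scans the first 10 kB for a <link rel="alternate"> of an accepted type and rewrites the response into a 302 redirect, which urllib's redirect handler then follows. Outside of the handler chain, the discovery step looks roughly like this (the sample HTML is made up):

# Sketch of the feed-discovery step AlternateHandler relies on.
# The HTML snippet and the 'follow' list are illustrative only.
import lxml.html

follow = ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml']
data = b"""<html><head>
<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>
</head><body>regular html page</body></html>"""

links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
    if link.get('type', '') in follow:
        print('redirect to', link.get('href'))   # the handler sets resp.code = 302 here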
morss/morss.py
@@ -10,6 +10,7 @@ import re

 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup

 from . import feeds
 from . import feedify
@@ -18,19 +19,20 @@ from . import readabilite

 import wsgiref.simple_server
 import wsgiref.handlers
+import cgitb


 try:
     # python 2
     from Queue import Queue
     from httplib import HTTPException
-    from urllib import quote_plus
+    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from queue import Queue
     from http.client import HTTPException
-    from urllib.parse import quote_plus
+    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs

 LIM_ITEM = 100 # deletes what's beyond
@@ -44,7 +46,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080

-PROTOCOL = ['http', 'https', 'ftp']
+PROTOCOL = ['http', 'https']


 def filterOptions(options):
@@ -66,6 +68,7 @@ def log(txt, force=False):
     if DEBUG or force:
         if 'REQUEST_URI' in os.environ:
             open('morss.log', 'a').write("%s\n" % repr(txt))

         else:
             print(repr(txt))

@@ -73,6 +76,7 @@ def log(txt, force=False):
 def len_html(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content())

     else:
         return 0

@@ -80,6 +84,7 @@ def len_html(txt):
 def count_words(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content().split())

     return 0


@@ -88,12 +93,14 @@ class Options:
         if len(args):
             self.options = args
             self.options.update(options or {})

         else:
             self.options = options or {}

     def __getattr__(self, key):
         if key in self.options:
             return self.options[key]

         else:
             return False

@@ -107,17 +114,23 @@ class Options:
 def parseOptions(options):
     """ Turns ['md=True'] into {'md':True} """
     out = {}

     for option in options:
         split = option.split('=', 1)

         if len(split) > 1:
             if split[0].lower() == 'true':
                 out[split[0]] = True

             elif split[0].lower() == 'false':
                 out[split[0]] = False

             else:
                 out[split[0]] = split[1]

         else:
             out[split[0]] = True

     return out

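As a quick illustration of the option parsing shown above (the option names are invented):

# Illustration only; option names are made up.
parseOptions(['csv', 'indent=4'])
# -> {'csv': True, 'indent': '4'}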
@@ -158,6 +171,11 @@ def ItemFix(item, feedurl='/'):
         item.link = parse_qs(urlparse(item.link).query)['url'][0]
         log(item.link)

+    # pocket
+    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
+        item.link = parse_qs(urlparse(item.link).query)['url'][0]
+        log(item.link)
+
     # facebook
     if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
         item.link = parse_qs(urlparse(item.link).query)['u'][0]
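The new pocket rule follows the same pattern as the other ItemFix rules: the real target is carried in a query-string parameter, so parse_qs pulls it back out. A standalone illustration with a made-up URL:

# Illustration only; the wrapped URL is invented.
from urllib.parse import urlparse, parse_qs   # urlparse package on Python 2

wrapped = 'https://getpocket.com/redirect?url=http%3A%2F%2Fexample.com%2Farticle'
print(parse_qs(urlparse(wrapped).query)['url'][0])
# http://example.com/article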
@@ -208,6 +226,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match):
             link = match[0]
             log(link)

         else:
             link = None

@@ -217,6 +236,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
             link = match[0]
             log(link)

         else:
             link = None

@@ -232,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2

     try:
-        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()

     except (IOError, HTTPException) as e:
@@ -284,24 +304,27 @@ def ItemAfter(item, options):
     return item


-def FeedFetch(url, options):
-    # basic url clean-up
+def UrlFix(url):
     if url is None:
         raise MorssException('No url provided')

+    if isinstance(url, bytes):
+        url = url.decode()
+
     if urlparse(url).scheme not in PROTOCOL:
         url = 'http://' + url
         log(url)

     url = url.replace(' ', '%20')

-    if isinstance(url, bytes):
-        url = url.decode()
+    return url


+def FeedFetch(url, options):
     # allow for code execution for feedify
     pre = feedify.pre_worker(url)
     if pre:
-        url = pre
+        url = UrlFix(pre)
         log('url redirect')
         log(url)

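UrlFix now concentrates the URL normalisation that used to sit at the top of FeedFetch: decode bytes, default to http:// when the scheme is missing or unsupported, and escape spaces. Roughly (example inputs are made up):

# Illustration only; example inputs are invented.
UrlFix(b'example.com/my feed')     # -> 'http://example.com/my%20feed'
UrlFix('https://example.com/rss')  # -> 'https://example.com/rss' (unchanged)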
@@ -312,8 +335,7 @@ def FeedFetch(url, options):
         delay = 0

     try:
-        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
-            encoding=options.encoding, basic=not options.items) \
+        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
             .open(url, timeout=TIMEOUT * 2)
         xml = con.read()

@@ -324,20 +346,24 @@ def FeedFetch(url, options):

     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, url, contenttype)
-        feed.rule
+        rss = feeds.FeedHTML(xml)
+
+        rss.rules['title'] = options.title if options.title else '//head/title'
+        rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'

         rss.rules['items'] = options.items

-        if options.item_title:
-            rss.rules['item_title'] = options.item_title
-        if options.item_link:
-            rss.rules['item_link'] = options.item_link
+        rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
+        rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
+
         if options.item_content:
             rss.rules['item_content'] = options.item_content

         if options.item_time:
             rss.rules['item_time'] = options.item_time

+        rss = rss.convert(feeds.FeedXML)
+
     else:
         try:
             rss = feeds.parse(xml, url, contenttype)
@@ -375,6 +401,7 @@ def FeedGather(rss, url, options):
             value = queue.get()
             try:
                 worker(*value)

             except Exception as e:
                 log('Thread Error: %s' % e.message)
             queue.task_done()
@@ -414,6 +441,7 @@ def FeedGather(rss, url, options):
     for i, item in enumerate(list(rss.items)):
         if threads == 1:
             worker(*[i, item])

         else:
             queue.put([i, item])

@@ -433,37 +461,38 @@ def FeedGather(rss, url, options):
     return rss


-def FeedFormat(rss, options):
+def FeedFormat(rss, options, encoding='utf-8'):
     if options.callback:
         if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
-            return '%s(%s)' % (options.callback, rss.tojson())
+            out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
+            return out if encoding == 'unicode' else out.encode(encoding)

         else:
             raise MorssException('Invalid callback var name')

     elif options.json:
         if options.indent:
-            return rss.tojson(encoding='UTF-8', indent=4)
+            return rss.tojson(encoding=encoding, indent=4)

         else:
-            return rss.tojson(encoding='UTF-8')
+            return rss.tojson(encoding=encoding)

     elif options.csv:
-        return rss.tocsv(encoding='UTF-8')
+        return rss.tocsv(encoding=encoding)

     elif options.reader:
         if options.indent:
-            return rss.tohtml(encoding='UTF-8', pretty_print=True)
+            return rss.tohtml(encoding=encoding, pretty_print=True)

         else:
-            return rss.tohtml(encoding='UTF-8')
+            return rss.tohtml(encoding=encoding)

     else:
         if options.indent:
-            return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
+            return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)

         else:
-            return rss.torss(xml_declaration=True, encoding='UTF-8')


 def process(url, cache=None, options=None):
+            return rss.torss(xml_declaration=True, encoding=encoding)


 def process(url, cache=None, options=None):
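FeedFormat now threads the output encoding through instead of hard-coding UTF-8, and the JSONP branch builds the payload as text first, encoding it only when a byte encoding is requested. A rough usage sketch, assuming rss and options are set up as in process():

# Sketch only; `rss` and `options` are assumed to come from FeedFetch/FeedGather.
out = FeedFormat(rss, options)                        # default encoding='utf-8'
jsonp = FeedFormat(rss, options, encoding='unicode')  # the JSONP/callback branch then returns a str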
@@ -475,14 +504,16 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)

+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)

     return FeedFormat(rss, options)


-def cgi_app(environ, start_response):
+def cgi_parse_environ(environ):
     # get options

     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
@@ -496,7 +527,7 @@ def cgi_app(environ, start_response):
     if url.startswith(':'):
         split = url.split('/', 1)

-        options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]

         if len(split) > 1:
             url = split[1]
@@ -504,15 +535,22 @@ def cgi_app(environ, start_response):
             url = ''

     else:
-        options = []
+        raw_options = []

     # init
-    options = Options(filterOptions(parseOptions(options)))
-    headers = {}
+    options = Options(filterOptions(parseOptions(raw_options)))

     global DEBUG
     DEBUG = options.debug

+    return (url, options)
+
+
+def cgi_app(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    headers = {}
+
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
@@ -537,6 +575,7 @@ def cgi_app(environ, start_response):
         crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

     # get the work done
+    url = UrlFix(url)
     rss = FeedFetch(url, options)

     if headers['content-type'] == 'text/xml':
@@ -547,18 +586,42 @@ def cgi_app(environ, start_response):
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)

-    if not options.silent:
-        return out
+    if options.silent:
+        return ['']
+
+    else:
+        return [out]


-def cgi_wrapper(environ, start_response):
-    # simple http server for html and css
+def middleware(func):
+    " Decorator to turn a function into a wsgi middleware "
+    # This is called when parsing the code
+
+    def app_builder(app):
+        # This is called when doing app = cgi_wrapper(app)
+
+        def app_wrap(environ, start_response):
+            # This is called when a http request is being processed
+
+            return func(environ, start_response, app)
+
+        return app_wrap
+
+    return app_builder
+
+
+@middleware
+def cgi_file_handler(environ, start_response, app):
+    " Simple HTTP server to serve static files (.html, .css, etc.) "

     files = {
         '': 'text/html',
-        'index.html': 'text/html'}
+        'index.html': 'text/html',
+        'sheet.xsl': 'text/xsl'}

     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]

     else:
         url = environ['PATH_INFO'][1:]

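The middleware decorator turns each three-argument handler into a conventional WSGI wrapper: calling the decorated function with an inner app returns a new WSGI app that receives that inner app as its third argument, which is exactly how main() stacks the layers further down. A minimal self-contained sketch (the pass_through handler and hello app are invented for illustration):

# Minimal sketch of how @middleware composes; names are invented.
@middleware
def pass_through(environ, start_response, app):
    # a do-nothing middleware: just delegate to the wrapped app
    return app(environ, start_response)

def hello(environ, start_response):
    start_response('200 OK', [('content-type', 'text/plain')])
    return [b'hello']

app = pass_through(hello)   # app_builder(hello) -> app_wrap, itself a plain WSGI app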
@@ -587,16 +650,80 @@ def cgi_wrapper(environ, start_response):
         start_response(headers['status'], list(headers.items()))
         return ['Error %s' % headers['status']]

-    # actual morss use
+    else:
+        return app(environ, start_response)
+
+
+def cgi_page(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+
+    # get page
+    PROTOCOL = ['http', 'https']
+
+    if urlparse(url).scheme not in ['http', 'https']:
+        url = 'http://' + url
+
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html.make_links_absolute(con.geturl())
+
+        kill_tags = ['script', 'iframe', 'noscript']
+
+        for tag in kill_tags:
+            for elem in html.xpath('//'+tag):
+                elem.getparent().remove(elem)
+
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+
+    else:
+        output = None
+
+    # return html page
+    headers = {'status': '200 OK', 'content-type': 'text/html'}
+    start_response(headers['status'], list(headers.items()))
+    return [output]
+
+
+dispatch_table = {
+    'getpage': cgi_page
+    }
+
+
+@middleware
+def cgi_dispatcher(environ, start_response, app):
+    url, options = cgi_parse_environ(environ)
+
+    for key in dispatch_table.keys():
+        if key in options:
+            return dispatch_table[key](environ, start_response)
+
+    return app(environ, start_response)
+
+
+@middleware
+def cgi_error_handler(environ, start_response, app):
     try:
-        return [cgi_app(environ, start_response) or '(empty)']
+        return app(environ, start_response)

     except (KeyboardInterrupt, SystemExit):
         raise

     except Exception as e:
-        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
+        headers = {'status': '500 Oops', 'content-type': 'text/html'}
         start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR <%s>: %s' % (url, e.message), force=True)
-        return ['An error happened:\n%s' % e.message]
+        log('ERROR: %s' % repr(e), force=True)
+        return [cgitb.html(sys.exc_info())]
+
+
+@middleware
+def cgi_encode(environ, start_response, app):
+    out = app(environ, start_response)
+    return [x if isinstance(x, bytes) else x.encode('utf-8') for x in out]


 def cli_app():
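cgi_encode exists because a WSGI response body must be an iterable of bytes (PEP 3333), while the layers above sometimes return str (the JSONP/unicode path, error pages). Placed as the outermost layer, it encodes any stray str just before the body leaves the stack; a toy illustration:

# Illustration only: what the outermost cgi_encode layer does to a mixed body.
body = ['already bytes'.encode('utf-8'), 'still a str']
print([x if isinstance(x, bytes) else x.encode('utf-8') for x in body])
# [b'already bytes', b'still a str']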
@@ -608,6 +735,7 @@ def cli_app():

     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)
@@ -622,6 +750,7 @@ def isInt(string):
     try:
         int(string)
         return True

     except ValueError:
         return False

@@ -629,7 +758,13 @@ def isInt(string):
 def main():
     if 'REQUEST_URI' in os.environ:
         # mod_cgi
-        wsgiref.handlers.CGIHandler().run(cgi_wrapper)
+
+        app = cgi_app
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+
+        wsgiref.handlers.CGIHandler().run(app)

     elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
         # start internal (basic) http server
@@ -638,22 +773,31 @@ def main():
             argPort = int(sys.argv[1])
             if argPort > 0:
                 port = argPort

             else:
                 raise MorssException('Port must be positive integer')

         else:
             port = PORT

-        print('Serving http://localhost:%s/'%port)
-        httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
+        app = cgi_app
+        app = cgi_file_handler(app)
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+
+        print('Serving http://localhost:%s/' % port)
+        httpd = wsgiref.simple_server.make_server('', port, app)
         httpd.serve_forever()

     else:
         # as a CLI app
         try:
             cli_app()

         except (KeyboardInterrupt, SystemExit):
             raise

         except Exception as e:
             print('ERROR: %s' % e.message)

@@ -93,6 +93,7 @@ def score_node(node):
     class_id = node.get('class', '') + node.get('id', '')

     if (isinstance(node, lxml.html.HtmlComment)
+            or isinstance(node, lxml.html.HtmlProcessingInstruction)
             or node.tag in tags_bad
             or regex_bad.search(class_id)):
         return 0
@@ -1,4 +1,5 @@
 lxml
+bs4
 python-dateutil <= 1.5
 chardet
 pymysql
@@ -13,6 +13,7 @@
 body {
     overflow-wrap: anywhere;
     word-wrap: anywhere;
+    font-family: sans;
 }

 #url {