Compare commits

30 commits: v1.1...e136b0feb2

Commits (SHA1):

- e136b0feb2
- 6cf32af6c0
- 568e7d7dd2
- 3617f86e9d
- d90756b337
- 40c69f17d2
- 99461ea185
- bf86c1e962
- d20f6237bd
- 8a4d68d72c
- e6811138fd
- 35b702fffd
- 4a88886767
- 1653394cf7
- a8a90cf414
- bdbaf0f8a7
- d0e447a2a6
- e6817e01b4
- 7c3091d64c
- 37b4e144a9
- bd4b7b5bb2
- 68d920d4b5
- 758ff404a8
- 463530f02c
- ec0a28a91d
- 421acb439d
- 42c5d09ccb
- 056de12484
- 961a31141f
- a7b01ee85e
@@ -48,6 +48,7 @@ You do need:

- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
@@ -76,7 +77,6 @@ The arguments are:

- `json`: output as JSON
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss drops the feed description whenever the full content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview and they might believe morss doesn't work), but with this argument the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
    - `csv`: export to csv
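These options are not command-line specific: per `cgi_parse_environ()` and `parseOptions()` further down in this compare (in `morss/morss.py`), web requests pass them as a `:`-separated prefix in front of the target feed URL, with `key=value` pairs for options that take a value. A minimal sketch of that mapping, using made-up option strings and mirroring (not importing) the parsing code shown below:

```python
# Sketch only: how an option prefix such as ':clip:json:search=python'
# maps onto the arguments listed above (mirrors parseOptions() below).
raw = ':clip:json:search=python'
raw_options = raw.split(':')[1:]        # ['clip', 'json', 'search=python']

options = {}
for option in raw_options:
    key, _, value = option.partition('=')
    options[key] = value if value else True

print(options)  # {'clip': True, 'json': True, 'search': 'python'}
```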
@@ -27,13 +27,14 @@ except NameError:

MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
def custom_handler(follow=None, delay=None, encoding=None):
    handlers = []

    # as per urllib2 source code, these Handelers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
    handlers.append(HTTPEquivHandler())
    handlers.append(HTTPRefreshHandler())
    handlers.append(UAHandler(DEFAULT_UA))

    if not basic:
        handlers.append(AutoRefererHandler())
    handlers.append(AutoRefererHandler())

    handlers.append(EncodingFixHandler(encoding))

    if accept:
        handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
    if follow:
        handlers.append(AlternateHandler(MIMETYPE[follow]))

    handlers.append(CacheHandler(force_min=delay))
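The reworked signature is exercised later in this same compare (`morss/morss.py` calls `crawler.custom_handler(follow='rss', ...)` and `crawler.custom_handler(delay=..., encoding=...)`). A hedged sketch of the call pattern, assuming the handler module is importable as `morss.crawler` (as the relative imports suggest) and using a made-up URL and timeout:

```python
from morss import crawler

# Build an opener that follows <link rel="alternate"> hints towards RSS/Atom,
# with the default cache behaviour and no forced encoding.
con = crawler.custom_handler(follow='rss').open('https://example.com/', timeout=10)
data = con.read()
```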
@@ -198,43 +197,28 @@ class UAHandler(BaseHandler):

class AutoRefererHandler(BaseHandler):
    def http_request(self, req):
        req.add_unredirected_header('Referer', 'http://%s' % req.host)
        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
        return req

    https_request = http_request


class ContentNegociationHandler(BaseHandler):
    " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
class AlternateHandler(BaseHandler):
    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        if self.accept is not None:
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            string = ','.join(self.accept)

            if self.strict:
                string += ',*/*;q=0.9'

            req.add_unredirected_header('Accept', string)

        return req
    def __init__(self, follow=None):
        self.follow = follow or []

    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]
        if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types

            data = resp.read()
            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')

            for link in links:
                if link.get('type', '') in self.accept:
                if link.get('type', '') in self.follow:
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = link.get('href')
@@ -246,7 +230,6 @@ class ContentNegociationHandler(BaseHandler):

        return resp

    https_request = http_request
    https_response = http_response
@@ -384,7 +367,7 @@ class CacheHandler(BaseHandler):

        elif self.force_min is None and ('no-cache' in cc_list
                                        or 'no-store' in cc_list
                                        or ('private' in cc_list and not self.private)):
                                        or ('private' in cc_list and not self.private_cache)):
            # kindly follow web servers indications, refresh
            return None

@@ -419,7 +402,7 @@ class CacheHandler(BaseHandler):

        cc_list = [x for x in cache_control if '=' not in x]

        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
            # kindly follow web servers indications
            return resp
@@ -15,6 +15,7 @@ import dateutil.parser

from copy import deepcopy

import lxml.html
from bs4 import BeautifulSoup

json.encoder.c_make_encoder = None

@@ -441,7 +442,7 @@ class ParserHTML(ParserXML):

    def parse(self, raw):
        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
        return etree.fromstring(raw, parser)
        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)

    def tostring(self, encoding='unicode', **k):
        return lxml.html.tostring(self.root, encoding=encoding, **k)
morss/morss.py
@@ -10,6 +10,7 @@ import re

import lxml.etree
import lxml.html
from bs4 import BeautifulSoup

from . import feeds
from . import feedify

@@ -18,19 +19,20 @@ from . import readabilite

import wsgiref.simple_server
import wsgiref.handlers
import cgitb


try:
    # python 2
    from Queue import Queue
    from httplib import HTTPException
    from urllib import quote_plus
    from urllib import unquote
    from urlparse import urlparse, urljoin, parse_qs

except ImportError:
    # python 3
    from queue import Queue
    from http.client import HTTPException
    from urllib.parse import quote_plus
    from urllib.parse import unquote
    from urllib.parse import urlparse, urljoin, parse_qs


LIM_ITEM = 100 # deletes what's beyond
@@ -44,7 +46,7 @@ THREADS = 10 # number of threads (1 for single-threaded)

DEBUG = False
PORT = 8080

PROTOCOL = ['http', 'https', 'ftp']
PROTOCOL = ['http', 'https']


def filterOptions(options):
@@ -52,7 +54,7 @@ def filterOptions(options):

    # example of filtering code below

    #allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
    #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
    #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])

    #return filtered
@@ -66,6 +68,7 @@ def log(txt, force=False):

    if DEBUG or force:
        if 'REQUEST_URI' in os.environ:
            open('morss.log', 'a').write("%s\n" % repr(txt))

        else:
            print(repr(txt))

@@ -73,6 +76,7 @@ def log(txt, force=False):

def len_html(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content())

    else:
        return 0

@@ -80,6 +84,7 @@ def len_html(txt):

def count_words(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content().split())

    return 0
@@ -88,12 +93,14 @@ class Options:

        if len(args):
            self.options = args
            self.options.update(options or {})

        else:
            self.options = options or {}

    def __getattr__(self, key):
        if key in self.options:
            return self.options[key]

        else:
            return False
@@ -107,17 +114,23 @@ class Options:

def parseOptions(options):
    """ Turns ['md=True'] into {'md':True} """
    out = {}

    for option in options:
        split = option.split('=', 1)

        if len(split) > 1:
            if split[0].lower() == 'true':
                out[split[0]] = True

            elif split[0].lower() == 'false':
                out[split[0]] = False

            else:
                out[split[0]] = split[1]

        else:
            out[split[0]] = True

    return out
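A quick usage sketch (mine, not part of the diff) of the behaviour encoded above, assuming the module is importable as `morss.morss`:

```python
from morss.morss import parseOptions

print(parseOptions(['clip', 'items=//article']))
# -> {'clip': True, 'items': '//article'}
```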
@@ -158,6 +171,11 @@ def ItemFix(item, feedurl='/'):

        item.link = parse_qs(urlparse(item.link).query)['url'][0]
        log(item.link)

    # pocket
    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
        item.link = parse_qs(urlparse(item.link).query)['url'][0]
        log(item.link)

    # facebook
    if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
        item.link = parse_qs(urlparse(item.link).query)['u'][0]
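A worked example of the query-string rewrite used above, with the Python 3 imports and an invented wrapped URL:

```python
from urllib.parse import urlparse, parse_qs

# A getpocket.com redirect wrapping a (hypothetical) article URL
link = 'https://getpocket.com/redirect?url=https%3A%2F%2Fexample.com%2Fpost'
print(parse_qs(urlparse(link).query)['url'][0])
# -> https://example.com/post
```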
@@ -208,6 +226,7 @@ def ItemFill(item, options, feedurl='/', fast=False):

        if len(match):
            link = match[0]
            log(link)

        else:
            link = None

@@ -217,6 +236,7 @@ def ItemFill(item, options, feedurl='/', fast=False):

        if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)

        else:
            link = None

@@ -232,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):

        delay = -2

    try:
        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
        data = con.read()

    except (IOError, HTTPException) as e:

@@ -268,9 +288,6 @@ def ItemAfter(item, options):

        item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
        del item.desc

    if not options.keep and not options.proxy:
        del item.desc

    if options.nolink and item.content:
        content = lxml.html.fromstring(item.content)
        for link in content.xpath('//a'):
@@ -284,24 +301,27 @@ def ItemAfter(item, options):

    return item


def FeedFetch(url, options):
    # basic url clean-up
def UrlFix(url):
    if url is None:
        raise MorssException('No url provided')

    if isinstance(url, bytes):
        url = url.decode()

    if urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        log(url)

    url = url.replace(' ', '%20')

    if isinstance(url, bytes):
        url = url.decode()
    return url


def FeedFetch(url, options):
    # allow for code execution for feedify
    pre = feedify.pre_worker(url)
    if pre:
        url = pre
        url = UrlFix(pre)
        log('url redirect')
        log(url)
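A few illustrative inputs and outputs for the new `UrlFix()` helper introduced above, with hypothetical URLs (assuming `morss.morss` is importable):

```python
from morss.morss import UrlFix

assert UrlFix(b'https://example.com/feed') == 'https://example.com/feed'
assert UrlFix('example.com/my feed') == 'http://example.com/my%20feed'
# UrlFix(None) raises MorssException('No url provided')
```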
@@ -312,8 +332,7 @@ def FeedFetch(url, options):

        delay = 0

    try:
        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
            encoding=options.encoding, basic=not options.items) \
        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
            .open(url, timeout=TIMEOUT * 2)
        xml = con.read()
@@ -324,20 +343,24 @@ def FeedFetch(url, options):

    if options.items:
        # using custom rules
        rss = feeds.FeedHTML(xml, url, contenttype)
        feed.rule
        rss = feeds.FeedHTML(xml)

        rss.rules['title'] = options.title if options.title else '//head/title'
        rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'

        rss.rules['items'] = options.items

        if options.item_title:
            rss.rules['item_title'] = options.item_title
        if options.item_link:
            rss.rules['item_link'] = options.item_link
        rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
        rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'

        if options.item_content:
            rss.rules['item_content'] = options.item_content

        if options.item_time:
            rss.rules['item_time'] = options.item_time

        rss = rss.convert(feeds.FeedXML)

    else:
        try:
            rss = feeds.parse(xml, url, contenttype)
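To make the rule defaults above concrete, here is a hedged example of the kind of user-supplied XPath rules this branch consumes when `items` is set; the page structure and selectors are invented, and anything left out falls back to the defaults shown above:

```python
# Hypothetical options for turning a plain HTML listing page into a feed.
options = {
    'items': '//div[@class="post"]',          # one node per feed item
    'item_title': './/h2',                    # default: './/a|.'
    'item_link': './/h2/a/@href',             # default: './@href|.//a/@href'
    'item_content': './/div[@class="body"]',  # optional
    'item_time': './/time/@datetime',         # optional
}
```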
@@ -375,6 +398,7 @@ def FeedGather(rss, url, options):

            value = queue.get()
            try:
                worker(*value)

            except Exception as e:
                log('Thread Error: %s' % e.message)
            queue.task_done()

@@ -414,6 +438,7 @@ def FeedGather(rss, url, options):

    for i, item in enumerate(list(rss.items)):
        if threads == 1:
            worker(*[i, item])

        else:
            queue.put([i, item])
@@ -433,37 +458,38 @@ def FeedGather(rss, url, options):

    return rss


def FeedFormat(rss, options):
def FeedFormat(rss, options, encoding='utf-8'):
    if options.callback:
        if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
            return '%s(%s)' % (options.callback, rss.tojson())
            out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
            return out if encoding == 'unicode' else out.encode(encoding)

        else:
            raise MorssException('Invalid callback var name')

    elif options.json:
        if options.indent:
            return rss.tojson(encoding='UTF-8', indent=4)
            return rss.tojson(encoding=encoding, indent=4)

        else:
            return rss.tojson(encoding='UTF-8')
            return rss.tojson(encoding=encoding)

    elif options.csv:
        return rss.tocsv(encoding='UTF-8')
        return rss.tocsv(encoding=encoding)

    elif options.reader:
        if options.indent:
            return rss.tohtml(encoding='UTF-8', pretty_print=True)
            return rss.tohtml(encoding=encoding, pretty_print=True)

        else:
            return rss.tohtml(encoding='UTF-8')
            return rss.tohtml(encoding=encoding)

    else:
        if options.indent:
            return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
            return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)

        else:
            return rss.torss(xml_declaration=True, encoding='UTF-8')
            return rss.torss(xml_declaration=True, encoding=encoding)


def process(url, cache=None, options=None):
@@ -475,14 +501,16 @@ def process(url, cache=None, options=None):

    if cache:
        crawler.default_cache = crawler.SQLiteCache(cache)

    url = UrlFix(url)
    rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)

    return FeedFormat(rss, options)


def cgi_app(environ, start_response):
def cgi_parse_environ(environ):
    # get options

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]
    else:

@@ -496,7 +524,7 @@ def cgi_app(environ, start_response):

    if url.startswith(':'):
        split = url.split('/', 1)

        options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]

        if len(split) > 1:
            url = split[1]
@@ -504,15 +532,22 @@ def cgi_app(environ, start_response):

            url = ''

    else:
        options = []
        raw_options = []

    # init
    options = Options(filterOptions(parseOptions(options)))
    headers = {}
    options = Options(filterOptions(parseOptions(raw_options)))

    global DEBUG
    DEBUG = options.debug

    return (url, options)


def cgi_app(environ, start_response):
    url, options = cgi_parse_environ(environ)

    headers = {}

    # headers
    headers['status'] = '200 OK'
    headers['cache-control'] = 'max-age=%s' % DELAY
@@ -537,6 +572,7 @@ def cgi_app(environ, start_response):

    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

    # get the work done
    url = UrlFix(url)
    rss = FeedFetch(url, options)

    if headers['content-type'] == 'text/xml':
@@ -547,18 +583,42 @@ def cgi_app(environ, start_response):

    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options)

    if not options.silent:
        return out
    if options.silent:
        return ['']

    else:
        return [out]


def cgi_wrapper(environ, start_response):
    # simple http server for html and css
def middleware(func):
    " Decorator to turn a function into a wsgi middleware "
    # This is called when parsing the code

    def app_builder(app):
        # This is called when doing app = cgi_wrapper(app)

        def app_wrap(environ, start_response):
            # This is called when a http request is being processed

            return func(environ, start_response, app)

        return app_wrap

    return app_builder


@middleware
def cgi_file_handler(environ, start_response, app):
    " Simple HTTP server to serve static files (.html, .css, etc.) "

    files = {
        '': 'text/html',
        'index.html': 'text/html'}
        'index.html': 'text/html',
        'sheet.xsl': 'text/xsl'}

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]

    else:
        url = environ['PATH_INFO'][1:]
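For orientation, the `middleware` decorator above turns each `cgi_*` function into a wrapper factory, so the WSGI stack is built by nesting calls; the sketch below mirrors the wsgiref server branch of `main()` further down in this diff:

```python
# Innermost app first; the last wrapper applied is outermost and sees each request first.
app = cgi_app
app = cgi_file_handler(app)   # serve index.html / sheet.xsl directly
app = cgi_dispatcher(app)     # route special options such as 'getpage'
app = cgi_error_handler(app)  # turn exceptions into a cgitb error page
app = cgi_encode(app)         # ensure the response body is bytes
```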
@@ -587,16 +647,80 @@ def cgi_wrapper(environ, start_response):

            start_response(headers['status'], list(headers.items()))
            return ['Error %s' % headers['status']]

    # actual morss use
    else:
        return app(environ, start_response)


def cgi_page(environ, start_response):
    url, options = cgi_parse_environ(environ)

    # get page
    PROTOCOL = ['http', 'https']

    if urlparse(url).scheme not in ['http', 'https']:
        url = 'http://' + url

    con = crawler.custom_handler().open(url)
    data = con.read()

    contenttype = con.info().get('Content-Type', '').split(';')[0]

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
        html.make_links_absolute(con.geturl())

        kill_tags = ['script', 'iframe', 'noscript']

        for tag in kill_tags:
            for elem in html.xpath('//'+tag):
                elem.getparent().remove(elem)

        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

    else:
        output = None

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html'}
    start_response(headers['status'], list(headers.items()))
    return [output]


dispatch_table = {
    'getpage': cgi_page
    }
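An illustrative request showing how `dispatch_table` is reached (host, port and target page are invented): `cgi_parse_environ()` extracts `getpage` from the `:`-prefixed segment, `cgi_dispatcher()` below matches it against the table, and `cgi_page()` returns the fetched page with `<script>`, `<iframe>` and `<noscript>` stripped and links made absolute.

```python
# Sketch only: example request path for the 'getpage' dispatch route.
#   http://localhost:8080/:getpage/https://example.com/article
# -> options == {'getpage': True}, so the request is handed to cgi_page().
```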
@middleware
def cgi_dispatcher(environ, start_response, app):
    url, options = cgi_parse_environ(environ)

    for key in dispatch_table.keys():
        if key in options:
            return dispatch_table[key](environ, start_response)

    return app(environ, start_response)


@middleware
def cgi_error_handler(environ, start_response, app):
    try:
        return [cgi_app(environ, start_response) or '(empty)']
        return app(environ, start_response)

    except (KeyboardInterrupt, SystemExit):
        raise

    except Exception as e:
        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
        headers = {'status': '500 Oops', 'content-type': 'text/html'}
        start_response(headers['status'], list(headers.items()), sys.exc_info())
        log('ERROR <%s>: %s' % (url, e.message), force=True)
        return ['An error happened:\n%s' % e.message]
        log('ERROR: %s' % repr(e), force=True)
        return [cgitb.html(sys.exc_info())]


@middleware
def cgi_encode(environ, start_response, app):
    out = app(environ, start_response)
    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]


def cli_app():
@@ -608,6 +732,7 @@ def cli_app():

    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

    url = UrlFix(url)
    rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options)

@@ -622,6 +747,7 @@ def isInt(string):

    try:
        int(string)
        return True

    except ValueError:
        return False
@@ -629,7 +755,13 @@ def isInt(string):

def main():
    if 'REQUEST_URI' in os.environ:
        # mod_cgi
        wsgiref.handlers.CGIHandler().run(cgi_wrapper)

        app = cgi_app
        app = cgi_dispatcher(app)
        app = cgi_error_handler(app)
        app = cgi_encode(app)

        wsgiref.handlers.CGIHandler().run(app)

    elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
        # start internal (basic) http server
@@ -638,22 +770,31 @@ def main():

            argPort = int(sys.argv[1])
            if argPort > 0:
                port = argPort

            else:
                raise MorssException('Port must be positive integer')

        else:
            port = PORT

        print('Serving http://localhost:%s/'%port)
        httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
        app = cgi_app
        app = cgi_file_handler(app)
        app = cgi_dispatcher(app)
        app = cgi_error_handler(app)
        app = cgi_encode(app)

        print('Serving http://localhost:%s/' % port)
        httpd = wsgiref.simple_server.make_server('', port, app)
        httpd.serve_forever()

    else:
        # as a CLI app
        try:
            cli_app()

        except (KeyboardInterrupt, SystemExit):
            raise

        except Exception as e:
            print('ERROR: %s' % e.message)
@@ -1,5 +1,6 @@

import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
import re

@@ -9,7 +10,7 @@ def parse(data, encoding=None):

    else:
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)

    return lxml.html.fromstring(data, parser=parser)
    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)


def count_words(string):

@@ -62,7 +63,7 @@ regex_good = re.compile('|'.join(class_good), re.I)

tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
    'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
    'button', 'footer']
    'button', 'footer', 'link', 'meta']

tags_bad = tags_junk + ['a', 'aside']
@@ -93,10 +94,18 @@ def score_node(node):

    class_id = node.get('class', '') + node.get('id', '')

    if (isinstance(node, lxml.html.HtmlComment)
            or node.tag in tags_bad
            or regex_bad.search(class_id)):
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
        return 0

    if node.tag in tags_junk:
        score += -1 # actuall -2 as tags_junk is included tags_bad

    if node.tag in tags_bad:
        score += -1

    if regex_bad.search(class_id):
        score += -1

    if node.tag in tags_good:
        score += 4

@@ -124,7 +133,7 @@ def score_all(node, grades=None):

        score = score_node(child)
        child.attrib['seen'] = 'yes, ' + str(int(score))

        if score > 0:
        if score > 0 or not len(grades):
            spread_score(child, score, grades)
            score_all(child, grades)
@@ -1,4 +1,5 @@

lxml
bs4
python-dateutil <= 1.5
chardet
pymysql
@@ -13,6 +13,7 @@

body {
    overflow-wrap: anywhere;
    word-wrap: anywhere;
    font-family: sans;
}

#url {