Compare commits

...

33 Commits

Author SHA1 Message Date
a82ec96eb7 Delete feedify.py leftover code
iTunes integration untested, unreliable and not working...
2020-04-05 22:16:52 +02:00
aad2398e69 feeds: turns out lxml.etree doesn't have drop_tag 2020-04-05 21:50:38 +02:00
eeac630855 crawler: add more "realistic" headers 2020-04-05 21:11:57 +02:00
e136b0feb2 readabilite: loosen the slayer
Previous impl. lead to too many empty results
2020-04-05 20:47:30 +02:00
6cf32af6c0 readabilite: also use BS 2020-04-05 20:46:42 +02:00
568e7d7dd2 feeds: make BS's output bytes for lxml's sake 2020-04-05 20:46:04 +02:00
3617f86e9d morss: make cgi_encore more robust 2020-04-05 16:43:11 +02:00
d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
bf86c1e962 crawler: make AutoUA match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic. Sends the same headers no matter what. Make requests more "replicable".
Also, drop "text/xml" from RSS contenttype, too broad, matches garbage
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to find how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping off ftp support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return sth
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer who-is-what
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to file handled by http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
8 changed files with 231 additions and 126 deletions

View File

@@ -48,6 +48,7 @@ You do need:
 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [chardet](https://pypi.python.org/pypi/chardet)
 - [six](https://pypi.python.org/pypi/six), a dependency of chardet
@@ -76,7 +77,6 @@ The arguments are:
 - `json`: output as JSON
 - `proxy`: doesn't fill the articles
 - `clip`: stick the full article content under the original feed content (useful for twitter)
-- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doens't work), but with this argument, the description is kept
 - `search=STRING`: does a basic case-sensitive search in the feed
 - Advanced
 - `csv`: export to csv
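A minimal sketch of driving these options from Python rather than from the URL, assuming process() keeps the process(url, cache=None, options=None) signature visible further down in this diff and accepts the README options as plain dict keys (the feed URL is a placeholder, and the internal wrapping of the dict in Options is an assumption):

# Sketch only -- assumes the morss package is importable and that process()
# turns the given dict into an Options object internally.
from morss.morss import process

# Fetch the feed, fill in the full articles, and serialise the result as JSON.
output = process('http://example.com/feed.xml',
                 options={'json': True, 'clip': True})
print(output)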

View File

@@ -27,13 +27,14 @@ except NameError:
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
+    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
     'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
     # as per urllib2 source code, these Handelers are added first
@@ -51,14 +52,12 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     handlers.append(HTTPEquivHandler())
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(DEFAULT_UA))
+    handlers.append(BrowserlyHeaderHandler())
-    if not basic:
-        handlers.append(AutoRefererHandler())
     handlers.append(EncodingFixHandler(encoding))
-    if accept:
-        handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+    if follow:
+        handlers.append(AlternateHandler(MIMETYPE[follow]))
     handlers.append(CacheHandler(force_min=delay))
@@ -196,45 +195,34 @@ class UAHandler(BaseHandler):
     https_request = http_request
-class AutoRefererHandler(BaseHandler):
+class BrowserlyHeaderHandler(BaseHandler):
+    """ Add more headers to look less suspicious """
     def http_request(self, req):
-        req.add_unredirected_header('Referer', 'http://%s' % req.host)
+        req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host))
+        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
         return req
     https_request = http_request
-class ContentNegociationHandler(BaseHandler):
-    " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
+class AlternateHandler(BaseHandler):
+    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
-    def __init__(self, accept=None, strict=False):
-        self.accept = accept
-        self.strict = strict
+    def __init__(self, follow=None):
+        self.follow = follow or []
-    def http_request(self, req):
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-            string = ','.join(self.accept)
-            if self.strict:
-                string += ',*/*;q=0.9'
-            req.add_unredirected_header('Accept', string)
-        return req
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
             data = resp.read()
             links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
             for link in links:
-                if link.get('type', '') in self.accept:
+                if link.get('type', '') in self.follow:
                     resp.code = 302
                     resp.msg = 'Moved Temporarily'
                     resp.headers['location'] = link.get('href')
@@ -246,7 +234,6 @@ class ContentNegociationHandler(BaseHandler):
         return resp
-    https_request = http_request
     https_response = http_response
@@ -384,7 +371,7 @@ class CacheHandler(BaseHandler):
         elif self.force_min is None and ('no-cache' in cc_list
                                             or 'no-store' in cc_list
-                                            or ('private' in cc_list and not self.private)):
+                                            or ('private' in cc_list and not self.private_cache)):
             # kindly follow web servers indications, refresh
             return None
@@ -419,7 +406,7 @@ class CacheHandler(BaseHandler):
         cc_list = [x for x in cache_control if '=' not in x]
-        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
+        if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
            # kindly follow web servers indications
            return resp
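A rough usage sketch for the reworked handler chain, based on the custom_handler(follow=None, delay=None, encoding=None) signature above (the URL is a placeholder; delay is simply forwarded to CacheHandler as force_min):

# Sketch only: relies on custom_handler() and AlternateHandler as shown above.
from morss import crawler

# Ask for a feed. If the server answers with an HTML page instead,
# AlternateHandler looks for <link rel="alternate" type="application/rss+xml">
# and rewrites the response into a 302 redirect towards it.
con = crawler.custom_handler(follow='rss', delay=60) \
             .open('http://example.com/', timeout=10)

print(con.geturl())   # final URL, after any alternate-link redirect
data = con.read()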

View File

@@ -1,28 +0,0 @@
-import re
-import json
-from . import crawler
-try:
-    basestring
-except NameError:
-    basestring = str
-def pre_worker(url):
-    if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
-        match = re.search('/id([0-9]+)(\?.*)?$', url)
-        if match:
-            iid = match.groups()[0]
-            redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
-            try:
-                con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
-                data = con.read()
-            except (IOError, HTTPException):
-                raise
-            return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
-    return None

View File

@@ -15,6 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 import lxml.html
+from bs4 import BeautifulSoup
 json.encoder.c_make_encoder = None
@@ -401,13 +402,14 @@ class ParserXML(ParserBase):
         else:
             if html_rich:
-                # atom stuff
-                if 'atom' in rule:
-                    match.attrib['type'] = 'xhtml'
                 self._clean_node(match)
                 match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
-                match.find('div').drop_tag()
+                if self.rules['mode'] == 'html':
+                    match.find('div').drop_tag() # not supported by lxml.etree
+                else: # i.e. if atom
+                    match.attrib['type'] = 'xhtml'
             else:
                 if match is not None and len(match):
@@ -441,7 +443,7 @@ class ParserHTML(ParserXML):
     def parse(self, raw):
         parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(raw, parser)
+        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)
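The BeautifulSoup round-trip is the key trick here: bs4 re-serialises even badly broken markup into something lxml parses without choking. A standalone illustration of the same idea (not morss code; the input snippet is made up):

# Standalone illustration of the BeautifulSoup -> lxml round-trip used above.
from bs4 import BeautifulSoup
import lxml.etree as etree

broken = '<html><body><p>unclosed <b>tag<p>another paragraph</body>'

# prettify('utf-8') returns bytes, which is what lxml's parser expects here.
cleaned = BeautifulSoup(broken, 'lxml').prettify('utf-8')

parser = etree.HTMLParser(remove_blank_text=True)
root = etree.fromstring(cleaned, parser)
print(etree.tostring(root, pretty_print=True).decode())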

View File

@@ -10,27 +10,28 @@ import re
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 from . import feeds
-from . import feedify
 from . import crawler
 from . import readabilite
 import wsgiref.simple_server
 import wsgiref.handlers
+import cgitb
 try:
     # python 2
     from Queue import Queue
     from httplib import HTTPException
-    from urllib import quote_plus
+    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs
 except ImportError:
     # python 3
     from queue import Queue
     from http.client import HTTPException
-    from urllib.parse import quote_plus
+    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs
 LIM_ITEM = 100 # deletes what's beyond
@@ -44,7 +45,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080
-PROTOCOL = ['http', 'https', 'ftp']
+PROTOCOL = ['http', 'https']
 def filterOptions(options):
@@ -52,7 +53,7 @@ def filterOptions(options):
     # example of filtering code below
-    #allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
+    #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
     #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
     #return filtered
@@ -66,6 +67,7 @@ def log(txt, force=False):
     if DEBUG or force:
         if 'REQUEST_URI' in os.environ:
             open('morss.log', 'a').write("%s\n" % repr(txt))
         else:
             print(repr(txt))
@@ -73,6 +75,7 @@ def log(txt, force=False):
 def len_html(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content())
     else:
         return 0
@@ -80,6 +83,7 @@ def len_html(txt):
 def count_words(txt):
     if len(txt):
         return len(lxml.html.fromstring(txt).text_content().split())
     return 0
@@ -88,12 +92,14 @@ class Options:
         if len(args):
             self.options = args
             self.options.update(options or {})
         else:
             self.options = options or {}
     def __getattr__(self, key):
         if key in self.options:
             return self.options[key]
         else:
             return False
@@ -107,17 +113,23 @@ class Options:
 def parseOptions(options):
     """ Turns ['md=True'] into {'md':True} """
     out = {}
     for option in options:
         split = option.split('=', 1)
         if len(split) > 1:
             if split[0].lower() == 'true':
                 out[split[0]] = True
             elif split[0].lower() == 'false':
                 out[split[0]] = False
             else:
                 out[split[0]] = split[1]
         else:
             out[split[0]] = True
     return out
@@ -158,6 +170,11 @@ def ItemFix(item, feedurl='/'):
         item.link = parse_qs(urlparse(item.link).query)['url'][0]
         log(item.link)
+    # pocket
+    if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
+        item.link = parse_qs(urlparse(item.link).query)['url'][0]
+        log(item.link)
     # facebook
     if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
         item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -208,6 +225,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match):
             link = match[0]
             log(link)
         else:
             link = None
@@ -217,6 +235,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
             link = match[0]
             log(link)
         else:
             link = None
@@ -232,7 +251,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
     try:
-        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()
     except (IOError, HTTPException) as e:
@@ -268,9 +287,6 @@ def ItemAfter(item, options):
             item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
             del item.desc
-    if not options.keep and not options.proxy:
-        del item.desc
     if options.nolink and item.content:
         content = lxml.html.fromstring(item.content)
         for link in content.xpath('//a'):
@@ -284,27 +300,23 @@ def ItemAfter(item, options):
     return item
-def FeedFetch(url, options):
+def UrlFix(url):
+    # basic url clean-up
     if url is None:
         raise MorssException('No url provided')
+    if isinstance(url, bytes):
+        url = url.decode()
     if urlparse(url).scheme not in PROTOCOL:
         url = 'http://' + url
         log(url)
     url = url.replace(' ', '%20')
-    if isinstance(url, bytes):
-        url = url.decode()
-    # allow for code execution for feedify
-    pre = feedify.pre_worker(url)
-    if pre:
-        url = pre
-        log('url redirect')
-        log(url)
+    return url
+def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -312,8 +324,7 @@ def FeedFetch(url, options):
         delay = 0
     try:
-        con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
-            encoding=options.encoding, basic=not options.items) \
+        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
                 .open(url, timeout=TIMEOUT * 2)
         xml = con.read()
@@ -324,20 +335,24 @@ def FeedFetch(url, options):
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, url, contenttype)
-        feed.rule
+        rss = feeds.FeedHTML(xml)
+        rss.rules['title'] = options.title if options.title else '//head/title'
+        rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
         rss.rules['items'] = options.items
-        if options.item_title:
-            rss.rules['item_title'] = options.item_title
-        if options.item_link:
-            rss.rules['item_link'] = options.item_link
+        rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
+        rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
         if options.item_content:
             rss.rules['item_content'] = options.item_content
         if options.item_time:
             rss.rules['item_time'] = options.item_time
+        rss = rss.convert(feeds.FeedXML)
     else:
         try:
             rss = feeds.parse(xml, url, contenttype)
@@ -375,6 +390,7 @@ def FeedGather(rss, url, options):
             value = queue.get()
             try:
                 worker(*value)
             except Exception as e:
                 log('Thread Error: %s' % e.message)
             queue.task_done()
@@ -414,6 +430,7 @@ def FeedGather(rss, url, options):
     for i, item in enumerate(list(rss.items)):
         if threads == 1:
             worker(*[i, item])
         else:
             queue.put([i, item])
@@ -433,37 +450,38 @@ def FeedGather(rss, url, options):
     return rss
-def FeedFormat(rss, options):
+def FeedFormat(rss, options, encoding='utf-8'):
     if options.callback:
         if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
-            return '%s(%s)' % (options.callback, rss.tojson())
+            out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
+            return out if encoding == 'unicode' else out.encode(encoding)
         else:
             raise MorssException('Invalid callback var name')
     elif options.json:
         if options.indent:
-            return rss.tojson(encoding='UTF-8', indent=4)
+            return rss.tojson(encoding=encoding, indent=4)
         else:
-            return rss.tojson(encoding='UTF-8')
     elif options.csv:
-        return rss.tocsv(encoding='UTF-8')
+        return rss.tocsv(encoding=encoding)
     elif options.reader:
         if options.indent:
-            return rss.tohtml(encoding='UTF-8', pretty_print=True)
+            return rss.tohtml(encoding=encoding, pretty_print=True)
         else:
-            return rss.tohtml(encoding='UTF-8')
+            return rss.tohtml(encoding=encoding)
     else:
         if options.indent:
-            return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
+            return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)
         else:
-            return rss.torss(xml_declaration=True, encoding='UTF-8')
+            return rss.torss(xml_declaration=True, encoding=encoding)
 def process(url, cache=None, options=None):
@@ -475,14 +493,16 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     return FeedFormat(rss, options)
-def cgi_app(environ, start_response):
+def cgi_parse_environ(environ):
     # get options
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
@@ -496,7 +516,7 @@ def cgi_app(environ, start_response):
     if url.startswith(':'):
         split = url.split('/', 1)
-        options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
+        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
         if len(split) > 1:
             url = split[1]
@@ -504,15 +524,22 @@ def cgi_app(environ, start_response):
             url = ''
     else:
-        options = []
+        raw_options = []
     # init
-    options = Options(filterOptions(parseOptions(options)))
-    headers = {}
+    options = Options(filterOptions(parseOptions(raw_options)))
     global DEBUG
     DEBUG = options.debug
+    return (url, options)
+def cgi_app(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+    headers = {}
     # headers
     headers['status'] = '200 OK'
     headers['cache-control'] = 'max-age=%s' % DELAY
@@ -537,6 +564,7 @@ def cgi_app(environ, start_response):
         crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
     # get the work done
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     if headers['content-type'] == 'text/xml':
@@ -547,18 +575,42 @@ def cgi_app(environ, start_response):
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)
-    if not options.silent:
-        return out
+    if options.silent:
+        return ['']
+    else:
+        return [out]
-def cgi_wrapper(environ, start_response):
-    # simple http server for html and css
+def middleware(func):
+    " Decorator to turn a function into a wsgi middleware "
+    # This is called when parsing the code
+    def app_builder(app):
+        # This is called when doing app = cgi_wrapper(app)
+        def app_wrap(environ, start_response):
+            # This is called when a http request is being processed
+            return func(environ, start_response, app)
+        return app_wrap
+    return app_builder
+@middleware
+def cgi_file_handler(environ, start_response, app):
+    " Simple HTTP server to serve static files (.html, .css, etc.) "
     files = {
         '': 'text/html',
-        'index.html': 'text/html'}
+        'index.html': 'text/html',
+        'sheet.xsl': 'text/xsl'}
     if 'REQUEST_URI' in environ:
         url = environ['REQUEST_URI'][1:]
     else:
         url = environ['PATH_INFO'][1:]
@@ -587,16 +639,80 @@ def cgi_wrapper(environ, start_response):
         start_response(headers['status'], list(headers.items()))
         return ['Error %s' % headers['status']]
-    # actual morss use
+    else:
+        return app(environ, start_response)
+def cgi_page(environ, start_response):
+    url, options = cgi_parse_environ(environ)
+    # get page
+    PROTOCOL = ['http', 'https']
+    if urlparse(url).scheme not in ['http', 'https']:
+        url = 'http://' + url
+    con = crawler.custom_handler().open(url)
+    data = con.read()
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html.make_links_absolute(con.geturl())
+        kill_tags = ['script', 'iframe', 'noscript']
+        for tag in kill_tags:
+            for elem in html.xpath('//'+tag):
+                elem.getparent().remove(elem)
+        output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
+    else:
+        output = None
+    # return html page
+    headers = {'status': '200 OK', 'content-type': 'text/html'}
+    start_response(headers['status'], list(headers.items()))
+    return [output]
+dispatch_table = {
+    'getpage': cgi_page
+    }
+@middleware
+def cgi_dispatcher(environ, start_response, app):
+    url, options = cgi_parse_environ(environ)
+    for key in dispatch_table.keys():
+        if key in options:
+            return dispatch_table[key](environ, start_response)
+    return app(environ, start_response)
+@middleware
+def cgi_error_handler(environ, start_response, app):
     try:
-        return [cgi_app(environ, start_response) or '(empty)']
+        return app(environ, start_response)
     except (KeyboardInterrupt, SystemExit):
         raise
     except Exception as e:
-        headers = {'status': '500 Oops', 'content-type': 'text/plain'}
+        headers = {'status': '500 Oops', 'content-type': 'text/html'}
         start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR <%s>: %s' % (url, e.message), force=True)
-        return ['An error happened:\n%s' % e.message]
+        log('ERROR: %s' % repr(e), force=True)
+        return [cgitb.html(sys.exc_info())]
+@middleware
+def cgi_encode(environ, start_response, app):
+    out = app(environ, start_response)
+    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
 def cli_app():
@@ -608,6 +724,7 @@ def cli_app():
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
+    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options)
@@ -622,6 +739,7 @@ def isInt(string):
     try:
         int(string)
         return True
     except ValueError:
         return False
@@ -629,7 +747,13 @@ def isInt(string):
 def main():
     if 'REQUEST_URI' in os.environ:
         # mod_cgi
-        wsgiref.handlers.CGIHandler().run(cgi_wrapper)
+        app = cgi_app
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+        wsgiref.handlers.CGIHandler().run(app)
     elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
         # start internal (basic) http server
@@ -638,22 +762,31 @@ def main():
             argPort = int(sys.argv[1])
             if argPort > 0:
                 port = argPort
             else:
                 raise MorssException('Port must be positive integer')
         else:
             port = PORT
-        print('Serving http://localhost:%s/'%port)
-        httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
+        app = cgi_app
+        app = cgi_file_handler(app)
+        app = cgi_dispatcher(app)
+        app = cgi_error_handler(app)
+        app = cgi_encode(app)
+        print('Serving http://localhost:%s/' % port)
+        httpd = wsgiref.simple_server.make_server('', port, app)
         httpd.serve_forever()
     else:
         # as a CLI app
        try:
            cli_app()
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            print('ERROR: %s' % e.message)
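The middleware decorator added above is just a closure factory: decorating func(environ, start_response, app) yields a builder that takes the next app and returns a plain WSGI callable, which is why main() can stack cgi_dispatcher, cgi_error_handler and cgi_encode with simple reassignments. A minimal self-contained sketch of the same pattern (hello_app and lowercase_middleware are made-up names for illustration, not morss code):

# Minimal illustration of the @middleware pattern introduced in this diff.
def middleware(func):
    def app_builder(app):
        def app_wrap(environ, start_response):
            return func(environ, start_response, app)
        return app_wrap
    return app_builder

@middleware
def lowercase_middleware(environ, start_response, app):
    # post-process the wrapped app's output
    return [chunk.lower() for chunk in app(environ, start_response)]

def hello_app(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [b'Hello WSGI']

# Same composition style as morss's main(): wrap, then serve.
app = lowercase_middleware(hello_app)

if __name__ == '__main__':
    import wsgiref.simple_server
    wsgiref.simple_server.make_server('', 8000, app).serve_forever()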

View File

@@ -1,5 +1,6 @@
 import lxml.etree
 import lxml.html
+from bs4 import BeautifulSoup
 import re
@@ -9,7 +10,7 @@ def parse(data, encoding=None):
     else:
         parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
-    return lxml.html.fromstring(data, parser=parser)
+    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
 def count_words(string):
@@ -62,7 +63,7 @@ regex_good = re.compile('|'.join(class_good), re.I)
 tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
     'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
-    'button', 'footer']
+    'button', 'footer', 'link', 'meta']
 tags_bad = tags_junk + ['a', 'aside']
@@ -93,10 +94,18 @@ def score_node(node):
     class_id = node.get('class', '') + node.get('id', '')
     if (isinstance(node, lxml.html.HtmlComment)
-            or node.tag in tags_bad
-            or regex_bad.search(class_id)):
+            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
         return 0
+    if node.tag in tags_junk:
+        score += -1 # actuall -2 as tags_junk is included tags_bad
+    if node.tag in tags_bad:
+        score += -1
+    if regex_bad.search(class_id):
+        score += -1
     if node.tag in tags_good:
         score += 4
@@ -124,7 +133,7 @@ def score_all(node, grades=None):
         score = score_node(child)
         child.attrib['seen'] = 'yes, ' + str(int(score))
-        if score > 0:
+        if score > 0 or not len(grades):
             spread_score(child, score, grades)
             score_all(child, grades)
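The new HtmlProcessingInstruction check matters because lxml's HTML parser keeps leftovers such as <?php ... ?> blocks in the tree as element-like nodes, so without the guard they would reach the scoring code. A quick standalone check of that behaviour (not morss code; the snippet is made up):

# Standalone check: processing instructions survive lxml.html parsing as
# element-like nodes, which is why score_node() now filters them out.
import lxml.html

doc = lxml.html.fromstring('<div><p>text</p><?php echo "leftover"; ?></div>')

for node in doc.iter():
    print(type(node).__name__, getattr(node, 'tag', None))
    if isinstance(node, lxml.html.HtmlProcessingInstruction):
        print('-> would be skipped by the new check')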

View File

@@ -1,4 +1,5 @@
 lxml
+bs4
 python-dateutil <= 1.5
 chardet
 pymysql

View File

@@ -13,6 +13,7 @@
 body {
     overflow-wrap: anywhere;
     word-wrap: anywhere;
+    font-family: sans;
 }
 #url {