Compare commits
No commits in common. "9e7b9d95ee6178081b90c79c46f38ba026ffa5e7" and "e5a82ff1f4af88d76ba87a2340c0f937e5b37cb3" have entirely different histories.
9e7b9d95ee
...
e5a82ff1f4
|
@ -99,7 +99,7 @@ item_link = ./a/@href
|
||||||
item_desc = ./div[class=desc]
|
item_desc = ./div[class=desc]
|
||||||
item_content = ./div[class=content]
|
item_content = ./div[class=content]
|
||||||
|
|
||||||
base = file:reader.html.template
|
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
|
||||||
|
|
||||||
[twitter]
|
[twitter]
|
||||||
mode = html
|
mode = html
|
||||||
|
|
|
@ -47,11 +47,7 @@ def parse_rules(filename=None):
|
||||||
|
|
||||||
for section in rules.keys():
|
for section in rules.keys():
|
||||||
for arg in rules[section].keys():
|
for arg in rules[section].keys():
|
||||||
if rules[section][arg].startswith('file:'):
|
if '\n' in rules[section][arg]:
|
||||||
import_file = os.path.join(os.path.dirname(__file__), rules[section][arg][5:])
|
|
||||||
rules[section][arg] = open(import_file).read()
|
|
||||||
|
|
||||||
elif '\n' in rules[section][arg]:
|
|
||||||
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
||||||
|
|
||||||
return rules
|
return rules
|
||||||
|
@ -73,13 +69,19 @@ def parse(data, url=None, mimetype=None, encoding=None):
|
||||||
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||||
return parser(data, ruleset, encoding=encoding)
|
return parser(data, ruleset, encoding=encoding)
|
||||||
|
|
||||||
# 2) Try each and every parser
|
# 2) Look for a parser based on mimetype
|
||||||
|
|
||||||
|
if mimetype is not None:
|
||||||
|
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
|
||||||
|
|
||||||
|
if mimetype is None or len(parser_candidates) == 0:
|
||||||
|
parser_candidates = parsers
|
||||||
|
|
||||||
# 3) Look for working ruleset for given parser
|
# 3) Look for working ruleset for given parser
|
||||||
# 3a) See if parsing works
|
# 3a) See if parsing works
|
||||||
# 3b) See if .items matches anything
|
# 3b) See if .items matches anything
|
||||||
|
|
||||||
for parser in parsers:
|
for parser in parser_candidates:
|
||||||
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||||
# 'path' as they should have been caught beforehands
|
# 'path' as they should have been caught beforehands
|
||||||
|
|
||||||
|
@ -148,15 +150,15 @@ class ParserBase(object):
|
||||||
c = csv.writer(out, dialect=csv.excel)
|
c = csv.writer(out, dialect=csv.excel)
|
||||||
|
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
c.writerow([getattr(item, x) for x in item.dic])
|
row = [getattr(item, x) for x in item.dic]
|
||||||
|
|
||||||
out.seek(0)
|
|
||||||
out = out.read()
|
|
||||||
|
|
||||||
if encoding != 'unicode':
|
if encoding != 'unicode':
|
||||||
out = out.encode(encoding)
|
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
|
||||||
|
|
||||||
return out
|
c.writerow(row)
|
||||||
|
|
||||||
|
out.seek(0)
|
||||||
|
return out.read()
|
||||||
|
|
||||||
def tohtml(self, **k):
|
def tohtml(self, **k):
|
||||||
return self.convert(FeedHTML).tostring(**k)
|
return self.convert(FeedHTML).tostring(**k)
|
||||||
|
@ -267,14 +269,7 @@ class ParserBase(object):
|
||||||
|
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
# does not exist, have to create it
|
# does not exist, have to create it
|
||||||
try:
|
|
||||||
self.rule_create(self.rules[rule_name])
|
self.rule_create(self.rules[rule_name])
|
||||||
|
|
||||||
except AttributeError:
|
|
||||||
# no way to create it, give up
|
|
||||||
pass
|
|
||||||
|
|
||||||
else:
|
|
||||||
self.rule_set(self.rules[rule_name], value)
|
self.rule_set(self.rules[rule_name], value)
|
||||||
|
|
||||||
def rmv(self, rule_name):
|
def rmv(self, rule_name):
|
||||||
|
@ -474,9 +469,6 @@ class ParserHTML(ParserXML):
|
||||||
element = deepcopy(match)
|
element = deepcopy(match)
|
||||||
match.getparent().append(element)
|
match.getparent().append(element)
|
||||||
|
|
||||||
else:
|
|
||||||
raise AttributeError('no way to create item')
|
|
||||||
|
|
||||||
|
|
||||||
def parse_time(value):
|
def parse_time(value):
|
||||||
if value is None or value == 0:
|
if value is None or value == 0:
|
||||||
|
|
|
@ -471,10 +471,10 @@ def FeedFormat(rss, options, encoding='utf-8'):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if options.indent:
|
if options.indent:
|
||||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
|
return rss.torss(xml_declaration=True, encoding=encoding, pretty_print=True)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
|
return rss.torss(xml_declaration=True, encoding=encoding)
|
||||||
|
|
||||||
|
|
||||||
def process(url, cache=None, options=None):
|
def process(url, cache=None, options=None):
|
||||||
|
@ -554,8 +554,6 @@ def cgi_app(environ, start_response):
|
||||||
else:
|
else:
|
||||||
headers['content-type'] = 'text/xml'
|
headers['content-type'] = 'text/xml'
|
||||||
|
|
||||||
headers['content-type'] += '; charset=utf-8'
|
|
||||||
|
|
||||||
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
|
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
|
||||||
|
|
||||||
# get the work done
|
# get the work done
|
||||||
|
@ -638,7 +636,7 @@ def cgi_file_handler(environ, start_response, app):
|
||||||
return app(environ, start_response)
|
return app(environ, start_response)
|
||||||
|
|
||||||
|
|
||||||
def cgi_get(environ, start_response):
|
def cgi_page(environ, start_response):
|
||||||
url, options = cgi_parse_environ(environ)
|
url, options = cgi_parse_environ(environ)
|
||||||
|
|
||||||
# get page
|
# get page
|
||||||
|
@ -650,7 +648,6 @@ def cgi_get(environ, start_response):
|
||||||
data, con, contenttype, encoding = crawler.adv_get(url=url)
|
data, con, contenttype, encoding = crawler.adv_get(url=url)
|
||||||
|
|
||||||
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||||
if options.get == 'page':
|
|
||||||
html = readabilite.parse(data, encoding=encoding)
|
html = readabilite.parse(data, encoding=encoding)
|
||||||
html.make_links_absolute(con.geturl())
|
html.make_links_absolute(con.geturl())
|
||||||
|
|
||||||
|
@ -662,23 +659,17 @@ def cgi_get(environ, start_response):
|
||||||
|
|
||||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
|
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
|
||||||
|
|
||||||
elif options.get == 'article':
|
|
||||||
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise MorssException('no :get option passed')
|
output = None
|
||||||
|
|
||||||
else:
|
|
||||||
output = data
|
|
||||||
|
|
||||||
# return html page
|
# return html page
|
||||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
|
headers = {'status': '200 OK', 'content-type': 'text/html'}
|
||||||
start_response(headers['status'], list(headers.items()))
|
start_response(headers['status'], list(headers.items()))
|
||||||
return [output]
|
return [output]
|
||||||
|
|
||||||
|
|
||||||
dispatch_table = {
|
dispatch_table = {
|
||||||
'get': cgi_get,
|
'getpage': cgi_page
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -726,10 +717,10 @@ def cli_app():
|
||||||
url = UrlFix(url)
|
url = UrlFix(url)
|
||||||
rss = FeedFetch(url, options)
|
rss = FeedFetch(url, options)
|
||||||
rss = FeedGather(rss, url, options)
|
rss = FeedGather(rss, url, options)
|
||||||
out = FeedFormat(rss, options, 'unicode')
|
out = FeedFormat(rss, options)
|
||||||
|
|
||||||
if not options.silent:
|
if not options.silent:
|
||||||
print(out)
|
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
|
||||||
|
|
||||||
log('done')
|
log('done')
|
||||||
|
|
||||||
|
|
|
@ -307,7 +307,7 @@ def get_best_node(ranked_grades):
|
||||||
return lowest
|
return lowest
|
||||||
|
|
||||||
|
|
||||||
def get_article(data, url=None, encoding=None, debug=False):
|
def get_article(data, url=None, encoding=None):
|
||||||
" Input a raw html string, returns a raw html string of the article "
|
" Input a raw html string, returns a raw html string of the article "
|
||||||
|
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
|
@ -319,17 +319,16 @@ def get_article(data, url=None, encoding=None, debug=False):
|
||||||
|
|
||||||
best = get_best_node(scores)
|
best = get_best_node(scores)
|
||||||
|
|
||||||
if not debug:
|
|
||||||
keep_threshold = percentile([x[1] for x in scores], 0.1)
|
keep_threshold = percentile([x[1] for x in scores], 0.1)
|
||||||
clean_root(best, keep_threshold)
|
clean_root(best, keep_threshold)
|
||||||
|
|
||||||
wc = count_words(best.text_content())
|
wc = count_words(best.text_content())
|
||||||
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
||||||
|
|
||||||
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
|
if wc - wca < 50 or float(wca) / wc > 0.3:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
best.make_links_absolute(url)
|
best.make_links_absolute(url)
|
||||||
|
|
||||||
return lxml.etree.tostring(best if not debug else html, pretty_print=True)
|
return lxml.etree.tostring(best, pretty_print=True)
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
|
@require(feed)
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Feed reader by morss</title>
|
<title>@feed.title – via morss</title>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="description" content="@feed.desc (via morss)" />
|
||||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||||
<meta name="robots" content="noindex" />
|
|
||||||
|
|
||||||
<style type="text/css">
|
<style type="text/css">
|
||||||
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
|
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
|
||||||
|
@ -30,7 +32,7 @@
|
||||||
padding-right: 20px; /* column-space */
|
padding-right: 20px; /* column-space */
|
||||||
}
|
}
|
||||||
|
|
||||||
@media handheld, only screen and (max-width: 767px) {
|
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
|
||||||
#content {
|
#content {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
min-width: 0;
|
min-width: 0;
|
||||||
|
@ -80,7 +82,6 @@
|
||||||
|
|
||||||
#content {
|
#content {
|
||||||
text-align: justify;
|
text-align: justify;
|
||||||
line-height: 1.5em;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.item .title {
|
.item .title {
|
||||||
|
@ -170,18 +171,31 @@
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
<div id="header">
|
<div id="header">
|
||||||
<h1>RSS feed</h1>
|
<h1>@feed.title</h1>
|
||||||
<h2>with full text articles</h2>
|
@if feed.desc:
|
||||||
|
<h2>@feed.desc</h2>
|
||||||
|
@end
|
||||||
<p>- via morss</p>
|
<p>- via morss</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="content">
|
<div id="content">
|
||||||
|
@for item in feed.items:
|
||||||
<div class="item">
|
<div class="item">
|
||||||
<a class="title link" href="@item.link" target="_blank"></a>
|
@if item.link:
|
||||||
<div class="desc"></div>
|
<a class="title link" href="@item.link" target="_blank">@item.title</a>
|
||||||
<div class="content"></div>
|
@else:
|
||||||
|
<span class="title">@item.title</span>
|
||||||
|
@end
|
||||||
|
<div class="article">
|
||||||
|
@if item.content:
|
||||||
|
@item.content
|
||||||
|
@else:
|
||||||
|
@item.desc
|
||||||
|
@end
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@end
|
||||||
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
var items = document.getElementsByClassName('item')
|
var items = document.getElementsByClassName('item')
|
||||||
|
|
Loading…
Reference in New Issue