Compare commits


2 Commits

Author   SHA1        Message                                Date
pictuga  27a42c47aa  morss: use final request url           2020-04-28 22:30:21 +02:00
                     (Code is not very elegant...)
pictuga  c27c38f7c7  crawler: return dict instead of tuple  2020-04-28 22:29:07 +02:00
5 changed files with 32 additions and 26 deletions
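
Taken together, the two commits change crawler.adv_get() to return a dict (keys 'data', 'url', 'con', 'contenttype' and 'encoding') instead of a positional tuple, and FeedFetch() to return the final request URL alongside the feed, so callers keep working with the URL that was actually fetched. A minimal before/after sketch of a caller, reusing the test address that appears elsewhere in this diff:

    from morss import crawler

    # before: positional unpacking; the final URL was only reachable through the connection object
    data, con, contenttype, encoding = crawler.adv_get('https://morss.it')
    final_url = con.geturl()

    # after: a single dict, with the final URL exposed directly
    req = crawler.adv_get('https://morss.it')
    data, final_url, encoding = req['data'], req['url'], req['encoding']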

View File

@@ -251,7 +251,7 @@ options = morss.Options(csv=True) # arguments
 morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location

 url = morss.UrlFix(url) # make sure the url is properly formatted
-rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
+url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
 rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
 output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
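
Stitched together with the unchanged README lines around it, the library example now reads roughly as follows; a sketch assuming only what the hunk above shows, with a placeholder feed URL:

    import morss

    url = 'http://example.com/feed.xml'  # placeholder input
    options = morss.Options(csv=True)  # arguments
    morss.crawler.sqlite_default = '/tmp/morss-cache.db'  # sqlite cache location

    url = morss.UrlFix(url)  # make sure the url is properly formatted
    url, rss = morss.FeedFetch(url, options)  # grabs the RSS feed; url becomes the final request url
    rss = morss.FeedGather(rss, url, options)  # fills the feed and cleans it up
    output = morss.FeedFormat(rss, options, 'unicode')  # formats final feed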

View File

@@ -55,7 +55,7 @@ PROTOCOL = ['http', 'https']

 def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)[0]
+    return adv_get(*args, **kwargs)['data']


 def adv_get(url, timeout=None, *args, **kwargs):
@@ -72,7 +72,13 @@ def adv_get(url, timeout=None, *args, **kwargs):
     contenttype = con.info().get('Content-Type', '').split(';')[0]
     encoding= detect_encoding(data, con)

-    return data, con, contenttype, encoding
+    return {
+        'data':data,
+        'url': con.geturl(),
+        'con': con,
+        'contenttype': contenttype,
+        'encoding': encoding
+    }


 def custom_handler(follow=None, delay=None, encoding=None):
@@ -621,7 +627,7 @@ class MySQLCacheHandler(BaseCache):

 if __name__ == '__main__':
-    data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')

     if not sys.flags.interactive:
-        print(data.decode(encoding))
+        print(req['data'].decode(req['encoding']))
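
For completeness, how the thin get() wrapper relates to adv_get() after this change; a small sketch based only on the hunks above (decoding with the detected encoding is an assumption, hence errors='replace'):

    from morss import crawler

    body = crawler.get('https://morss.it')  # wrapper: body bytes only, i.e. adv_get(...)['data']

    req = crawler.adv_get('https://morss.it')
    text = req['data'].decode(req['encoding'], errors='replace')
    print(req['url'], req['contenttype'], len(text))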

View File

@@ -759,8 +759,8 @@ class ItemJSON(Item, ParserJSON):

 if __name__ == '__main__':
     from . import crawler
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
-    feed = parse(data, url=con.geturl(), encoding=encoding)
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
+    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])

     if not sys.flags.interactive:
         for item in feed.items:

View File

@@ -248,17 +248,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2

     try:
-        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)

     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode

-    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
+    if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
         log('non-text page')
         return True

-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')

     if out is not None:
         item.content = out
@@ -303,14 +303,14 @@ def FeedFetch(url, options):
         delay = 0

     try:
-        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)

     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')

     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, encoding=encoding)
+        rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])

         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -330,16 +330,16 @@ def FeedFetch(url, options):
     else:
         try:
-            rss = feeds.parse(xml, url, encoding=encoding)
+            rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
             rss = rss.convert(feeds.FeedXML)
             # contains all fields, otherwise much-needed data can be lost

         except TypeError:
             log('random page')
-            log(contenttype)
+            log(req['contenttype'])
             raise MorssException('Link provided is not a valid feed')

-    return rss
+    return req['url'], rss


 def FeedGather(rss, url, options):
@@ -438,7 +438,7 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)

-    rss = FeedFetch(url, options)
+    url, rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)

     return FeedFormat(rss, options, 'unicode')
@@ -510,7 +510,7 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

     # get the work done
-    rss = FeedFetch(url, options)
+    url, rss = FeedFetch(url, options)

     if headers['content-type'] == 'text/xml':
         headers['content-type'] = rss.mimetype[0]
@@ -594,12 +594,12 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)

     # get page
-    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
+    req = crawler.adv_get(url=url, timeout=TIMEOUT)

-    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
-            html = readabilite.parse(data, encoding=encoding)
-            html.make_links_absolute(con.geturl())
+            html = readabilite.parse(req['data'], encoding=req['encoding'])
+            html.make_links_absolute(req['url'])

             kill_tags = ['script', 'iframe', 'noscript']
@@ -610,13 +610,13 @@ def cgi_get(environ, start_response):
             output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')

         elif options.get == 'article':
-            output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)

         else:
             raise MorssException('no :get option passed')

     else:
-        output = data
+        output = req['data']

     # return html page
     headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
@@ -673,7 +673,7 @@ def cli_app():
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

-    rss = FeedFetch(url, options)
+    url, rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')

View File

@@ -348,8 +348,8 @@ if __name__ == '__main__':
     import sys
     from . import crawler

-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')

     if not sys.flags.interactive:
         print(article)