From 4d6d3c92397d4c22e9708eb4460daa31c7614733 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 23 Jan 2022 11:44:07 +0100 Subject: [PATCH] wsgi: limit supported mimetypes & return actual mimetype --- morss/crawler.py | 4 +++- morss/wsgi.py | 34 +++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index 01e9ba5..0eb6499 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -59,7 +59,9 @@ except NameError: MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'], 'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], - 'html': ['text/html', 'application/xhtml+xml', 'application/xml']} + 'html': ['text/html', 'application/xhtml+xml', 'application/xml'], + 'json': ['application/json'], + } DEFAULT_UAS = [ diff --git a/morss/wsgi.py b/morss/wsgi.py index 43f08dc..ef829e8 100644 --- a/morss/wsgi.py +++ b/morss/wsgi.py @@ -192,32 +192,36 @@ def cgi_get(environ, start_response): url, options = cgi_parse_environ(environ) # get page - req = crawler.adv_get(url=url, timeout=TIMEOUT) + if options['get'] in ('page', 'article'): + req = crawler.adv_get(url=url, timeout=TIMEOUT) - if req['contenttype'] in crawler.MIMETYPE['html']: - if options['get'] == 'page': - html = readabilite.parse(req['data'], encoding=req['encoding']) - html.make_links_absolute(req['url']) + if req['contenttype'] in crawler.MIMETYPE['html']: + if options['get'] == 'page': + html = readabilite.parse(req['data'], encoding=req['encoding']) + html.make_links_absolute(req['url']) - kill_tags = ['script', 'iframe', 'noscript'] + kill_tags = ['script', 'iframe', 'noscript'] - for tag in kill_tags: - for elem in html.xpath('//'+tag): - elem.getparent().remove(elem) + for tag in kill_tags: + for elem in html.xpath('//'+tag): + elem.getparent().remove(elem) - output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') + output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') - elif options['get'] == 'article': - output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) + else: # i.e. options['get'] == 'article' + output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) + + elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']: + output = req['data'] else: - raise MorssException('no :get option passed') + raise MorssException('unsupported mimetype') else: - output = req['data'] + raise MorssException('no :get option passed') # return html page - headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse + headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse start_response(headers['status'], list(headers.items())) return [output]