Compare commits

..

5 Commits

Author SHA1 Message Date
pictuga 41a63900c2 README: improve docker instructions 2020-04-19 13:01:08 +02:00
pictuga ec8edb02f1 Various small bug fixes 2020-04-19 12:54:02 +02:00
pictuga d01b943597 Remove leftover threading var 2020-04-19 12:51:11 +02:00
pictuga b361aa2867 Add timeout to :get 2020-04-19 12:50:26 +02:00
pictuga 4ce3c7cb32 Small code clean ups 2020-04-19 12:50:05 +02:00
6 changed files with 15 additions and 16 deletions

View File

@ -108,7 +108,6 @@ morss will auto-detect what "mode" to use.
For this, you'll want to change the architecture of the files a bit, for example For this, you'll want to change the architecture of the files a bit, for example
into something like this. into something like this.
``` ```
/ /
├── cgi ├── cgi
@ -151,20 +150,19 @@ gunicorn morss:cgi_standalone_app
#### Using docker #### Using docker
Build Build & run
```shell ```shell
docker build https://git.pictuga.com/pictuga/morss.git docker build https://git.pictuga.com/pictuga/morss.git -t morss
docker run -p 8080:8080 morss
``` ```
Run & Build in one go In one line
```shell ```shell
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git) docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
``` ```
It will run on port 8080 by default
#### Using morss' internal HTTP server #### Using morss' internal HTTP server
Morss can run its own HTTP server. The latter should start when you run morss Morss can run its own HTTP server. The latter should start when you run morss
@ -256,9 +254,10 @@ output = morss.Format(rss, options) # formats final feed
## Cache information ## Cache information
morss uses caching to make loading faster. There are 2 possible cache backends morss uses caching to make loading faster. There are 3 possible cache backends
(visible in `morss/crawler.py`): (visible in `morss/crawler.py`):
- `{}`: a simple python in-memory dict() object
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will - `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
be cleared every time the program is run) be cleared every time the program is run)
- `MySQLCacheHandler` - `MySQLCacheHandler`

View File

@ -72,7 +72,6 @@ def custom_handler(follow=None, delay=None, encoding=None):
handlers.append(HTTPRefreshHandler()) handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA)) handlers.append(UAHandler(DEFAULT_UA))
handlers.append(BrowserlyHeaderHandler()) handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding)) handlers.append(EncodingFixHandler(encoding))
if follow: if follow:
@ -466,6 +465,8 @@ class CacheHandler(BaseHandler):
class BaseCache: class BaseCache:
""" Subclasses must behave like a dict """
def __contains__(self, url): def __contains__(self, url):
try: try:
self[url] self[url]

View File

@ -102,7 +102,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc] item_desc = ./div[class=desc]
item_content = ./div[class=content] item_content = ./div[class=content]
base = file:www/sheet.xsl base = file:sheet.xsl
[twitter] [twitter]
mode = html mode = html

View File

@ -85,7 +85,7 @@ def parse(data, url=None, mimetype=None, encoding=None):
for path in ruleset['path']: for path in ruleset['path']:
if fnmatch(url, path): if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0] parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset, encoding=encoding) return parser(data, ruleset, encoding=encoding)
# 2) Try each and every parser # 2) Try each and every parser

View File

@ -40,7 +40,6 @@ LIM_TIME = 2.5 # deletes what's after
DELAY = 10 * 60 # xml cache & ETag cache (in sec) DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec) TIMEOUT = 4 # http timeout (in sec)
THREADS = MAX_ITEM # number of threads (1 for single-threaded)
DEBUG = False DEBUG = False
PORT = 8080 PORT = 8080
@ -137,7 +136,7 @@ def ItemFix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """ """ Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title # check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper(): if item.title is not None and len(item.title) > 20 and item.title.isupper():
item.title = item.title.title() item.title = item.title.title()
# check if it includes link # check if it includes link
@ -200,7 +199,7 @@ def ItemFix(item, feedurl='/'):
# reddit # reddit
if urlparse(feedurl).netloc == 'www.reddit.com': if urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href') match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
if len(match): if len(match):
item.link = match[0] item.link = match[0]
log(item.link) log(item.link)
@ -550,7 +549,7 @@ def cgi_app(environ, start_response):
def middleware(func): def middleware(func):
" Decorator to turn a function into a wsgi middleware " " Decorator to turn a function into a wsgi middleware "
# This is called when parsing the code # This is called when parsing the "@middleware" code
def app_builder(app): def app_builder(app):
# This is called when doing app = cgi_wrapper(app) # This is called when doing app = cgi_wrapper(app)
@ -620,7 +619,7 @@ def cgi_get(environ, start_response):
if urlparse(url).scheme not in ['http', 'https']: if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url url = 'http://' + url
data, con, contenttype, encoding = crawler.adv_get(url=url) data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']: if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page': if options.get == 'page':

View File

@ -137,7 +137,7 @@ def score_all(node):
for child in node: for child in node:
score = score_node(child) score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score)) child.attrib['morss_own_score'] = str(float(score))
if score > 0 or len(list(child.iterancestors())) <= 2: if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score) spread_score(child, score)