Compare commits

...

174 Commits

Author SHA1 Message Date
c5b2df754e Save auto version number
All checks were successful
default / test-lint (push) Successful in 1m31s
default / python-publish (push) Successful in 35s
default / docker-publish-deploy (push) Successful in 1m56s
Fixed #108
2023-06-27 22:36:29 +02:00
6529fdbdd8 Clean up sqlite code
All checks were successful
default / test-lint (push) Successful in 1m26s
default / python-publish (push) Successful in 30s
default / docker-publish-deploy (push) Successful in 1m35s
2023-06-26 01:30:47 +02:00
f4da40fffb actions: fix deploy 2023-06-26 01:29:00 +02:00
d27fc93f75 actions: clean up 2023-06-26 01:28:33 +02:00
dfb2b83c06 actions: fix python setup
Some checks reported warnings
default / publish-deploy (push) Has been cancelled
default / docker-publish (push) Has been cancelled
default / test-lint (push) Successful in 1m33s
2023-06-24 01:50:12 +02:00
4340b678d0 actions: change image
Some checks failed
default / test-lint (push) Failing after 23s
default / publish-deploy (push) Failing after 12s
default / docker-publish (push) Successful in 2m19s
2023-06-23 23:14:32 +02:00
ff9503b0d0 Switch from Drone to Gitea Actions
Some checks failed
default / publish-deploy (push) Failing after 45s
default / docker-publish (push) Failing after 10s
default / test-lint (push) Failing after 11m42s
2023-05-17 22:54:05 +02:00
Nesswit
8bdcd8f386 Add mode option 2023-05-04 16:01:52 +09:00
ea2ebedfcb Added systemd service file
Some checks failed
continuous-integration/drone/push Build is failing
Fixing #94
2022-12-13 23:01:42 +01:00
438c32a312 Remove sqlite & mysql cache backends
Some checks failed
continuous-integration/drone/push Build is failing
Obsoleted since the introduction of diskcache & redis
2022-12-13 22:40:13 +01:00
8b26797e93 README: add recommended install way
Some checks reported errors
continuous-integration/drone/push Build was killed
continuous-integration/drone Build is passing
Part of discussions on #94
2022-12-13 22:07:21 +01:00
e1ed33f320 crawler: improve html iter code
All checks were successful
continuous-integration/drone/push Build is passing
Ignores tags without attributes. Avoids bug with unclosed tags.
2022-02-09 15:57:12 +01:00
b65272daab crawler: accept more meta redirects
All checks were successful
continuous-integration/drone/push Build is passing
2022-02-01 23:32:49 +01:00
4d64afe9cb crawler: fix regression from d6b90448f3
Some checks failed
continuous-integration/drone/push Build is failing
2022-02-01 23:18:16 +01:00
d3b623482d pytest: crawler 2022-02-01 23:16:43 +01:00
32645548c2 pytest: first batch with test_feeds
Some checks failed
continuous-integration/drone/push Build is failing
And multiple related fixes
2022-01-31 08:32:34 +01:00
d6b90448f3 crawler: improve handling of non-ascii urls 2022-01-30 23:27:49 +01:00
da81edc651 log to stderr
Some checks failed
continuous-integration/drone/push Build is failing
2022-01-26 07:57:57 +01:00
4f2895f931 cli: update --help
Some checks failed
continuous-integration/drone/push Build is failing
2022-01-25 22:36:57 +01:00
b2b04691d6 Ability to pass custom data_files location 2022-01-25 22:36:34 +01:00
bfaf7b0fac feeds: clean up default item_link
Some checks failed
continuous-integration/drone/push Build is failing
To be supported by feeds' `_rule_parse`
2022-01-24 16:16:14 +00:00
32d9bc9d9d feeds: proceed with conversion when rules do not match
Some checks failed
continuous-integration/drone/push Build is failing
2022-01-24 09:34:57 +00:00
b138f11771 util: support more data_files location
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-23 12:40:18 +01:00
a01258700d More ordering options
Some checks reported errors
continuous-integration/drone/push Build was killed
2022-01-23 12:27:07 +01:00
4d6d3c9239 wsgi: limit supported mimetypes & return actual mimetype
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-23 11:44:07 +01:00
e81f6b173f readabilite: remove code duplicate 2022-01-23 11:41:32 +01:00
fe5dbf1ce0 wsgi: reuse mimetype table from crawler 2022-01-22 13:22:39 +01:00
fdf9acd32b helper: fix reload code
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-19 13:44:15 +01:00
d05706e056 crawler: fix typo
Some checks reported errors
continuous-integration/drone/push Build was killed
2022-01-19 13:41:12 +01:00
e88a823ada feeds: better handle rulesets without a 'mode' specified
Some checks failed
continuous-integration/drone/push Build is failing
2022-01-19 13:08:33 +01:00
750850c162 crawler: avoid too many .append() 2022-01-19 13:04:33 +01:00
c8669002e4 feeds: exotic xpath in html as well
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-17 14:22:48 +00:00
c524e54d2d feeds: support some exotic xpath rules returning a single string
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-17 13:59:58 +00:00
ef14567d87 Handle morss-helper with setup.py
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-08 16:10:51 +01:00
fb643f5ef1 readabilite: remove unneeded reference to features (overriden by builder)
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-03 18:01:12 +00:00
dbdca910d8 readabilite: fix new parser code & drop PIs
Some checks reported errors
continuous-integration/drone/push Build was killed
2022-01-03 17:51:49 +00:00
9eb19fac04 readabilite: use custom html parser within bs4's lxml parser
All checks were successful
continuous-integration/drone/push Build is passing
Solves the following obscure error:
ValueError: Invalid PI name 'b'xml''
2022-01-03 16:26:17 +00:00
d424e394d1 readabilite: use lxml bs4 parser for speed
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-01 14:52:48 +01:00
3f92787b38 readabilite: limit html comments related issues
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-01 13:58:42 +01:00
afc31eb6e9 readabilite: avoid double parsing of html
All checks were successful
continuous-integration/drone/push Build is passing
2022-01-01 12:51:30 +01:00
87d2fe772d wsgi: fix py2 compatibility 2022-01-01 12:35:41 +01:00
917aa0fbc5 crawler: do not re-save cached response
All checks were successful
continuous-integration/drone/push Build is passing
Otherwise cache never gets invalidated!
2021-12-31 19:28:11 +01:00
3e2b81286f xsl: add link to favicon
To limit error output when failing to fetch favicon.ico
2021-12-31 19:25:53 +01:00
15430a2b83 helper: restore run if no param passed
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-29 23:35:16 +01:00
ecdb74812d Make helper & main.py executable
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-29 15:47:05 +01:00
2c7844942c drone: re order deploy commands
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-29 15:41:29 +01:00
e12cb4567a helper: more debug options 2021-12-29 15:41:03 +01:00
b74365b121 Make helper more posix compliant 2021-12-29 15:40:43 +01:00
2020543469 Make morss-helper executable 2021-12-29 15:37:12 +01:00
676be4a4fe helper: work around for systems only having py3 binary
Some checks are pending
continuous-integration/drone/push Build is running
2021-12-29 14:07:12 +01:00
8870400a6e Clean up morss-helper
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-28 16:30:20 +01:00
8e9cc541b0 Turns out exec array is not supported in HEALTHCHECK
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-28 15:23:40 +01:00
2a7a1b83ec Use alpine:edge to have up-to-date py packages
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-28 13:41:42 +01:00
106f59afa1 docker: shift HEALTHCHECK to helper
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-27 16:08:55 +01:00
ee514e2da3 helper: remove unneeded sudo
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-25 22:00:22 +00:00
e7578e859a Clean up install/exec
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-25 18:21:55 +01:00
3bcb8db974 Improve cloud-init (append & env var) 2021-12-25 11:02:27 +01:00
7751792942 Shift htaccess to README 2021-12-24 18:03:55 +01:00
6e2e5ffa00 README: cloud-init indication for env var
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-24 11:44:56 +01:00
f6da0e1e9b Make use of GUNICORN_CMD_ARGS 2021-12-24 11:44:24 +01:00
2247ba13c5 drone: clean up file
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-23 12:03:50 +01:00
d17b9a2f27 Fix typo in DISKCACHE_DIR var name
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-23 12:02:24 +01:00
5ab45e60af README: scale back on logos
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-16 09:03:43 +00:00
368e4683d6 util: clean paths code
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-16 08:53:18 +00:00
9fd8c7d6af drone: add back install command on deploy
All checks were successful
continuous-integration/drone/push Build is passing
Was lost on the way
2021-12-14 15:42:02 +00:00
89f5d07408 drone: use docker for ssh
Some checks reported errors
continuous-integration/drone/push Build was killed
ssh pipelines require a separate runner
2021-12-14 15:33:38 +00:00
495bd44893 drone: escape full command
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-14 15:16:21 +00:00
ff12dbea39 drone: escape $ sign 2021-12-14 15:12:22 +00:00
7885ab48df drone: deploy 2021-12-14 15:10:46 +00:00
7cdcbd23e1 wsgi: fix another typo
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-14 12:06:08 +00:00
25f283da1f wsgi: fix bug following the removal of the loop
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-14 11:56:55 +00:00
727d14e539 wsgi: use data_files helper
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-14 11:47:10 +00:00
3392ae3973 util: try one more path for data_files
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-14 11:10:26 +00:00
0111ea1749 README: add py stats
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-14 09:45:38 +00:00
def397de5e Dockerfile: add setuptools for gunicorn
All checks were successful
continuous-integration/drone/push Build is passing
Otherwise gets removed with pip
2021-12-12 21:11:38 +00:00
d07aa566ed Dockerfile: keep source files
All checks were successful
continuous-integration/drone/push Build is passing
Will need to be sorted out. `docker-entry.sh` was also deleted.
2021-12-12 18:38:39 +00:00
0ee16d4a7d Install setproctitle from pkg mgrs
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-11 18:50:28 +01:00
ac9859d955 setup: add setproctitle to full install for gunicorn
Some checks failed
continuous-integration/drone/push Build is failing
Also update the readme section regarding the full install
2021-12-11 18:37:24 +01:00
580565da77 Dockerfile: reduce # of steps & image size
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-11 18:28:35 +01:00
b2600152ea docker: remove unneeded git dep
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-10 15:14:50 +00:00
27d8f06308 README: badge for one click deployment
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-10 15:11:57 +00:00
79c4477cfc cloud-init: simplify install
All checks were successful
continuous-integration/drone/push Build is passing
Use pypi package, fix typo in command
2021-12-10 14:05:58 +00:00
c09aa8400a README: add badge for docker architectures
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-10 13:39:21 +00:00
861c275f5b README: badge time
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-10 11:58:20 +00:00
99a855c8fc drone: add arm/v7 docker builds
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-10 00:51:22 +01:00
bef7899cdd setup.py: min python version
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-10 00:44:29 +01:00
7513a3e74d setup: source code & bug tracker link 2021-12-10 00:44:18 +01:00
5bf93b83df drone: warn about qemu deps
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-08 23:11:01 +01:00
e7ecc018c5 drone: working multi arch docker build
Some checks reported errors
continuous-integration/drone/push Build encountered an error
2021-12-07 12:53:06 +01:00
34b7468ba5 drone: fix bug
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-06 21:28:06 +01:00
5336d26204 drone: separate pipeline for py/docker 2021-12-06 21:23:03 +01:00
c7082dcf6c README: typo in link 2021-12-06 20:50:43 +01:00
c785adb4c3 README: nicer links
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-06 08:31:17 +01:00
73798d2fc1 README: ref to pypi & docker hub
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 20:31:44 +01:00
18daf378e8 README: docker hub instructions
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 20:28:03 +01:00
aa2b747c5e drone: mono platform docker
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 20:06:32 +01:00
d390ed9715 drone: docker buildx
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-05 19:51:42 +01:00
0a5a8ceb7f README: add pip pkg instructions
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 19:25:02 +01:00
d2d9d7f22e setup: long desc md
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 19:14:57 +01:00
29ae99c24d setup: long desc from readme
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-05 18:59:14 +01:00
ed06ae6398 setup: clean up author string
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-05 18:48:47 +01:00
c3318d4af0 setup: auto version based on date (yyyymmdd.hhmm) 2021-12-05 18:48:29 +01:00
4e577d3266 setup: proper email to suit pypi
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-05 18:04:24 +01:00
22fc0e076b cloud-init per boot
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-05 16:49:40 +01:00
856be36769 pypi typo
Some checks reported errors
continuous-integration/drone/push Build is failing
continuous-integration/drone Build was killed
2021-12-05 16:30:59 +01:00
397bd61374 drone: lint once pylint is installed...
Some checks reported errors
continuous-integration/drone/push Build was killed
2021-12-05 16:29:08 +01:00
25d63f2aee drone: pypy publish
Some checks failed
continuous-integration/drone/push Build is failing
2021-12-05 16:26:53 +01:00
4a8dca1fbf drone: simplify lint 2021-12-05 16:25:49 +01:00
51f1d330a4 Fn to access data_files & pkg files
Some checks are pending
continuous-integration/drone Build is running
continuous-integration/drone/push Build is passing
2021-12-05 12:09:01 +01:00
11bc9f643e setup: add [dev] for pylint
All checks were successful
continuous-integration/drone Build is passing
continuous-integration/drone/push Build is passing
2021-12-04 14:26:40 +01:00
b600bbc256 drone: Remove buggy pylint pkg
Some checks failed
continuous-integration/drone Build is failing
continuous-integration/drone/push Build is failing
2021-12-04 12:01:10 +01:00
502366db10 cloud-init open port
Some checks failed
continuous-integration/drone Build is failing
continuous-integration/drone/push Build is failing
2021-11-30 23:08:37 +01:00
296b69f40e cloud-init fix typo in pkg name 2021-11-30 22:43:05 +01:00
a2deb90185 drone: remove gevent leftover 2021-11-30 22:42:26 +01:00
72024f2864 Remove default settings for gunicorn 2021-11-26 07:27:05 +01:00
440f7d6797 gunicorn: broader customization
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-25 22:43:40 +01:00
eb47aac6f1 morss: respect timeout settings in all cases
Some checks failed
continuous-integration/drone/push Build is failing
Special treatment of feed fetch not justified and not documented
2021-11-25 22:13:38 +01:00
eca546b890 Change HTTP error code to 404
Some checks failed
continuous-integration/drone/push Build is failing
To tell them apart from 'true' 500 errors
2021-11-25 21:34:46 +01:00
5422d4e14c Move away from gevent
Some checks failed
continuous-integration/drone/push Build is failing
Might not be that reliable
2021-11-25 21:21:59 +01:00
1837eda25f heroku: add WORKERS to env vars
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-24 21:40:46 +01:00
321763710d heroku: make env var customizable 2021-11-24 21:40:34 +01:00
e79c426c6e Add ability to change workers count for gunicorn 2021-11-24 21:36:28 +01:00
92a28be0b0 drone: add gevent dep
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-24 21:25:29 +01:00
70db524664 docker: allow sh run
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-24 21:23:16 +01:00
d8cc07223e readabilite: fix bug when nothing above threshold
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-23 20:53:00 +01:00
37e08f8b4c Include gunicorn and gevent in [full]
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-23 20:32:22 +01:00
8f576adb64 Add gevents deps
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:22:47 +01:00
528b3448e4 Get gevents from pkgs (long to build)
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:14:22 +01:00
f627f1b12b cloud-init: rename py3 packages for ubuntu 2021-11-23 20:13:40 +01:00
53fd97651e Fix [full] install instructions 2021-11-23 20:09:52 +01:00
4dd77b4bcc Add gevent to for deployment to get the latest one
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:07:56 +01:00
deffeebd85 gunicorn: use more aggressive multi-threading settings
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:00:10 +01:00
765e0ba728 Pass py error msg in http headers
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-22 23:22:13 +01:00
12073ac7d8 Simplify cloud-init code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-22 21:53:18 +01:00
6d049935e3 More cloud instructions
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-21 22:38:26 +01:00
7b64c963c4 README: nicer links 2021-11-21 22:37:25 +01:00
6900b9053c Heroku click to deploy
All checks were successful
continuous-integration/drone/push Build is passing
incl. workaround for their weird use of entrypoint
2021-11-21 21:57:06 +01:00
6ec3fb47d1 readabilite: .strip() first to save time
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-15 21:54:07 +01:00
1083f3ffbc crawler: make sure to use HTTPMessage
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-11 10:21:48 +01:00
7eeb1d696c crawler: clean up code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-10 23:25:03 +01:00
e42df98f83 crawler: fix regression brought with 44a6b2591
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-10 23:08:31 +01:00
cb21871c35 crawler: clean up caching code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-08 22:02:23 +01:00
c71cf5d5ce caching: fix diskcache implementation 2021-11-08 21:57:43 +01:00
44a6b2591d crawler: cleaner http header object import 2021-11-07 19:44:36 +01:00
a890536601 morss: comment code a bit 2021-11-07 18:26:07 +01:00
8de309f2d4 caching: add diskcache backend 2021-11-07 18:15:20 +01:00
cbf7b3f77b caching: simplify sqlite code 2021-11-07 18:14:18 +01:00
1ff7e4103c Docker: make it possible to use it as cli
All checks were successful
continuous-integration/drone/push Build is passing
2021-10-31 17:15:08 +01:00
3f12258e98 Docker: default to non-root exec
All checks were successful
continuous-integration/drone/push Build is passing
2021-10-19 22:20:44 +02:00
d023ec8d73 Change default port to 8000 2021-10-19 22:19:59 +02:00
5473b77416 Post-clean up isort
All checks were successful
continuous-integration/drone/push Build is passing
2021-09-21 08:11:04 +02:00
0365232a73 readabilite: custom xpath for article detection
Some checks failed
continuous-integration/drone/push Build is failing
2021-09-21 08:04:45 +02:00
a523518ae8 cache: avoid name collision 2021-09-21 08:04:45 +02:00
52c48b899f readability: better var names 2021-09-21 08:04:45 +02:00
9649cabb1b morss: do not crash on empty pages 2021-09-21 08:04:45 +02:00
0c29102788 drone: full module install 2021-09-21 08:04:45 +02:00
10535a17c5 cache: fix isort 2021-09-21 08:04:45 +02:00
7d86972e58 Add Redis cache backend 2021-09-21 08:04:45 +02:00
62e04549ac Optdeps in REAMDE and Dockerfile 2021-09-21 08:04:45 +02:00
5da7121a77 Fix Options class behaviour 2021-09-21 08:04:45 +02:00
bb82902ad1 Move cache code to its own file 2021-09-21 08:04:45 +02:00
04afa28fe7 crawler: cache pickle'd array 2021-09-21 08:04:45 +02:00
75bb69f0fd Make mysql optdep 2021-09-21 08:04:45 +02:00
97d9dda547 crawler: support 308 redirects 2021-09-21 08:04:45 +02:00
0c31d9f6db ci: add spell check dict 2021-09-21 08:04:45 +02:00
49e29208ef ci: fix spell check 2021-09-21 08:04:45 +02:00
d8d608a4de ci: fix pylint install 2021-09-21 08:04:45 +02:00
5437e40a15 drone: use alpine image (to benefit from pkgs) 2021-09-21 08:04:45 +02:00
6c1f8da692 ci: added pylint (triggered upon error w/ score < 8 only) 2021-09-21 08:04:45 +02:00
a1a26d8209 README: add ci badge 2021-09-21 08:04:45 +02:00
edbb580f33 ci/cd: fix isort args 2021-09-21 08:04:45 +02:00
4fd730b983 Further isort implementation 2021-09-21 08:04:45 +02:00
198353d6b9 ci/cd attempt 2021-09-21 08:04:45 +02:00
0b3e6d7749 Apply isort 2021-09-21 08:04:23 +02:00
49 changed files with 10674 additions and 523 deletions

78
.github/workflows/default.yml vendored Normal file

@@ -0,0 +1,78 @@
name: default
on:
push:
branches:
- master
jobs:
test-lint:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Prepare image
run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
- name: Install dependencies
run: pip3 install .[full] .[dev]
- run: isort --check-only --diff .
- run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
- run: pytest --cov=morss tests
python-publish:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Prepare image
run: apt-get -y update && apt-get -y install python3-pip python3-build
- name: Build package
run: python3 -m build
- name: Publish package
uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.pypi_api_token }}
docker-publish-deploy:
runs-on: ubuntu-latest
container:
image: catthehacker/ubuntu:act-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up QEMU
uses: https://github.com/docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: https://github.com/docker/setup-buildx-action@v2
- name: Login to Docker Hub
uses: https://github.com/docker/login-action@v2
with:
username: ${{ secrets.docker_user }}
password: ${{ secrets.docker_pwd }}
- name: Build and push
uses: https://github.com/docker/build-push-action@v4
with:
context: .
platforms: linux/amd64,linux/arm64,linux/arm/v7
push: true
tags: ${{ secrets.docker_repo }}
- name: Deploy on server
uses: https://github.com/appleboy/ssh-action@v0.1.10
with:
host: ${{ secrets.ssh_host }}
username: ${{ secrets.ssh_user }}
key: ${{ secrets.ssh_key }}
script: morss-update

50
.pylintrc Normal file

@@ -0,0 +1,50 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree
[MESSAGES CONTROL]
disable=missing-function-docstring,
missing-class-docstring,
missing-module-docstring,
wrong-spelling-in-comment,
[REPORTS]
reports=yes
score=yes
[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss
[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes
[VARIABLES]
allow-global-unused-variables=no
init-import=no
[FORMAT]
expected-line-ending-format=LF
indent-string=' '
max-line-length=120
max-module-lines=1000
[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case
include-naming-hint=yes
bad-names=foo, bar
good-names=i, j, k

Dockerfile

@@ -1,8 +1,16 @@
FROM alpine:latest
RUN apk add --no-cache python3 py3-lxml py3-pip py3-wheel git
FROM alpine:edge
ADD . /app
RUN pip3 install --no-cache-dir /app gunicorn
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload --access-logfile - morss
RUN set -ex; \
apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
pip3 install --no-cache-dir /app[full]; \
apk del .build-deps
USER 1000:1000
ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
CMD ["run"]
HEALTHCHECK CMD /bin/sh /app/morss-helper check

234
README.md

@@ -1,11 +1,14 @@
# Morss - Get full-text RSS feeds
_GNU AGPLv3 code_
_Provided logo is CC BY-NC-SA 4.0_
[Homepage](https://morss.it/) •
[Upstream source code](https://git.pictuga.com/pictuga/morss) •
[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
Upstream source code: https://git.pictuga.com/pictuga/morss
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
Homepage: https://morss.it/
[![Build Status](https://ci.pictuga.com/api/badges/pictuga/morss/status.svg)](https://ci.pictuga.com/pictuga/morss)
[![Github Stars](https://img.shields.io/github/stars/pictuga/morss?logo=github)](https://github.com/pictuga/morss/stargazers)
[![Github Forks](https://img.shields.io/github/forks/pictuga/morss?logo=github)](https://github.com/pictuga/morss/network/members)
[![GNU AGPLv3 code](https://img.shields.io/static/v1?label=license&message=AGPLv3)](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
[![Logo is CC BY-NC-SA 4.0](https://img.shields.io/static/v1?label=CC&message=BY-NC-SA%204.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
This tool's goal is to get full-text RSS feeds out of striped RSS feeds,
commonly available on internet. Indeed most newspapers only make a small
@@ -38,7 +41,7 @@ Some features of morss:
- Follow 301/meta redirects
- Recover xml feeds with corrupt encoding
- Supports gzip-compressed http content
- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
- HTTP caching with different backends (in-memory/redis/diskcache)
- Works as server/cli tool
- Deobfuscate various tracking links
@@ -46,38 +49,79 @@ Some features of morss:
### Python package
![Build Python](https://img.shields.io/badge/dynamic/json?label=build%20python&query=$.stages[?(@.name=='python')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
[![PyPI](https://img.shields.io/pypi/v/morss)](https://pypi.org/project/morss/)
[![PyPI Downloads](https://img.shields.io/pypi/dm/morss)](https://pypistats.org/packages/morss)
Simple install (without optional dependencies)
From pip
```shell
pip install morss
```
From git
```shell
pip install git+https://git.pictuga.com/pictuga/morss.git
```
Full installation (including optional dependencies)
From pip
```shell
pip install morss[full]
```
From git
```shell
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
```
The full install includes all the cache backends. Otherwise, only in-memory
cache is available. The full install also includes gunicorn (for more efficient
HTTP handling).
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
C code needs to be compiled). If possible on your distribution, try installing
it with the system package manager.
Dependencies:
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
- pymysql
You may also need:
- Apache, with python-cgi support, to run on a server
- a fast internet connection
### Docker
Build & run
![Build Docker](https://img.shields.io/badge/dynamic/json?label=build%20docker&query=$.stages[?(@.name=='docker')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
[![Docker Hub](https://img.shields.io/docker/pulls/pictuga/morss)](https://hub.docker.com/r/pictuga/morss)
[![Docker Arch](https://img.shields.io/badge/dynamic/json?color=blue&label=docker%20arch&query=$.results[0].images[*].architecture&url=https://hub.docker.com/v2/repositories/pictuga/morss/tags)](https://hub.docker.com/r/pictuga/morss/tags)
From docker hub
With cli
```shell
docker pull pictuga/morss
```
With docker-compose **(recommended)**
```yml
services:
app:
image: pictuga/morss
ports:
- '8000:8000'
```
Build from source
With cli
```shell
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
docker run -p 8080:8080 morss
```
With docker-compose:
With docker-compose
```yml
services:
@@ -85,14 +129,53 @@ services:
build: https://git.pictuga.com/pictuga/morss.git
image: morss
ports:
- '8080:8080'
- '8000:8000'
```
Then execute
```shell
docker-compose build --no-cache --pull
docker-compose up
```
### Cloud providers
One-click deployment:
[![Heroku](https://img.shields.io/static/v1?label=deploy%20to&message=heroku&logo=heroku&color=79589F)](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
[![Google Cloud](https://img.shields.io/static/v1?label=deploy%20to&message=google&logo=google&color=4285F4)](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
``` yml
#cloud-config
packages:
- python3-pip
- python3-wheel
- python3-lxml
- python3-setproctitle
- ca-certificates
write_files:
- path: /etc/environment
append: true
content: |
DEBUG=1
CACHE=diskcache
CACHE_SIZE=1073741824 # 1GiB
- path: /var/lib/cloud/scripts/per-boot/morss.sh
permissions: 744
content: |
#!/bin/sh
/usr/local/bin/morss-helper daemon
runcmd:
- source /etc/environment
- update-ca-certificates
- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
- netfilter-persistent save
- pip install morss[full]
```
## Run
@@ -118,9 +201,25 @@ Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
other clients.
#### Via Docker
#### Using Docker
See above (in Install)
From docker hub
```shell
docker run -p 8000:8000 pictuga/morss
```
From source
```shell
docker run -p 8000:8000 morss
```
With docker-compose **(recommended)**
```shell
docker-compose up
```
#### Using Gunicorn
@@ -133,13 +232,13 @@ gunicorn --preload morss
Running this command should do:
```shell
uwsgi --http :8080 --plugin python --wsgi-file main.py
uwsgi --http :8000 --plugin python --wsgi-file main.py
```
#### Using morss' internal HTTP server
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
The latter should start when you run morss without any argument, on port 8080.
The latter should start when you run morss without any argument, on port 8000.
I'd highly recommend you to use gunicorn or something similar for better
performance.
@@ -177,8 +276,30 @@ For this, you need to make sure your host allows python script execution. This
method uses HTTP calls to fetch the RSS feeds, which will be handled through
`mod_cgi` for example on Apache severs.
Please pay attention to `main.py` permissions for it to be executable. Also
ensure that the provided `/www/.htaccess` works well with your server.
Please pay attention to `main.py` permissions for it to be executable. See below
some tips for the `.htaccess` file.
```htaccess
Options -Indexes
ErrorDocument 404 /cgi/main.py
# Turn debug on for all requests
SetEnv DEBUG 1
# Turn debug on for requests with :debug in the url
SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>
<Files main.py>
allow from all
AddHandler cgi-script .py
Options +ExecCGI
</Files>
```
### As a CLI application
@@ -192,6 +313,12 @@ For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
If using Docker:
```shell
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
```
### As a newsreader hook
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
@@ -226,7 +353,7 @@ Using cache and passing arguments:
```python
>>> import morss
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
>>> cache = '/tmp/morss-cache.db' # sqlite cache location
>>> cache = '/tmp/morss-cache' # diskcache cache location
>>> options = {'csv':True}
>>> xml_string = morss.process(url, cache, options)
>>> xml_string[:50]
@@ -240,11 +367,10 @@ under the hood.
Doing it step-by-step:
```python
import morss, morss.crawler
import morss
url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True) # arguments
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
@@ -263,11 +389,13 @@ arguments to morss is explained in Run above.
The list of arguments can be obtained by running `morss --help`
```
usage: morss [-h] [--post STRING] [--format {rss,json,html,csv}]
[--search STRING] [--clip] [--indent] [--cache] [--force]
[--proxy] [--newest] [--firstlink] [--resolve] [--items XPATH]
[--item_link XPATH] [--item_title XPATH] [--item_content XPATH]
[--item_time XPATH] [--nolink] [--noref] [--silent]
usage: morss [-h] [--post STRING] [--xpath XPATH]
[--format {rss,json,html,csv}] [--search STRING] [--clip]
[--indent] [--cache] [--force] [--proxy]
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
[--item_content XPATH] [--item_time XPATH]
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
url
Get full-text RSS feeds
@@ -275,9 +403,10 @@ Get full-text RSS feeds
positional arguments:
url feed url
optional arguments:
options:
-h, --help show this help message and exit
--post STRING POST request
--xpath XPATH xpath rule to manually detect the article
output:
--format {rss,json,html,csv}
@@ -293,8 +422,9 @@ action:
articles' content), so as to save time
--force force refetch the rss feed and articles
--proxy doesn't fill the articles
--newest return the feed items in chronological order (morss
ohterwise shows the items by appearing order)
--order {first,last,newest,oldest}
order in which to process items (which are however NOT
sorted in the output)
--firstlink pull the first article mentioned in the description
instead of the default link
--resolve replace tracking links with direct links to articles
@@ -309,6 +439,8 @@ custom feeds:
--item_content XPATH entry's content
--item_time XPATH entry's date & time (accepts a wide range of time
formats)
--mode {xml,html,json}
parser to use for the custom feeds
misc:
--nolink drop links, but keeps links' inner text
@@ -330,10 +462,11 @@ servers)
To pass environment variables:
- Docker-cli: `docker run -p 8080:8080 morss --env KEY=value`
- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
- docker-compose: add an `environment:` section in the .yml file
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
- cloud-init: in the `/etc/environment` file
Generic:
@@ -342,6 +475,7 @@ debugging.
- `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
- `DATA_PATH`: to set custom file location for the `www` folder
When parsing long feeds, with a lot of items (100+), morss might take a lot of
time to parse it, or might even run into a memory overflow on some shared
@@ -368,20 +502,22 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
morss uses caching to make loading faster. There are 3 possible cache backends:
- `(nothing/default)`: a simple python in-memory dict-like object.
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
will be cleared every time the program is run). Path can be defined with
`SQLITE_PATH`.
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
- `CACHE=redis`: Redis cache. Connection can be defined with the following
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
- `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
`DISKCACHE_DIR`.
To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
entries.
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
environment variable.
### Content matching
The content of articles is grabbed with our own readability fork. This means

21
app.json Normal file

@@ -0,0 +1,21 @@
{
"stack": "container",
"env": {
"DEBUG": {
"value": 1,
"required": false
},
"GUNICORN_CMD_ARGS": {
"value": "",
"required": false
},
"CACHE": {
"value": "diskcache",
"required": false
},
"CACHE_SIZE": {
"value": 1073741824,
"required": false
}
}
}

3
heroku.yml Normal file

@@ -0,0 +1,3 @@
build:
docker:
web: Dockerfile

0
main.py Normal file → Executable file

47
morss-helper Executable file

@@ -0,0 +1,47 @@
#! /bin/sh
set -ex
if ! command -v python && command -v python3 ; then
alias python='python3'
fi
run() {
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
}
daemon() {
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
}
reload() {
pid=$(pidof 'gunicorn: master [morss]' || true)
# NB. requires python-setproctitle
# `|| true` due to `set -e`
if [ -z "$pid" ]; then
# if gunicorn is not currently running
daemon
else
kill -s USR2 $pid
kill -s WINCH $pid
sleep 1 # give gunicorn some time to reload
kill -s TERM $pid
fi
}
check() {
python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
}
if [ -z "$1" ]; then
run
elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
$@
else
python -m morss $@
fi
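
The `check` target above probes a running instance through morss' own crawler module. A rough Python equivalent, as a sketch: it assumes `morss[full]` is installed and a server is listening on the default port 8000; `adv_get` and its return keys are taken from the crawler diff further down.

```python
# Rough equivalent of `morss-helper check`, sketch only; assumes a local
# morss server on port 8000 and the morss package importable.
from morss import crawler

result = crawler.adv_get('http://localhost:8000/')
print(result['contenttype'], result['encoding'])
print(result['data'][:100])   # raw body bytes
```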

13
morss.service Normal file

@@ -0,0 +1,13 @@
[Unit]
Description=morss server (gunicorn)
After=network.target
[Service]
ExecStart=/usr/local/bin/morss-helper run
ExecReload=/usr/local/bin/morss-helper reload
KillMode=process
Restart=always
User=http
[Install]
WantedBy=multi-user.target

morss/__init__.py

@@ -16,5 +16,10 @@
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `import morss`
# pylint: disable=unused-import,unused-variable
__version__ = ""
from .morss import *
from .wsgi import application
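
Re-exporting `application` at package level is what lets WSGI servers target the bare package name (e.g. `gunicorn morss` elsewhere in this changeset). An illustrative sketch, assuming morss is installed:

```python
# Illustrative only: serve the package-level WSGI callable with the stdlib
# server instead of gunicorn.
import morss
from wsgiref.simple_server import make_server

app = morss.application   # re-exported from morss.wsgi by __init__.py above
make_server('127.0.0.1', 8000, app).serve_forever()
```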

morss/__main__.py

@@ -20,9 +20,7 @@
import os
import sys
from . import wsgi
from . import cli
from . import cli, wsgi
from .morss import MorssException

122
morss/caching.py Normal file

@@ -0,0 +1,122 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import threading
import time
from collections import OrderedDict
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
class BaseCache:
""" Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url):
try:
self[url]
except KeyError:
return False
else:
return True
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
def __setitem__(self, key, data):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, data)
try:
import redis # isort:skip
except ImportError:
pass
class RedisCacheHandler(BaseCache):
def __init__(self, host='localhost', port=6379, db=0, password=None):
self.r = redis.Redis(host=host, port=port, db=db, password=password)
def __getitem__(self, key):
return self.r.get(key)
def __setitem__(self, key, data):
self.r.set(key, data)
try:
import diskcache # isort:skip
except ImportError:
pass
class DiskCacheHandler(BaseCache):
def __init__(self, directory=None, **kwargs):
self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)
def __del__(self):
self.cache.close()
def trim(self):
self.cache.cull()
def __getitem__(self, key):
return self.cache[key]
def __setitem__(self, key, data):
self.cache.set(key, data)
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'redis':
default_cache = RedisCacheHandler(
host = os.getenv('REDIS_HOST', 'localhost'),
port = int(os.getenv('REDIS_PORT', 6379)),
db = int(os.getenv('REDIS_DB', 0)),
password = os.getenv('REDIS_PWD', None)
)
elif os.environ['CACHE'] == 'diskcache':
default_cache = DiskCacheHandler(
directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
size_limit = CACHE_SIZE # in Bytes
)
else:
default_cache = CappedDict()
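
Each backend above is a dict-like object keyed by URL, which is all the crawler's `CacheHandler` relies on. A minimal sketch of that contract, assuming the module layout introduced in this diff and that `morss[full]` (for `diskcache`) is installed; the environment variables are read at import time:

```python
# Minimal sketch of the dict-like cache contract, assuming the caching
# module above; CACHE / DISKCACHE_DIR must be set before the import.
import os
os.environ.setdefault('CACHE', 'diskcache')
os.environ.setdefault('DISKCACHE_DIR', '/tmp/morss-diskcache')

from morss.caching import default_cache

# the crawler stores pickled response dicts under the request URL
default_cache['http://example.com/feed.xml'] = b'pickled-response-bytes'
print('http://example.com/feed.xml' in default_cache)  # BaseCache.__contains__

default_cache.trim()      # cut back to CACHE_SIZE (a no-op for some backends)
default_cache.autotrim()  # re-run trim() every CACHE_LIFESPAN seconds
```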

morss/cli.py

@@ -15,12 +15,11 @@
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
import argparse
import os.path
import sys
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options
from .morss import FeedFetch, FeedFormat, FeedGather, Options
def cli_app():
@@ -33,6 +32,7 @@ def cli_app():
parser.add_argument('url', help='feed url')
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
group = parser.add_argument_group('output')
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
@@ -44,7 +44,7 @@ def cli_app():
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss ohterwise shows the items by appearing order)')
group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
@@ -54,6 +54,7 @@ def cli_app():
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
group = parser.add_argument_group('misc')
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
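
The new `--order`, `--mode` and `--xpath` switches surface in the Python API as well, since the README example passes options to `morss.process` as a plain dict. A hedged sketch, assuming the dict keys map 1:1 to the flag names added above:

```python
# Sketch only: option keys are assumed to mirror the CLI flags (--order, --mode)
import morss

url = 'http://feeds.bbci.co.uk/news/rss.xml'
cache = '/tmp/morss-cache'          # diskcache location, as in the README example
options = {'order': 'newest'}       # e.g. add {'mode': 'html'} for custom feed rules

xml_string = morss.process(url, cache, options)
print(xml_string[:300])
```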

morss/crawler.py

@@ -16,30 +16,37 @@
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import sys
import zlib
from io import BytesIO, StringIO
import re
import chardet
from cgi import parse_header
import time
import threading
import pickle
import random
import re
import sys
import time
import zlib
from cgi import parse_header
from collections import OrderedDict
from io import BytesIO, StringIO
import chardet
from .caching import default_cache
try:
# python 2
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib import quote
from urlparse import urlparse, urlunparse
import mimetools
from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list)
from urlparse import urlsplit
except ImportError:
# python 3
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib.parse import quote
from urllib.parse import urlparse, urlunparse
import email
from email import message_from_string
from http.client import HTTPMessage
from urllib.parse import quote, urlsplit
from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list)
try:
# python 2
@@ -49,14 +56,12 @@ except NameError:
basestring = unicode = str
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
'json': ['application/json'],
}
DEFAULT_UAS = [
@@ -99,7 +104,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
encoding= detect_encoding(data, con)
return {
'data':data,
'data': data,
'url': con.geturl(),
'con': con,
'contenttype': contenttype,
@@ -107,9 +112,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
}
def custom_opener(follow=None, delay=None):
handlers = []
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
# as per urllib2 source code, these Handelers are added first
# *unless* one of the custom handlers inherits from one of them
#
@@ -127,20 +130,23 @@ def custom_opener(follow=None, delay=None):
# http_error_* are run until sth is returned (other than None). If they all
# return nothing, a python error is raised
#handlers.append(DebugHandler())
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
handlers.append(HTTPCookieProcessor())
handlers.append(GZIPHandler())
handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler())
handlers = [
#DebugHandler(),
SizeLimitHandler(500*1024), # 500KiB
HTTPCookieProcessor(),
GZIPHandler(),
HTTPAllRedirectHandler(),
HTTPEquivHandler(),
HTTPRefreshHandler(),
UAHandler(random.choice(DEFAULT_UAS)),
BrowserlyHeaderHandler(),
EncodingFixHandler(),
]
if follow:
handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(force_min=delay))
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
return build_opener(*handlers)
@@ -157,10 +163,20 @@ def is_ascii(string):
return True
def soft_quote(string):
" url-quote only when not a valid ascii string "
if is_ascii(string):
return string
else:
return quote(string.encode('utf-8'))
def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes):
url = url.decode()
url = url.decode('utf-8')
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL:
@@ -173,18 +189,19 @@ def sanitize_url(url):
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
# https://stackoverflow.com/a/4391299
parts = list(urlparse(url))
parts = urlsplit(url)
for i in range(len(parts)):
if not is_ascii(parts[i]):
if i == 1:
parts[i] = parts[i].encode('idna').decode('ascii')
parts = parts._replace(
netloc=parts.netloc.replace(
parts.hostname,
parts.hostname.encode('idna').decode('ascii')
),
path=soft_quote(parts.path),
query=soft_quote(parts.query),
fragment=soft_quote(parts.fragment),
)
else:
parts[i] = quote(parts[i].encode('utf-8'))
return urlunparse(parts)
return parts.geturl()
class RespDataHandler(BaseHandler):
@@ -351,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
def iter_html_tag(html_str, tag_name):
" To avoid parsing whole pages when looking for a simple tag "
re_tag = r'<%s(\s*[^>])*>' % tag_name
re_tag = r'<%s\s+[^>]+>' % tag_name
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
for tag_match in re.finditer(re_tag, html_str):
@@ -397,13 +414,18 @@ class HTTPEquivHandler(RespStrHandler):
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
class HTTPAllRedirectHandler(HTTPRedirectHandler):
def http_error_308(self, req, fp, code, msg, headers):
return self.http_error_301(req, fp, 301, msg, headers)
class HTTPRefreshHandler(BaseHandler):
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
def http_response(self, req, resp):
if 200 <= resp.code < 300:
if resp.headers.get('refresh'):
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
match = re.search(regex, resp.headers.get('refresh'))
if match:
@@ -419,65 +441,105 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response
def parse_headers(text=u'\n\n'):
if sys.version_info[0] >= 3:
# python 3
return message_from_string(text, _class=HTTPMessage)
else:
# python 2
return HTTPMessage(StringIO(text))
def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg
return resp
class CacheHandler(BaseHandler):
" Cache based on etags/last-modified "
private_cache = False # Websites can indicate whether the page should be
# cached by CDNs (e.g. shouldn't be the case for
# private/confidential/user-specific pages.
# With this setting, decide whether (False) you want
# the cache to behave like a CDN (i.e. don't cache
# private pages), or (True) to behave like a end-cache
# private pages. If unsure, False is the safest bet.
privacy = 'private' # Websites can indicate whether the page should be cached
# by CDNs (e.g. shouldn't be the case for
# private/confidential/user-specific pages. With this
# setting, decide whether you want the cache to behave
# like a CDN (i.e. don't cache private pages, 'public'),
# or to behave like a end-user private pages
# ('private'). If unsure, 'public' is the safest bet,
# but many websites abuse this feature...
# NB. This overrides all the other min/max/policy settings.
handler_order = 499
def __init__(self, cache=None, force_min=None):
def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
self.cache = cache or default_cache
self.force_min = force_min
# Servers indicate how long they think their content is "valid".
# With this parameter (force_min, expressed in seconds), we can
# override the validity period (i.e. bypassing http headers)
# Special values:
# -1: valid forever, i.e. use the cache no matter what (and fetch
# the page online if not present in cache)
# 0: valid zero second, i.e. force refresh
# -2: same as -1, i.e. use the cache no matter what, but do NOT
# fetch the page online if not present in cache, throw an
# error instead
self.force_max = force_max
self.policy = policy # can be cached/refresh/offline/None (default)
# Servers indicate how long they think their content is "valid". With
# this parameter (force_min/max, expressed in seconds), we can override
# the validity period (i.e. bypassing http headers)
# Special choices, via "policy":
# cached: use the cache no matter what (and fetch the page online if
# not present in cache)
# refresh: valid zero second, i.e. force refresh
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead
# None: just follow protocols
# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
def load(self, url):
try:
out = list(self.cache[url])
data = pickle.loads(self.cache[url])
except KeyError:
out = [None, None, unicode(), bytes(), 0]
data = None
if sys.version_info[0] >= 3:
out[2] = email.message_from_string(out[2] or unicode()) # headers
else:
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
data['headers'] = parse_headers(data['headers'] or unicode())
return out
return data
def save(self, url, code, msg, headers, data, timestamp):
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
def save(self, key, data):
data['headers'] = unicode(data['headers'])
self.cache[key] = pickle.dumps(data, 0)
def is_cached(self, url):
return self.load(url)[0] is not None
def cached_response(self, req, fallback=None):
req.from_morss_cache = True
def cached_response(self, req):
# this does NOT check whether it's already cached, use with care
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
data = self.load(req.get_full_url())
# return the cache as a response
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
if data is not None:
# return the cache as a response
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
resp.msg = data['msg']
return resp
return resp
else:
return fallback
def save_response(self, req, resp):
if req.from_morss_cache:
# do not re-save (would reset the timing)
return resp
data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
self.save(req.get_full_url(), {
'code': resp.code,
'msg': resp.msg,
'headers': resp.headers,
'data': data,
'timestamp': time.time()
})
fp = BytesIO(data)
old_resp = resp
@@ -487,13 +549,16 @@ class CacheHandler(BaseHandler):
return resp
def http_request(self, req):
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
req.from_morss_cache = False # to track whether it comes from cache
if 'etag' in headers:
req.add_unredirected_header('If-None-Match', headers['etag'])
data = self.load(req.get_full_url())
if 'last-modified' in headers:
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
if data is not None:
if 'etag' in data['headers']:
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
if 'last-modified' in data['headers']:
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
return req
@@ -502,60 +567,74 @@ class CacheHandler(BaseHandler):
# If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response'
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# Here, we try to see whether we want to use data from cache (i.e.
# return 'resp'), or whether we want to refresh the content (return
# 'None')
# some info needed to process everything
cache_control = parse_http_list(headers.get('cache-control', ()))
cache_control += parse_http_list(headers.get('pragma', ()))
data = self.load(req.get_full_url())
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
if data is not None:
# some info needed to process everything
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
cache_control += parse_http_list(data['headers'].get('pragma', ()))
cache_age = time.time() - timestamp
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
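# e.g. with "Cache-Control: max-age=60, no-cache, private" (and no Pragma):
# cache_control -> ['max-age=60', 'no-cache', 'private']
# cc_list -> ['no-cache', 'private']
# cc_values -> {'max-age': '60'}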
# list in a simple way what to do when
if self.force_min == -2:
if code is not None:
# already in cache, perfect, use cache
return self.cached_response(req)
cache_age = time.time() - data['timestamp']
else:
# raise an error, via urllib handlers
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
return resp
# list in a simple way what to do in special cases
elif code is None:
# cache empty, refresh
if data is not None and 'private' in cc_list and self.privacy == 'public':
# private data but public cache, do not use cache
# privacy concern, so handled first and foremost
# (and doesn't need to be addressed anymore afterwards)
return None
elif self.force_min == -1:
# force use cache
return self.cached_response(req)
elif self.policy == 'offline':
# use cache, or return an error
return self.cached_response(
req,
error_response(409, 'Conflict', req.get_full_url())
)
elif self.force_min == 0:
elif self.policy == 'cached':
# use cache, or fetch online
return self.cached_response(req, None)
elif self.policy == 'refresh':
# force refresh
return None
elif code == 301 and cache_age < 7*24*3600:
elif data is None:
# we have already settled all the cases that don't need the cache.
# all the following ones need the cached item
return None
elif self.force_max is not None and cache_age > self.force_max:
# older than we want, refresh
return None
elif self.force_min is not None and cache_age < self.force_min:
# recent enough, use cache
return self.cached_response(req)
elif data['code'] == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh)
return self.cached_response(req)
elif (self.force_min is None or self.force_min > 0) and ('no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh
# if the same settings are used all along, this section shouldn't be
# of any use, since the page wouldn't be cached in the first place
# the check is only performed "just in case"
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
# kindly follow web servers' indications and refresh. If the same
# settings are used all along, this section shouldn't be of any use,
# since the page wouldn't be cached in the first place; the check is
# only performed "just in case"
# NB. NOT respected if force_min is set
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust it; if not, use force_min=0), use cache
return self.cached_response(req)
elif self.force_min is not None and self.force_min > cache_age:
# still recent enough for us, use cache
# server says it's still fine (and we trust it; if not, use overrides), use cache
return self.cached_response(req)
else:
@@ -563,22 +642,21 @@ class CacheHandler(BaseHandler):
return None
def http_response(self, req, resp):
# code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
# NB. It might re-save requests pulled from cache, which will reset the time() to the latest, i.e. lengthen its useful life
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
if resp.code == 304 and self.is_cached(resp.url):
if resp.code == 304 and resp.url in self.cache:
# we are hopefully the first after the HTTP handler, so no need
# to re-run all the *_response
# here: cached page, returning from cache
return self.cached_response(req)
elif ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
# kindly follow web servers indications (do not save & return)
return resp
@@ -594,142 +672,6 @@ class CacheHandler(BaseHandler):
https_response = http_response
class BaseCache:
""" Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url):
try:
self[url]
except KeyError:
return False
else:
return True
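Any dict-like object that raises KeyError on a miss can therefore act as a backend; a minimal hypothetical example:
class PlainDictCache(dict, BaseCache):
    # __getitem__/__setitem__/__contains__ come from dict;
    # trim() stays the no-op inherited from BaseCache
    pass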
import sqlite3
class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
self.trim()
def __del__(self):
self.con.close()
def trim(self):
with self.con:
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
value = list(value)
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
def __init__(self, user, password, database, host='localhost'):
self.user = user
self.password = password
self.database = database
self.host = host
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
self.trim()
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def trim(self):
with self.cursor() as cursor:
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
with self.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
(url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
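# e.g. with CACHE_SIZE = 2, a trim() on a dict holding three entries
# evicts the oldest one (FIFO eviction via popitem(False))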
def __setitem__(self, key, value):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, value)
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST', 'localhost')
)
elif os.environ['CACHE'] == 'sqlite':
if 'SQLITE_PATH' in os.environ:
path = os.getenv('SQLITE_PATH')
else:
path = ':memory:'
default_cache = SQLiteCache(path)
else:
default_cache = CappedDict()
if 'IGNORE_SSL' in os.environ:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

View File

@@ -90,9 +90,6 @@ item_updated = updated
[html]
mode = html
path =
http://localhost/
title = //div[@id='header']/h1
desc = //div[@id='header']/p
items = //div[@id='content']/div

View File

@@ -15,35 +15,31 @@
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
from datetime import datetime
import re
import json
import csv
import json
import re
from copy import deepcopy
from datetime import datetime
from fnmatch import fnmatch
from lxml import etree
from dateutil import tz
import dateutil.parser
from copy import deepcopy
import lxml.html
from dateutil import tz
from lxml import etree
from .readabilite import parse as html_parse
from .util import *
json.encoder.c_make_encoder = None
try:
# python 2
from StringIO import StringIO
from ConfigParser import RawConfigParser
from StringIO import StringIO
except ImportError:
# python 3
from io import StringIO
from configparser import RawConfigParser
from io import StringIO
try:
# python 2
@@ -55,7 +51,7 @@ except NameError:
def parse_rules(filename=None):
if not filename:
filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
filename = pkg_path('feedify.ini')
config = RawConfigParser()
config.read(filename)
@@ -69,18 +65,10 @@ def parse_rules(filename=None):
# for each rule
if rules[section][arg].startswith('file:'):
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
for path in paths:
try:
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
except IOError:
pass
path = data_path('www', rules[section][arg][5:])
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:]
@@ -106,7 +94,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
if 'path' in ruleset:
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
return parser(data, ruleset, encoding=encoding)
# 2) Try each and every parser
@@ -126,7 +114,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
else:
# parsing worked, now we try the rulesets
ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
# 'path' as they should have been caught beforehand
# try anyway if no 'mode' specified
@@ -199,11 +187,12 @@ class ParserBase(object):
return self.convert(FeedHTML).tostring(**k)
def convert(self, TargetParser):
if type(self) == TargetParser:
return self
target = TargetParser()
if type(self) == TargetParser and self.rules == target.rules:
# check both type *AND* rules (e.g. when going from freeform xml to rss)
return self
for attr in target.dic:
if attr == 'items':
for item in self.items:
@@ -372,7 +361,13 @@ class ParserXML(ParserBase):
def rule_search_all(self, rule):
try:
return self.root.xpath(rule, namespaces=self.NSMAP)
match = self.root.xpath(rule, namespaces=self.NSMAP)
if isinstance(match, str):
# some xpath rules return a single string instead of a list (e.g. concat())
return [match,]
else:
return match
except etree.XPathEvalError:
return []
@@ -435,7 +430,7 @@ class ParserXML(ParserBase):
match = self.rule_search(rrule)
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if key is not None:
@@ -446,7 +441,7 @@ class ParserXML(ParserBase):
self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
if self.rules['mode'] == 'html':
if self.rules.get('mode') == 'html':
match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom
@@ -495,7 +490,14 @@ class ParserHTML(ParserXML):
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
rule = re.sub(pattern, repl, rule)
return self.root.xpath(rule)
match = self.root.xpath(rule)
if isinstance(match, str):
# for some xpath rules, see XML parser
return [match,]
else:
return match
except etree.XPathEvalError:
return []
@@ -697,7 +699,7 @@ class Feed(object):
try:
setattr(item, attr, new[attr])
except (IndexError, TypeError):
except (KeyError, IndexError, TypeError):
pass
return item
@@ -813,6 +815,8 @@ class FeedJSON(Feed, ParserJSON):
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')

View File

@@ -16,30 +16,26 @@
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import re
import sys
import time
from datetime import datetime
from dateutil import tz
from fnmatch import fnmatch
import re
import lxml.etree
import lxml.html
from dateutil import tz
from . import feeds
from . import crawler
from . import readabilite
from . import caching, crawler, feeds, readabilite
try:
# python 2
from httplib import HTTPException
from urlparse import urlparse, urljoin, parse_qs
from urlparse import parse_qs, urljoin, urlparse
except ImportError:
# python 3
from http.client import HTTPException
from urllib.parse import urlparse, urljoin, parse_qs
from urllib.parse import parse_qs, urljoin, urlparse
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
@@ -64,7 +60,7 @@ def log(txt):
else:
# when using internal server or cli
print(repr(txt))
print(repr(txt), file=sys.stderr)
def len_html(txt):
@@ -91,12 +87,12 @@ class Options:
else:
self.options = options or {}
def __getattr__(self, key):
def __getattr__(self, key, default=None):
if key in self.options:
return self.options[key]
else:
return None
return default
def __setitem__(self, key, value):
self.options[key] = value
@@ -104,12 +100,7 @@ class Options:
def __contains__(self, key):
return key in self.options
def get(self, key, default=None):
if key in self.options:
return self.options[key]
else:
return default
get = __getitem__ = __getattr__
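In effect, attribute, item and get() access now share one code path; a quick illustration (values are hypothetical):
options = Options({'order': 'newest'})
options.order                      # 'newest'
options['order']                   # 'newest'
options.missing                    # None
options.get('missing', 'fallback') # 'fallback' (the default is honoured)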
def ItemFix(item, options, feedurl='/'):
@@ -204,21 +195,20 @@ def ItemFill(item, options, feedurl='/', fast=False):
log(item.link)
# download
delay = -1
if fast or options.fast:
if fast or options.cache:
# force cache, don't fetch
delay = -2
policy = 'offline'
elif options.force:
# force refresh
delay = 0
policy = 'refresh'
else:
delay = 24*60*60 # 24h
policy = None
try:
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
@@ -228,7 +218,11 @@ def ItemFill(item, options, feedurl='/', fast=False):
log('non-text page')
return True
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if not req['data']:
log('empty page')
return True
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
if out is not None:
item.content = out
@@ -272,11 +266,17 @@ def FeedFetch(url, options):
# fetch feed
delay = DELAY
if options.force:
delay = 0
if options.cache:
policy = 'offline'
elif options.force:
policy = 'refresh'
else:
policy = None
try:
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
@@ -287,11 +287,14 @@ def FeedFetch(url, options):
ruleset['items'] = options.items
if options.mode:
ruleset['mode'] = options.mode
ruleset['title'] = options.get('title', '//head/title')
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
ruleset['item_title'] = options.get('item_title', '.')
ruleset['item_link'] = options.get('item_link', './@href|.//a/@href|ancestor::a/@href')
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
if options.item_content:
ruleset['item_content'] = options.item_content
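As a hedged sketch of what these overrides allow (URL and XPath values are purely illustrative), an arbitrary html page can be turned into a feed programmatically:
from morss.morss import process
out = process('https://example.com/blog', options={
    'items': '//article',     # one matched node per feed item
    'item_title': './/h2',
    'item_link': './/a/@href',
})  # output format follows the usual options (rss by default, assumption)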
@@ -329,16 +332,23 @@ def FeedGather(rss, url, options):
if options.cache:
max_time = 0
if options.newest:
# :newest takes the newest items
now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
# sort
sorted_items = list(rss.items)
else:
# default behavior, take the first items (in appearing order)
sorted_items = list(rss.items)
if options.order == 'last':
# `first` does nothing from a practical standpoint, so only `last` needs
# to be addressed
sorted_items = reversed(sorted_items)
elif options.order in ['newest', 'oldest']:
now = datetime.now(tz.tzutc())
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
if options.order == 'newest':
sorted_items = reversed(sorted_items)
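# for illustration, with hypothetical publication dates 01-Jan, 03-Jan, 02-Jan
# in appearing order: (default/first) -> 01, 03, 02 ; :order=last -> 02, 03, 01 ;
# :order=newest -> 03, 02, 01 ; :order=oldest -> 01, 02, 03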
for i, item in enumerate(sorted_items):
# hard cap
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
@@ -351,6 +361,7 @@ def FeedGather(rss, url, options):
item = ItemFix(item, options, url)
# soft cap
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if ItemFill(item, options, url, True) is False:
@@ -417,7 +428,7 @@ def process(url, cache=None, options=None):
options = Options(options)
if cache:
crawler.default_cache = crawler.SQLiteCache(cache)
caching.default_cache = caching.DiskCacheHandler(cache)
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)

View File

@@ -15,22 +15,22 @@
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import bs4.builder._lxml
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
import re
import lxml.html.soupparser
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
def default_parser(self, encoding):
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
def parse(data, encoding=None):
if encoding:
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
kwargs = {'from_encoding': encoding} if encoding else {}
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs)
def count_words(string):
@@ -43,6 +43,8 @@ def count_words(string):
if string is None:
return 0
string = string.strip()
i = 0
count = 0
@@ -152,15 +154,20 @@ def score_all(node):
for child in node:
score = score_node(child)
child.attrib['morss_own_score'] = str(float(score))
set_score(child, score, 'morss_own_score')
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def set_score(node, value, label='morss_score'):
try:
node.attrib[label] = str(float(value))
except KeyError:
# catch issues with e.g. html comments
pass
def get_score(node):
@@ -200,6 +207,12 @@ def clean_root(root, keep_threshold=None):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
# remove comments
if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
parent.remove(node)
return
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
@@ -210,7 +223,7 @@ def clean_node(node, keep_threshold=None):
return
# high score, so keep
if keep_threshold is not None and get_score(node) >= keep_threshold:
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
return
gdparent = parent.getparent()
@@ -231,11 +244,6 @@ def clean_node(node, keep_threshold=None):
parent.remove(node)
return
# remove comments
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
parent.remove(node)
return
# remove if too many kids & too high link density
wc = count_words(node.text_content())
if wc != 0 and len(list(node.iter())) > 3:
@@ -293,28 +301,26 @@ def clean_node(node, keep_threshold=None):
gdparent.insert(gdparent.index(parent)+1, new_node)
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
ancestorsA = list(nodeA.iterancestors())
ancestorsB = list(nodeB.iterancestors())
def lowest_common_ancestor(node_a, node_b, max_depth=None):
ancestors_a = list(node_a.iterancestors())
ancestors_b = list(node_b.iterancestors())
if max_depth is not None:
ancestorsA = ancestorsA[:max_depth]
ancestorsB = ancestorsB[:max_depth]
ancestors_a = ancestors_a[:max_depth]
ancestors_b = ancestors_b[:max_depth]
ancestorsA.insert(0, nodeA)
ancestorsB.insert(0, nodeB)
ancestors_a.insert(0, node_a)
ancestors_b.insert(0, node_b)
for ancestorA in ancestorsA:
if ancestorA in ancestorsB:
return ancestorA
for ancestor_a in ancestors_a:
if ancestor_a in ancestors_b:
return ancestor_a
return nodeA # should always find one though, at least <html/>, but needed for max_depth
return node_a # should always find one though, at least <html/>, but needed for max_depth
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in)
def get_best_node(html, threshold=5):
# score all nodes
score_all(html)
# rank all nodes (largest to smallest)
@@ -331,9 +337,33 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
else:
best = ranked_nodes[0]
return best
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in)
if xpath is not None:
xpath_match = html.xpath(xpath)
if len(xpath_match):
best = xpath_match[0]
else:
best = get_best_node(html, threshold)
else:
best = get_best_node(html, threshold)
if best is None:
# if threshold not met
return None
# clean up
if not debug:
keep_threshold = get_score(ranked_nodes[0]) * 3/4
keep_threshold = get_score(best) * 3/4
clean_root(best, keep_threshold)
# check for spammy content (links only)
@@ -352,6 +382,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
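The new xpath parameter lets callers pin the article container instead of relying on the scoring heuristic; a hedged sketch (the XPath value is illustrative):
from morss import crawler, readabilite
req = crawler.adv_get('https://example.com/some-article')
article = readabilite.get_article(
    req['data'],
    url=req['url'],
    encoding_in=req['encoding'],
    encoding_out='unicode',
    xpath='//main',  # falls back to automatic scoring if nothing matches
)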

57
morss/util.py Normal file
View File

@@ -0,0 +1,57 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import os.path
import sys
def pkg_path(*path_elements):
return os.path.join(os.path.dirname(__file__), *path_elements)
data_path_base = None
def data_path(*path_elements):
global data_path_base
path = os.path.join(*path_elements)
if data_path_base is not None:
return os.path.join(data_path_base, path)
bases = [
os.path.join(sys.prefix, 'share/morss'), # when installed as root
pkg_path('../../../share/morss'),
pkg_path('../../../../share/morss'),
pkg_path('../share/morss'), # for `pip install --target=dir morss`
pkg_path('..'), # when running from source tree
]
if 'DATA_PATH' in os.environ:
bases.append(os.environ['DATA_PATH'])
for base in bases:
full_path = os.path.join(base, path)
if os.path.isfile(full_path):
data_path_base = os.path.abspath(base)
return data_path(path)
else:
raise IOError()
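A short usage sketch (file names are illustrative):
from morss.util import pkg_path, data_path
pkg_path('feedify.ini')         # always resolves inside the installed package
data_path('www', 'index.html')  # first 'share/morss' location holding the file; raises IOError if none does
# NB. setting the DATA_PATH environment variable adds one more candidate location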

View File

@@ -15,16 +15,16 @@
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import cgitb
import mimetypes
import os.path
import re
import lxml.etree
import cgitb
import wsgiref.util
import wsgiref.simple_server
import sys
import wsgiref.handlers
import mimetypes
import wsgiref.simple_server
import wsgiref.util
import lxml.etree
try:
# python 2
@@ -33,13 +33,12 @@ except ImportError:
# python 3
from urllib.parse import unquote
from . import crawler
from . import readabilite
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, log, TIMEOUT, DELAY, MorssException
from . import caching, crawler, readabilite
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
MorssException, Options, log)
from .util import data_path
PORT = int(os.getenv('PORT', 8080))
PORT = int(os.getenv('PORT', 8000))
def parse_options(options):
@@ -169,26 +168,21 @@ def cgi_file_handler(environ, start_response, app):
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths)
paths = [
os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)
]
try:
path = data_path('www', url)
f = open(path, 'rb')
for path in paths:
try:
f = open(path, 'rb')
except IOError:
# problem with file (cannot open or not found)
pass
except IOError:
# problem with file (cannot open or not found)
continue
else:
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
else:
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
# regex didn't validate or no file found
return app(environ, start_response)
@@ -198,32 +192,36 @@ def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if options['get'] in ('page', 'article'):
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
if req['contenttype'] in crawler.MIMETYPE['html']:
if options['get'] == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
kill_tags = ['script', 'iframe', 'noscript']
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
elif options.get == 'article':
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
else: # i.e. options['get'] == 'article'
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
output = req['data']
else:
raise MorssException('no :get option passed')
raise MorssException('unsupported mimetype')
else:
output = req['data']
raise MorssException('no :get option passed')
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
@@ -253,7 +251,7 @@ def cgi_error_handler(environ, start_response, app):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html'}
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
@@ -283,13 +281,13 @@ def cgi_handle_request():
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
def get_environ(self):
env = super().get_environ()
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
env['REQUEST_URI'] = self.path
return env
def cgi_start_server():
crawler.default_cache.autotrim()
caching.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
@@ -297,4 +295,4 @@ def cgi_start_server():
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
crawler.default_cache.autotrim()
caching.default_cache.autotrim()

View File

@@ -1,24 +1,60 @@
from setuptools import setup
from datetime import datetime
from glob import glob
from setuptools import setup
def get_version():
with open('morss/__init__.py', 'r+') as file:
lines = file.readlines()
version = '' # fallback, in case no hard coded version line is found
# look for hard coded version number
for i in range(len(lines)):
if lines[i].startswith('__version__'):
version = lines[i].split('"')[1]
break
# create (& save) one if none found
if version == '':
version = datetime.now().strftime('%Y%m%d.%H%M')
lines[i] = '__version__ = "' + version + '"\n'
file.seek(0)
file.writelines(lines)
# return version number
return version
package_name = 'morss'
setup(
name = package_name,
version = get_version(),
description = 'Get full-text RSS feeds',
author = 'pictuga, Samuel Marks',
author_email = 'contact at pictuga dot com',
long_description = open('README.md').read(),
long_description_content_type = 'text/markdown',
author = 'pictuga',
author_email = 'contact@pictuga.com',
url = 'http://morss.it/',
download_url = 'https://git.pictuga.com/pictuga/morss',
project_urls = {
'Source': 'https://git.pictuga.com/pictuga/morss',
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
},
license = 'AGPL v3',
packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
extras_require = {
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
},
python_requires = '>=2.7',
package_data = {package_name: ['feedify.ini']},
data_files = [
('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
],
entry_points = {
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
})
'console_scripts': [package_name + '=' + package_name + '.__main__:main'],
},
scripts = ['morss-helper'],
)
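With these extras declared, the optional dependencies can be pulled in through pip's extras syntax, e.g. pip install 'morss[full]' for the redis/diskcache/gunicorn/setproctitle stack, or pip install 'morss[dev]' for the lint and test tooling.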

60
tests/conftest.py Normal file
View File

@@ -0,0 +1,60 @@
import os
import os.path
import threading
import pytest
try:
# python2
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from SimpleHTTPServer import SimpleHTTPRequestHandler
except:
# python3
from http.server import (BaseHTTPRequestHandler, HTTPServer,
SimpleHTTPRequestHandler)
class HTTPReplayHandler(SimpleHTTPRequestHandler):
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
directory = os.path.join(os.path.dirname(__file__), './samples/')
__init__ = BaseHTTPRequestHandler.__init__
def do_GET(self):
path = self.translate_path(self.path)
if os.path.isdir(path):
f = self.list_directory(path)
else:
f = open(path, 'rb')
try:
self.copyfile(f, self.wfile)
finally:
f.close()
class MuteHTTPServer(HTTPServer):
def handle_error(self, request, client_address):
# mute errors
pass
def make_server(port=8888):
print('Serving http://localhost:%s/' % port)
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
@pytest.fixture
def replay_server():
httpd = make_server()
thread = threading.Thread(target=httpd.serve_forever)
thread.start()
yield
httpd.shutdown()
thread.join()
if __name__ == '__main__':
httpd = make_server()
httpd.serve_forever()

4
tests/samples/200-ok.txt Normal file
View File

@@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain
success

View File

@@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: /200-ok.txt

View File

@@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: ./200-ok.txt

View File

@@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: http://localhost:8888/200-ok.txt

View File

@@ -0,0 +1,4 @@
HTTP/1.1 308 Permanent Redirect
location: /200-ok.txt
/200-ok.txt

View File

@@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=gb2312
<EFBFBD>ɹ<EFBFBD>

View File

@@ -0,0 +1,10 @@
HTTP/1.1 200 OK
content-type: text/html
<!DOCTYPE html>
<html>
<head><meta charset="gb2312"/></head>
<body>
<EFBFBD>ɹ<EFBFBD>
</body></html>

View File

@@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=iso-8859-1
succ<EFBFBD>s

View File

@@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain
succ<EFBFBD>s

View File

@@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=UTF-8
succès

View File

@@ -0,0 +1,16 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<summary>!ITEM_DESC!</summary>
<content type="html">!ITEM_CONTENT!</content>
<link href="!ITEM_LINK!"/>
<updated>2022-01-01T00:00:01+01:00</updated>
<published>2022-01-01T00:00:02+01:00</published>
</entry>
</feed>

View File

@@ -0,0 +1,15 @@
HTTP/1.1 200 OK
content-type: application/xml
<?xml version='1.0' encoding='utf-8' ?>
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
<summary>!ITEM_DESC!</summary>
<content>!ITEM_CONTENT!</content>
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
</entry>
</feed>

View File

@@ -0,0 +1,22 @@
HTTP/1.1 200 OK
Content-Type: text/html; charset=utf-8
<html>
<head></head>
<body>
<div id="header">
<h1>!TITLE!</h1>
<p>!DESC!</p>
</div>
<div id="content">
<div class="item">
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
<div class="desc">!ITEM_DESC!</div>
<div class="content">!ITEM_CONTENT!</div>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,16 @@
HTTP/1.1 200 OK
Content-Type: application/json; charset=utf-8
{
"title": "!TITLE!",
"desc": "!DESC!",
"items": [
{
"title": "!ITEM_TITLE!",
"time": "2022-01-01T00:00:01+0100",
"url": "!ITEM_LINK!",
"desc": "!ITEM_DESC!",
"content": "!ITEM_CONTENT!"
}
]
}

View File

@@ -0,0 +1,17 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>!TITLE!</title>
<description>!DESC!</description>
<item>
<title>!ITEM_TITLE!</title>
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
<link>!ITEM_LINK!</link>
<description>!ITEM_DESC!</description>
<content:encoded>!ITEM_CONTENT!</content:encoded>
</item>
</channel>
</rss>

BIN
tests/samples/gzip.txt Normal file

Binary file not shown.

View File

@@ -0,0 +1,3 @@
HTTP/1.1 200 OK
refresh: 0;url=/200-ok.txt

View File

@@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

9220
tests/samples/size-1MiB.txt Normal file

File diff suppressed because it is too large

62
tests/test_crawler.py Normal file
View File

@@ -0,0 +1,62 @@
import pytest
from morss.crawler import *
def test_get(replay_server):
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
def test_adv_get(replay_server):
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
@pytest.mark.parametrize('before,after', [
(b'http://localhost:8888/', 'http://localhost:8888/'),
('localhost:8888/', 'http://localhost:8888/'),
('http:/localhost:8888/', 'http://localhost:8888/'),
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
assert sanitize_url(before) == after
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
def test_size_limit_handler(replay_server, opener):
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
def test_gzip_handler(replay_server, opener):
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
@pytest.mark.parametrize('url', [
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
out = adv_get('http://localhost:8888/%s' % url)
out = out['data'].decode(out['encoding'])
assert 'succes' in out or 'succès' in out or '成功' in out
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
def test_alternate_handler(replay_server, opener):
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
def test_http_equiv_handler(replay_server, opener):
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
def test_http_all_redirect_handler(replay_server, opener):
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
def test_http_refresh_handler(replay_server, opener):
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'

108
tests/test_feeds.py Normal file
View File

@@ -0,0 +1,108 @@
import pytest
from morss.crawler import adv_get
from morss.feeds import *
def get_feed(url):
url = 'http://localhost:8888/%s' % url
out = adv_get(url)
feed = parse(out['data'], url=url, encoding=out['encoding'])
return feed
def check_feed(feed):
# NB. time and updated not covered
assert feed.title == '!TITLE!'
assert feed.desc == '!DESC!'
assert feed[0] == feed.items[0]
assert feed[0].title == '!ITEM_TITLE!'
assert feed[0].link == '!ITEM_LINK!'
assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
assert '!ITEM_CONTENT!' in feed[0].content
def check_output(feed):
output = feed.tostring()
assert '!TITLE!' in output
assert '!DESC!' in output
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output
def check_change(feed):
feed.title = '!TITLE2!'
feed.desc = '!DESC2!'
feed[0].title = '!ITEM_TITLE2!'
feed[0].link = '!ITEM_LINK2!'
feed[0].desc = '!ITEM_DESC2!'
feed[0].content = '!ITEM_CONTENT2!'
assert feed.title == '!TITLE2!'
assert feed.desc == '!DESC2!'
assert feed[0].title == '!ITEM_TITLE2!'
assert feed[0].link == '!ITEM_LINK2!'
assert '!ITEM_DESC2!' in feed[0].desc
assert '!ITEM_CONTENT2!' in feed[0].content
def check_add(feed):
feed.append({
'title': '!ITEM_TITLE3!',
'link': '!ITEM_LINK3!',
'desc': '!ITEM_DESC3!',
'content': '!ITEM_CONTENT3!',
})
assert feed[1].title == '!ITEM_TITLE3!'
assert feed[1].link == '!ITEM_LINK3!'
assert '!ITEM_DESC3!' in feed[1].desc
assert '!ITEM_CONTENT3!' in feed[1].content
each_format = pytest.mark.parametrize('url', [
'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
])
each_check = pytest.mark.parametrize('check', [
check_feed, check_output, check_change, check_add,
])
@each_format
@each_check
def test_parse(replay_server, url, check):
feed = get_feed(url)
check(feed)
@each_format
@each_check
def test_convert_rss(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedXML)
check(feed)
@each_format
@each_check
def test_convert_json(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedJSON)
check(feed)
@each_format
@each_check
def test_convert_html(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedHTML)
if len(feed) > 1:
# remove the 'blank' default html item
del feed[0]
check(feed)
@each_format
def test_convert_csv(replay_server, url):
# only csv output, no csv feed parsing, therefore the check is different
feed = get_feed(url)
output = feed.tocsv()
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output

View File

@@ -1,15 +0,0 @@
Options -Indexes
ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
# Uncomment below line to turn debug on for all requests
#SetEnv DEBUG 1
# Uncomment below line to turn debug on for requests with :debug in the url
#SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>

View File

@@ -1,9 +0,0 @@
order allow,deny
deny from all
<Files main.py>
allow from all
AddHandler cgi-script .py
Options +ExecCGI
</Files>

View File

@@ -16,6 +16,7 @@
<title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
<meta name="robots" content="noindex" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css">
body * {
@@ -203,7 +204,9 @@
link of the
<select>
<option value="">first</option>
<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
<option value=":order=last">last</option>
<option value=":order=oldest">oldest</option>
</select>
items and
<select>