Compare commits: 920988ac74...master (203 commits)
78  .github/workflows/default.yml (vendored, new file)
@@ -0,0 +1,78 @@
|
||||
name: default
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
test-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Prepare image
|
||||
run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip3 install .[full] .[dev]
|
||||
- run: isort --check-only --diff .
|
||||
- run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
|
||||
- run: pytest --cov=morss tests
|
||||
|
||||
python-publish:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Prepare image
|
||||
run: apt-get -y update && apt-get -y install python3-pip python3-build
|
||||
|
||||
- name: Build package
|
||||
run: python3 -m build
|
||||
|
||||
- name: Publish package
|
||||
uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.pypi_api_token }}
|
||||
|
||||
docker-publish-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: catthehacker/ubuntu:act-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: https://github.com/docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: https://github.com/docker/setup-buildx-action@v2
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: https://github.com/docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.docker_user }}
|
||||
password: ${{ secrets.docker_pwd }}
|
||||
|
||||
- name: Build and push
|
||||
uses: https://github.com/docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7
|
||||
push: true
|
||||
tags: ${{ secrets.docker_repo }}
|
||||
|
||||
- name: Deploy on server
|
||||
uses: https://github.com/appleboy/ssh-action@v0.1.10
|
||||
with:
|
||||
host: ${{ secrets.ssh_host }}
|
||||
username: ${{ secrets.ssh_user }}
|
||||
key: ${{ secrets.ssh_key }}
|
||||
script: morss-update
|
50  .pylintrc (new file)
@@ -0,0 +1,50 @@
|
||||
[MASTER]
|
||||
ignore=CVS
|
||||
suggestion-mode=yes
|
||||
extension-pkg-allow-list=lxml.etree
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=missing-function-docstring,
|
||||
missing-class-docstring,
|
||||
missing-module-docstring,
|
||||
wrong-spelling-in-comment,
|
||||
|
||||
[REPORTS]
|
||||
reports=yes
|
||||
score=yes
|
||||
|
||||
[SPELLING]
|
||||
spelling-dict=en_GB
|
||||
spelling-ignore-words=morss
|
||||
|
||||
[STRING]
|
||||
check-quote-consistency=yes
|
||||
check-str-concat-over-line-jumps=yes
|
||||
|
||||
[VARIABLES]
|
||||
allow-global-unused-variables=no
|
||||
init-import=no
|
||||
|
||||
[FORMAT]
|
||||
expected-line-ending-format=LF
|
||||
indent-string=' '
|
||||
max-line-length=120
|
||||
max-module-lines=1000
|
||||
|
||||
[BASIC]
|
||||
argument-naming-style=snake_case
|
||||
attr-naming-style=snake_case
|
||||
class-attribute-naming-style=snake_case
|
||||
class-const-naming-style=UPPER_CASE
|
||||
class-naming-style=PascalCase
|
||||
const-naming-style=UPPER_CASE
|
||||
function-naming-style=snake_case
|
||||
inlinevar-naming-style=snake_case
|
||||
method-naming-style=snake_case
|
||||
module-naming-style=snake_case
|
||||
variable-naming-style=snake_case
|
||||
|
||||
include-naming-hint=yes
|
||||
|
||||
bad-names=foo, bar
|
||||
good-names=i, j, k
|
18  Dockerfile
@@ -1,8 +1,16 @@
|
||||
FROM alpine:latest
|
||||
|
||||
RUN apk add --no-cache python3 py3-lxml py3-gunicorn py3-pip py3-wheel git
|
||||
FROM alpine:edge
|
||||
|
||||
ADD . /app
|
||||
RUN pip3 install /app
|
||||
|
||||
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss
|
||||
RUN set -ex; \
|
||||
apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
|
||||
apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
|
||||
pip3 install --no-cache-dir /app[full]; \
|
||||
apk del .build-deps
|
||||
|
||||
USER 1000:1000
|
||||
|
||||
ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
|
||||
CMD ["run"]
|
||||
|
||||
HEALTHCHECK CMD /bin/sh /app/morss-helper check
|
||||
|
272  README.md
@@ -1,11 +1,14 @@
|
||||
# Morss - Get full-text RSS feeds
|
||||
|
||||
_GNU AGPLv3 code_
|
||||
_Provided logo is CC BY-NC-SA 4.0_
|
||||
[Homepage](https://morss.it/) •
|
||||
[Upstream source code](https://git.pictuga.com/pictuga/morss) •
|
||||
[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
|
||||
|
||||
Upstream source code: https://git.pictuga.com/pictuga/morss
|
||||
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
|
||||
Homepage: https://morss.it/
|
||||
[](https://ci.pictuga.com/pictuga/morss)
|
||||
[](https://github.com/pictuga/morss/stargazers)
|
||||
[](https://github.com/pictuga/morss/network/members)
|
||||
[](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
|
||||
[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
|
||||
|
||||
This tool's goal is to get full-text RSS feeds out of striped RSS feeds,
|
||||
commonly available on internet. Indeed most newspapers only make a small
|
||||
@@ -38,7 +41,7 @@ Some features of morss:
|
||||
- Follow 301/meta redirects
|
||||
- Recover xml feeds with corrupt encoding
|
||||
- Supports gzip-compressed http content
|
||||
- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
|
||||
- HTTP caching with different backends (in-memory/redis/diskcache)
|
||||
- Works as server/cli tool
|
||||
- Deobfuscate various tracking links
|
||||
|
||||
@@ -46,38 +49,79 @@ Some features of morss:
|
||||
|
||||
### Python package
|
||||
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://pypi.org/project/morss/)
|
||||
[](https://pypistats.org/packages/morss)
|
||||
|
||||
Simple install (without optional dependencies)
|
||||
|
||||
From pip
|
||||
|
||||
```shell
|
||||
pip install morss
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git
|
||||
```
|
||||
|
||||
Full installation (including optional dependencies)
|
||||
|
||||
From pip
|
||||
|
||||
```shell
|
||||
pip install morss[full]
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
|
||||
```
|
||||
|
||||
The full install includes all the cache backends. Otherwise, only in-memory
|
||||
cache is available. The full install also includes gunicorn (for more efficient
|
||||
HTTP handling).
|
||||
|
||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||
C code needs to be compiled). If possible on your distribution, try installing
|
||||
it with the system package manager.
|
||||
|
||||
Dependencies:
|
||||
|
||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
||||
- [lxml](http://lxml.de/) for xml parsing
|
||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
|
||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
||||
- [chardet](https://pypi.python.org/pypi/chardet)
|
||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
||||
- pymysql
|
||||
|
||||
You may also need:
|
||||
- Apache, with python-cgi support, to run on a server
|
||||
- a fast internet connection
|
||||
|
||||
### Docker
|
||||
|
||||
Build & run
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://hub.docker.com/r/pictuga/morss)
|
||||
[](https://hub.docker.com/r/pictuga/morss/tags)
|
||||
|
||||
From docker hub
|
||||
|
||||
With cli
|
||||
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git
|
||||
docker run -p 8080:8080 morss
|
||||
docker pull pictuga/morss
|
||||
```
|
||||
|
||||
With docker-compose:
|
||||
With docker-compose **(recommended)**
|
||||
|
||||
```yml
|
||||
services:
|
||||
app:
|
||||
image: pictuga/morss
|
||||
ports:
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Build from source
|
||||
|
||||
With cli
|
||||
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
|
||||
```
|
||||
|
||||
With docker-compose
|
||||
|
||||
```yml
|
||||
services:
|
||||
@@ -85,21 +129,54 @@ services:
|
||||
build: https://git.pictuga.com/pictuga/morss.git
|
||||
image: morss
|
||||
ports:
|
||||
- '8080:8080'
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Then execute
|
||||
|
||||
```shell
|
||||
docker-compose build
|
||||
docker-compose up
|
||||
docker-compose build --no-cache --pull
|
||||
```
|
||||
|
||||
To update:
|
||||
### Cloud providers
|
||||
|
||||
- To get the latest code from the git repository, add `--no-cache` to the build
|
||||
commands
|
||||
- To update the base image (`alpine:latest`), add `--pull` to the build commands
|
||||
One-click deployment:
|
||||
|
||||
[](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
|
||||
[](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
|
||||
|
||||
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
|
||||
|
||||
``` yml
|
||||
#cloud-config
|
||||
|
||||
packages:
|
||||
- python3-pip
|
||||
- python3-wheel
|
||||
- python3-lxml
|
||||
- python3-setproctitle
|
||||
- ca-certificates
|
||||
|
||||
write_files:
|
||||
- path: /etc/environment
|
||||
append: true
|
||||
content: |
|
||||
DEBUG=1
|
||||
CACHE=diskcache
|
||||
CACHE_SIZE=1073741824 # 1GiB
|
||||
- path: /var/lib/cloud/scripts/per-boot/morss.sh
|
||||
permissions: 744
|
||||
content: |
|
||||
#!/bin/sh
|
||||
/usr/local/bin/morss-helper daemon
|
||||
|
||||
runcmd:
|
||||
- source /etc/environment
|
||||
- update-ca-certificates
|
||||
- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
|
||||
- netfilter-persistent save
|
||||
- pip install morss[full]
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
@@ -120,14 +197,29 @@ For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
The `main.py` part is only needed if your server doesn't support the Apache
|
||||
redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny
|
||||
RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other
|
||||
clients.
|
||||
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
|
||||
other clients.
|
||||
|
||||
|
||||
#### Via Docker
|
||||
#### Using Docker
|
||||
|
||||
See above (in Install)
|
||||
From docker hub
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 pictuga/morss
|
||||
```
|
||||
|
||||
From source
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 morss
|
||||
```
|
||||
|
||||
With docker-compose **(recommended)**
|
||||
|
||||
```shell
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
#### Using Gunicorn
|
||||
|
||||
@@ -140,13 +232,13 @@ gunicorn --preload morss
|
||||
Running this command should do:
|
||||
|
||||
```shell
|
||||
uwsgi --http :8080 --plugin python --wsgi-file main.py
|
||||
uwsgi --http :8000 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
|
||||
The latter should start when you run morss without any argument, on port 8080.
|
||||
The latter should start when you run morss without any argument, on port 8000.
|
||||
I'd highly recommend you to use gunicorn or something similar for better
|
||||
performance.
|
||||
|
||||
@@ -184,8 +276,30 @@ For this, you need to make sure your host allows python script execution. This
|
||||
method uses HTTP calls to fetch the RSS feeds, which will be handled through
|
||||
`mod_cgi` for example on Apache severs.
|
||||
|
||||
Please pay attention to `main.py` permissions for it to be executable. Also
|
||||
ensure that the provided `/www/.htaccess` works well with your server.
|
||||
Please pay attention to `main.py` permissions for it to be executable. See below
|
||||
some tips for the `.htaccess` file.
|
||||
|
||||
```htaccess
|
||||
Options -Indexes
|
||||
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
|
||||
# Turn debug on for all requests
|
||||
SetEnv DEBUG 1
|
||||
|
||||
# Turn debug on for requests with :debug in the url
|
||||
SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
||||
```
|
||||
|
||||
### As a CLI application
|
||||
|
||||
@@ -199,6 +313,12 @@ For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
If using Docker:
|
||||
|
||||
```shell
|
||||
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
|
||||
```
|
||||
|
||||
### As a newsreader hook
|
||||
|
||||
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
|
||||
@@ -210,7 +330,7 @@ To use this script, you have to enable "(Unix) command" in liferea feed
|
||||
settings, and use the command:
|
||||
|
||||
```
|
||||
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
|
||||
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
|
||||
@@ -233,7 +353,7 @@ Using cache and passing arguments:
|
||||
```python
|
||||
>>> import morss
|
||||
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
||||
>>> cache = '/tmp/morss-cache.db' # sqlite cache location
|
||||
>>> cache = '/tmp/morss-cache' # diskcache cache location
|
||||
>>> options = {'csv':True}
|
||||
>>> xml_string = morss.process(url, cache, options)
|
||||
>>> xml_string[:50]
|
||||
@@ -247,11 +367,10 @@ under the hood.
|
||||
Doing it step-by-step:
|
||||
|
||||
```python
|
||||
import morss, morss.crawler
|
||||
import morss
|
||||
|
||||
url = 'http://newspaper.example/feed.xml'
|
||||
options = morss.Options(csv=True) # arguments
|
||||
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
|
||||
|
||||
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
||||
@@ -270,11 +389,13 @@ arguments to morss is explained in Run above.
|
||||
The list of arguments can be obtained by running `morss --help`
|
||||
|
||||
```
|
||||
usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
|
||||
[--resolve] [--items XPATH] [--item_link XPATH]
|
||||
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
|
||||
[--nolink] [--noref] [--silent]
|
||||
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy]
|
||||
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
|
||||
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
|
||||
[--item_content XPATH] [--item_time XPATH]
|
||||
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
|
||||
url
|
||||
|
||||
Get full-text RSS feeds
|
||||
@@ -282,8 +403,10 @@ Get full-text RSS feeds
|
||||
positional arguments:
|
||||
url feed url
|
||||
|
||||
optional arguments:
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--post STRING POST request
|
||||
--xpath XPATH xpath rule to manually detect the article
|
||||
|
||||
output:
|
||||
--format {rss,json,html,csv}
|
||||
@@ -299,8 +422,9 @@ action:
|
||||
articles' content), so as to save time
|
||||
--force force refetch the rss feed and articles
|
||||
--proxy doesn't fill the articles
|
||||
--newest return the feed items in chronological order (morss
|
||||
ohterwise shows the items by appearing order)
|
||||
--order {first,last,newest,oldest}
|
||||
order in which to process items (which are however NOT
|
||||
sorted in the output)
|
||||
--firstlink pull the first article mentioned in the description
|
||||
instead of the default link
|
||||
--resolve replace tracking links with direct links to articles
|
||||
@@ -315,6 +439,8 @@ custom feeds:
|
||||
--item_content XPATH entry's content
|
||||
--item_time XPATH entry's date & time (accepts a wide range of time
|
||||
formats)
|
||||
--mode {xml,html,json}
|
||||
parser to use for the custom feeds
|
||||
|
||||
misc:
|
||||
--nolink drop links, but keeps links' inner text
|
||||
@@ -336,31 +462,39 @@ servers)
|
||||
|
||||
To pass environment variables:
|
||||
|
||||
- Docker-cli: `docker run -p 8080:8080 morss --env KEY=value`
|
||||
- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
|
||||
- docker-compose: add an `environment:` section in the .yml file
|
||||
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
|
||||
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
|
||||
- cloud-init: in the `/etc/environment` file
|
||||
|
||||
Generic:
|
||||
|
||||
- `DEBUG=1`: to have some feedback from the script execution. Useful for
|
||||
debugging.
|
||||
- `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
|
||||
- `DELAY` sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `DATA_PATH`: to set custom file location for the `www` folder
|
||||
|
||||
When parsing long feeds, with a lot of items (100+), morss might take a lot of
|
||||
time to parse it, or might even run into a memory overflow on some shared
|
||||
hosting plans (limits around 10Mb), in which case you might want to adjust the
|
||||
below settings via environment variables.
|
||||
|
||||
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more
|
||||
time might be spent taking older articles from cache. `-1` for unlimited.
|
||||
Also, if the request takes too long to process, the http request might be
|
||||
discarded. See relevant config for
|
||||
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
|
||||
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
|
||||
|
||||
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
|
||||
articles, more time might be spent taking older articles from cache. `-1` for
|
||||
unlimited.
|
||||
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
|
||||
More articles will be taken from cache following the nexts settings.
|
||||
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether
|
||||
or not it's already cached). Articles beyond that limit will be dropped from the
|
||||
feed. `-1` for unlimited.
|
||||
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
|
||||
(whether or not it's already cached). Articles beyond that limit will be dropped
|
||||
from the feed. `-1` for unlimited.
|
||||
- `LIM_ITEM` sets the maximum number of article checked, limiting both the
|
||||
number of articles fetched and taken from cache. Articles beyond that limit will
|
||||
be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
@@ -368,19 +502,21 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
morss uses caching to make loading faster. There are 3 possible cache backends:
|
||||
|
||||
- `(nothing/default)`: a simple python in-memory dict-like object.
|
||||
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
|
||||
will be cleared every time the program is run). Path can be defined with
|
||||
`SQLITE_PATH`.
|
||||
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
||||
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
||||
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||
- `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
|
||||
`DISKCACHE_DIR`.
|
||||
|
||||
To limit the size of the cache:
|
||||
|
||||
- `CACHE_SIZE` sets the target number of items in the cache (further items will
|
||||
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
|
||||
entries.
|
||||
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
|
||||
the number of items set in `CACHE_SIZE`). Defaults to 1min.
|
||||
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
|
||||
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
|
||||
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
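
The cache settings above are plain environment variables read by morss at import time (see the new `morss/caching.py` later in this diff). A minimal sketch of selecting the `diskcache` backend from Python; the directory and size are arbitrary example values:

```python
import os

# must be set before importing morss: morss/caching.py reads CACHE, CACHE_SIZE
# and DISKCACHE_DIR at import time
os.environ['CACHE'] = 'diskcache'
os.environ['DISKCACHE_DIR'] = '/tmp/morss-diskcache'   # any writable directory
os.environ['CACHE_SIZE'] = str(1024 * 1024 * 1024)     # 1 GiB, in bytes for diskcache

import morss

xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
```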
|
||||
|
||||
Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
|
||||
environment variable.
|
||||
|
||||
### Content matching
|
||||
|
||||
|
21  app.json (new file)
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"stack": "container",
|
||||
"env": {
|
||||
"DEBUG": {
|
||||
"value": 1,
|
||||
"required": false
|
||||
},
|
||||
"GUNICORN_CMD_ARGS": {
|
||||
"value": "",
|
||||
"required": false
|
||||
},
|
||||
"CACHE": {
|
||||
"value": "diskcache",
|
||||
"required": false
|
||||
},
|
||||
"CACHE_SIZE": {
|
||||
"value": 1073741824,
|
||||
"required": false
|
||||
}
|
||||
}
|
||||
}
|
3  heroku.yml (new file)
@@ -0,0 +1,3 @@
|
||||
build:
|
||||
docker:
|
||||
web: Dockerfile
|
47  morss-helper (new executable file)
@@ -0,0 +1,47 @@
|
||||
#! /bin/sh
|
||||
set -ex
|
||||
|
||||
if ! command -v python && command -v python3 ; then
|
||||
alias python='python3'
|
||||
fi
|
||||
|
||||
run() {
|
||||
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
|
||||
}
|
||||
|
||||
daemon() {
|
||||
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
|
||||
}
|
||||
|
||||
reload() {
|
||||
pid=$(pidof 'gunicorn: master [morss]' || true)
|
||||
# NB. requires python-setproctitle
|
||||
# `|| true` due to `set -e`
|
||||
|
||||
if [ -z "$pid" ]; then
|
||||
# if gunicorn is not currently running
|
||||
daemon
|
||||
|
||||
else
|
||||
kill -s USR2 $pid
|
||||
kill -s WINCH $pid
|
||||
sleep 1 # give gunicorn some time to reload
|
||||
kill -s TERM $pid
|
||||
|
||||
fi
|
||||
}
|
||||
|
||||
check() {
|
||||
python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
|
||||
}
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
run
|
||||
|
||||
elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
|
||||
$@
|
||||
|
||||
else
|
||||
python -m morss $@
|
||||
|
||||
fi
|
13  morss.service (new file)
@@ -0,0 +1,13 @@
|
||||
[Unit]
|
||||
Description=morss server (gunicorn)
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/local/bin/morss-helper run
|
||||
ExecReload=/usr/local/bin/morss-helper reload
|
||||
KillMode=process
|
||||
Restart=always
|
||||
User=http
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
morss/__init__.py
@@ -16,5 +16,10 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
# ran on `import morss`
|
||||
|
||||
# pylint: disable=unused-import,unused-variable
|
||||
|
||||
__version__ = ""
|
||||
|
||||
from .morss import *
|
||||
from .wsgi import application
|
||||
|
morss/__main__.py
@@ -20,9 +20,7 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from . import wsgi
|
||||
from . import cli
|
||||
|
||||
from . import cli, wsgi
|
||||
from .morss import MorssException
|
||||
|
||||
|
||||
|
122  morss/caching.py (new file)
@@ -0,0 +1,122 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, data)
|
||||
|
||||
|
||||
try:
|
||||
import redis # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class RedisCacheHandler(BaseCache):
|
||||
def __init__(self, host='localhost', port=6379, db=0, password=None):
|
||||
self.r = redis.Redis(host=host, port=port, db=db, password=password)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.r.get(key)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.r.set(key, data)
|
||||
|
||||
|
||||
try:
|
||||
import diskcache # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class DiskCacheHandler(BaseCache):
|
||||
def __init__(self, directory=None, **kwargs):
|
||||
self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)
|
||||
|
||||
def __del__(self):
|
||||
self.cache.close()
|
||||
|
||||
def trim(self):
|
||||
self.cache.cull()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.cache[key]
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.cache.set(key, data)
|
||||
|
||||
|
||||
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'redis':
|
||||
default_cache = RedisCacheHandler(
|
||||
host = os.getenv('REDIS_HOST', 'localhost'),
|
||||
port = int(os.getenv('REDIS_PORT', 6379)),
|
||||
db = int(os.getenv('REDIS_DB', 0)),
|
||||
password = os.getenv('REDIS_PWD', None)
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'diskcache':
|
||||
default_cache = DiskCacheHandler(
|
||||
directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
|
||||
size_limit = CACHE_SIZE # in Bytes
|
||||
)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
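
All the classes above share the dict-like contract defined by `BaseCache`, and `CacheHandler` in `morss/crawler.py` accepts any of them via its `cache` argument. A short illustrative sketch (not part of the diff); `DiskCacheHandler` needs the optional `diskcache` package:

```python
from urllib.request import build_opener

from morss.caching import CappedDict, DiskCacheHandler
from morss.crawler import CacheHandler

# dict-like contract: __getitem__/__setitem__ plus trim()
cache = CappedDict()
cache['http://example.com/feed.xml'] = b'opaque blob'  # CacheHandler stores pickled responses
cache.trim()  # evicts the oldest entries once CACHE_SIZE is exceeded

# a specific backend can also be handed straight to the crawler's cache handler
opener = build_opener(CacheHandler(cache=DiskCacheHandler('/tmp/morss-diskcache'), policy='cached'))
```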
13  morss/cli.py
@@ -15,12 +15,11 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
import argparse
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
||||
from .morss import Options
|
||||
from .morss import FeedFetch, FeedFormat, FeedGather, Options
|
||||
|
||||
|
||||
def cli_app():
|
||||
@@ -32,6 +31,9 @@ def cli_app():
|
||||
|
||||
parser.add_argument('url', help='feed url')
|
||||
|
||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||
|
||||
group = parser.add_argument_group('output')
|
||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
|
||||
@@ -42,7 +44,7 @@ def cli_app():
|
||||
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
|
||||
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
|
||||
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
|
||||
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss ohterwise shows the items by appearing order)')
|
||||
group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
|
||||
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
|
||||
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
|
||||
|
||||
@@ -52,6 +54,7 @@ def cli_app():
|
||||
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
||||
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
||||
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
||||
group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
|
||||
|
||||
group = parser.add_argument_group('misc')
|
||||
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
||||
|
717  morss/crawler.py
@@ -16,31 +16,37 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import zlib
|
||||
from io import BytesIO, StringIO
|
||||
import re
|
||||
import chardet
|
||||
from cgi import parse_header
|
||||
import lxml.html
|
||||
import time
|
||||
import threading
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import zlib
|
||||
from cgi import parse_header
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import chardet
|
||||
|
||||
from .caching import default_cache
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib import quote
|
||||
from urlparse import urlparse, urlunparse
|
||||
import mimetools
|
||||
|
||||
from httplib import HTTPMessage
|
||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||
Request, addinfourl, build_opener, parse_http_list,
|
||||
parse_keqv_list)
|
||||
from urlparse import urlsplit
|
||||
except ImportError:
|
||||
# python 3
|
||||
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib.parse import quote
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import email
|
||||
from email import message_from_string
|
||||
from http.client import HTTPMessage
|
||||
from urllib.parse import quote, urlsplit
|
||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||
HTTPRedirectHandler, Request, addinfourl,
|
||||
build_opener, parse_http_list, parse_keqv_list)
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -50,14 +56,12 @@ except NameError:
|
||||
basestring = unicode = str
|
||||
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
MIMETYPE = {
|
||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
|
||||
'json': ['application/json'],
|
||||
}
|
||||
|
||||
|
||||
DEFAULT_UAS = [
|
||||
@@ -82,14 +86,17 @@ def get(*args, **kwargs):
|
||||
return adv_get(*args, **kwargs)['data']
|
||||
|
||||
|
||||
def adv_get(url, timeout=None, *args, **kwargs):
|
||||
def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
url = sanitize_url(url)
|
||||
|
||||
if post is not None:
|
||||
post = post.encode('utf-8')
|
||||
|
||||
if timeout is None:
|
||||
con = custom_handler(*args, **kwargs).open(url)
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post)
|
||||
|
||||
else:
|
||||
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
|
||||
|
||||
data = con.read()
|
||||
|
||||
@@ -97,7 +104,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||
encoding= detect_encoding(data, con)
|
||||
|
||||
return {
|
||||
'data':data,
|
||||
'data': data,
|
||||
'url': con.geturl(),
|
||||
'con': con,
|
||||
'contenttype': contenttype,
|
||||
@@ -105,9 +112,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||
}
|
||||
|
||||
|
||||
def custom_handler(follow=None, delay=None, encoding=None):
|
||||
handlers = []
|
||||
|
||||
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||
# as per urllib2 source code, these Handelers are added first
|
||||
# *unless* one of the custom handlers inherits from one of them
|
||||
#
|
||||
@@ -115,21 +120,33 @@ def custom_handler(follow=None, delay=None, encoding=None):
|
||||
# HTTPDefaultErrorHandler, HTTPRedirectHandler,
|
||||
# FTPHandler, FileHandler, HTTPErrorProcessor]
|
||||
# & HTTPSHandler
|
||||
#
|
||||
# when processing a request:
|
||||
# (1) all the *_request are run
|
||||
# (2) the *_open are run until sth is returned (other than None)
|
||||
# (3) all the *_response are run
|
||||
#
|
||||
# During (3), if an http error occurs (i.e. not a 2XX response code), the
|
||||
# http_error_* are run until sth is returned (other than None). If they all
|
||||
# return nothing, a python error is raised
|
||||
|
||||
#handlers.append(DebugHandler())
|
||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||
handlers.append(HTTPCookieProcessor())
|
||||
handlers.append(GZIPHandler())
|
||||
handlers.append(HTTPEquivHandler())
|
||||
handlers.append(HTTPRefreshHandler())
|
||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||
handlers.append(BrowserlyHeaderHandler())
|
||||
handlers.append(EncodingFixHandler(encoding))
|
||||
handlers = [
|
||||
#DebugHandler(),
|
||||
SizeLimitHandler(500*1024), # 500KiB
|
||||
HTTPCookieProcessor(),
|
||||
GZIPHandler(),
|
||||
HTTPAllRedirectHandler(),
|
||||
HTTPEquivHandler(),
|
||||
HTTPRefreshHandler(),
|
||||
UAHandler(random.choice(DEFAULT_UAS)),
|
||||
BrowserlyHeaderHandler(),
|
||||
EncodingFixHandler(),
|
||||
]
|
||||
|
||||
if follow:
|
||||
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
||||
|
||||
handlers.append(CacheHandler(force_min=delay))
|
||||
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
|
||||
|
||||
return build_opener(*handlers)
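
`adv_get()` above builds this opener and forwards its keyword arguments to it, so the new `post`, `policy`, `force_min` and `force_max` parameters can be passed directly. A hedged usage sketch based on the signatures in this hunk; the POST endpoint is an arbitrary example:

```python
from morss.crawler import adv_get

# adv_get() returns the dict assembled in the hunk above
page = adv_get('https://morss.it/', timeout=10)
print(page['url'], page['contenttype'], len(page['data']))

# the new `post` parameter is utf-8 encoded and sent as the request body
reply = adv_get('https://httpbin.org/post', post='a=1&b=2', timeout=10)
```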
|
||||
|
||||
@@ -146,10 +163,20 @@ def is_ascii(string):
|
||||
return True
|
||||
|
||||
|
||||
def soft_quote(string):
|
||||
" url-quote only when not a valid ascii string "
|
||||
|
||||
if is_ascii(string):
|
||||
return string
|
||||
|
||||
else:
|
||||
return quote(string.encode('utf-8'))
|
||||
|
||||
|
||||
def sanitize_url(url):
|
||||
# make sure the url is unicode, i.e. not bytes
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode()
|
||||
url = url.decode('utf-8')
|
||||
|
||||
# make sure there's a protocol (http://)
|
||||
if url.split(':', 1)[0] not in PROTOCOL:
|
||||
@@ -162,18 +189,64 @@ def sanitize_url(url):
|
||||
url = url.replace(' ', '%20')
|
||||
|
||||
# escape non-ascii unicode characters
|
||||
# https://stackoverflow.com/a/4391299
|
||||
parts = list(urlparse(url))
|
||||
parts = urlsplit(url)
|
||||
|
||||
for i in range(len(parts)):
|
||||
if not is_ascii(parts[i]):
|
||||
if i == 1:
|
||||
parts[i] = parts[i].encode('idna').decode('ascii')
|
||||
parts = parts._replace(
|
||||
netloc=parts.netloc.replace(
|
||||
parts.hostname,
|
||||
parts.hostname.encode('idna').decode('ascii')
|
||||
),
|
||||
path=soft_quote(parts.path),
|
||||
query=soft_quote(parts.query),
|
||||
fragment=soft_quote(parts.fragment),
|
||||
)
|
||||
|
||||
else:
|
||||
parts[i] = quote(parts[i].encode('utf-8'))
|
||||
return parts.geturl()
|
||||
|
||||
return urlunparse(parts)
|
||||
|
||||
class RespDataHandler(BaseHandler):
|
||||
" Make it easier to use the reponse body "
|
||||
|
||||
def data_reponse(self, req, resp, data):
|
||||
pass
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# read data
|
||||
data = resp.read()
|
||||
|
||||
# process data and use returned content (if any)
|
||||
data = self.data_response(req, resp, data) or data
|
||||
|
||||
# reformat the stuff
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class RespStrHandler(RespDataHandler):
|
||||
" Make it easier to use the _decoded_ reponse body "
|
||||
|
||||
def str_reponse(self, req, resp, data_str):
|
||||
pass
|
||||
|
||||
def data_response(self, req, resp, data):
|
||||
#decode
|
||||
enc = detect_encoding(data, resp)
|
||||
data_str = data.decode(enc, 'replace')
|
||||
|
||||
#process
|
||||
data_str = self.str_response(req, resp, data_str)
|
||||
|
||||
# return
|
||||
data = data_str.encode(enc) if data_str is not None else data
|
||||
|
||||
#return
|
||||
return data
|
||||
|
||||
|
||||
class DebugHandler(BaseHandler):
|
||||
@@ -196,7 +269,7 @@ class SizeLimitHandler(BaseHandler):
|
||||
|
||||
handler_order = 450
|
||||
|
||||
def __init__(self, limit=5*1024^2):
|
||||
def __init__(self, limit=5*1024**2):
|
||||
self.limit = limit
|
||||
|
||||
def http_response(self, req, resp):
|
||||
@@ -217,29 +290,17 @@ def UnGzip(data):
|
||||
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
||||
|
||||
|
||||
class GZIPHandler(BaseHandler):
|
||||
class GZIPHandler(RespDataHandler):
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||
return req
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def data_response(self, req, resp, data):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||
data = resp.read()
|
||||
|
||||
data = UnGzip(data)
|
||||
|
||||
resp.headers['Content-Encoding'] = 'identity'
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
https_request = http_request
|
||||
return UnGzip(data)
|
||||
|
||||
|
||||
def detect_encoding(data, resp=None):
|
||||
@@ -276,28 +337,9 @@ def detect_raw_encoding(data, resp=None):
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
class EncodingFixHandler(BaseHandler):
|
||||
def __init__(self, encoding=None):
|
||||
self.encoding = encoding
|
||||
|
||||
def http_response(self, req, resp):
|
||||
maintype = resp.info().get('Content-Type', '').split('/')[0]
|
||||
if 200 <= resp.code < 300 and maintype == 'text':
|
||||
data = resp.read()
|
||||
|
||||
enc = self.encoding or detect_encoding(data, resp)
|
||||
|
||||
data = data.decode(enc, 'replace')
|
||||
data = data.encode(enc)
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class EncodingFixHandler(RespStrHandler):
|
||||
def str_response(self, req, resp, data_str):
|
||||
return data_str
|
||||
|
||||
|
||||
class UAHandler(BaseHandler):
|
||||
@@ -323,71 +365,58 @@ class BrowserlyHeaderHandler(BaseHandler):
|
||||
https_request = http_request
|
||||
|
||||
|
||||
class AlternateHandler(BaseHandler):
|
||||
def iter_html_tag(html_str, tag_name):
|
||||
" To avoid parsing whole pages when looking for a simple tag "
|
||||
|
||||
re_tag = r'<%s\s+[^>]+>' % tag_name
|
||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||
|
||||
for tag_match in re.finditer(re_tag, html_str):
|
||||
attr_match = re.findall(re_attr, tag_match.group(0))
|
||||
|
||||
if attr_match is not None:
|
||||
yield dict(attr_match)
|
||||
|
||||
|
||||
class AlternateHandler(RespStrHandler):
|
||||
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
|
||||
|
||||
def __init__(self, follow=None):
|
||||
self.follow = follow or []
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
|
||||
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
|
||||
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
|
||||
|
||||
data = resp.read()
|
||||
|
||||
try:
|
||||
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
|
||||
|
||||
for link in links:
|
||||
if link.get('type', '') in self.follow:
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
break
|
||||
|
||||
except (ValueError, SyntaxError):
|
||||
# catch parsing errors
|
||||
pass
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
for link in iter_html_tag(data_str[:10000], 'link'):
|
||||
if (link.get('rel') == 'alternate'
|
||||
and link.get('type') in self.follow
|
||||
and 'href' in link):
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
break
|
||||
|
||||
|
||||
class HTTPEquivHandler(BaseHandler):
|
||||
class HTTPEquivHandler(RespStrHandler):
|
||||
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
|
||||
|
||||
handler_order = 600
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
|
||||
data = resp.read()
|
||||
|
||||
try:
|
||||
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
|
||||
for meta in iter_html_tag(data_str[:10000], 'meta'):
|
||||
if 'http-equiv' in meta and 'content' in meta:
|
||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||
|
||||
for header in headers:
|
||||
resp.headers[header.get('http-equiv').lower()] = header.get('content')
|
||||
|
||||
except (ValueError, SyntaxError):
|
||||
# catch parsing errors
|
||||
pass
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class HTTPAllRedirectHandler(HTTPRedirectHandler):
|
||||
def http_error_308(self, req, fp, code, msg, headers):
|
||||
return self.http_error_301(req, fp, 301, msg, headers)
|
||||
|
||||
|
||||
class HTTPRefreshHandler(BaseHandler):
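
The new `iter_html_tag()` helper above replaces the lxml-based lookups previously used in `AlternateHandler` and `HTTPEquivHandler`. A quick sketch of what it yields, assuming it is imported from `morss.crawler` as defined in this diff:

```python
from morss.crawler import iter_html_tag

html = '<head><link rel="alternate" type="application/rss+xml" href="/feed.xml"></head>'

for attrs in iter_html_tag(html, 'link'):
    # each matching tag comes back as a plain dict of its attributes
    print(attrs)  # {'rel': 'alternate', 'type': 'application/rss+xml', 'href': '/feed.xml'}
```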
|
||||
@@ -396,7 +425,7 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
def http_response(self, req, resp):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('refresh'):
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
|
||||
match = re.search(regex, resp.headers.get('refresh'))
|
||||
|
||||
if match:
|
||||
@@ -412,56 +441,124 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
https_response = http_response
|
||||
|
||||
|
||||
def parse_headers(text=u'\n\n'):
|
||||
if sys.version_info[0] >= 3:
|
||||
# python 3
|
||||
return message_from_string(text, _class=HTTPMessage)
|
||||
|
||||
else:
|
||||
# python 2
|
||||
return HTTPMessage(StringIO(text))
|
||||
|
||||
|
||||
def error_response(code, msg, url=''):
|
||||
# return an error as a response
|
||||
resp = addinfourl(BytesIO(), parse_headers(), url, code)
|
||||
resp.msg = msg
|
||||
return resp
|
||||
|
||||
|
||||
class CacheHandler(BaseHandler):
|
||||
" Cache based on etags/last-modified "
|
||||
|
||||
private_cache = False # Websites can indicate whether the page should be
|
||||
# cached by CDNs (e.g. shouldn't be the case for
|
||||
# private/confidential/user-specific pages.
|
||||
# With this setting, decide whether (False) you want
|
||||
# the cache to behave like a CDN (i.e. don't cache
|
||||
# private pages), or (True) to behave like a end-cache
|
||||
# private pages. If unsure, False is the safest bet.
|
||||
privacy = 'private' # Websites can indicate whether the page should be cached
|
||||
# by CDNs (e.g. shouldn't be the case for
|
||||
# private/confidential/user-specific pages. With this
|
||||
# setting, decide whether you want the cache to behave
|
||||
# like a CDN (i.e. don't cache private pages, 'public'),
|
||||
# or to behave like a end-user private pages
|
||||
# ('private'). If unsure, 'public' is the safest bet,
|
||||
# but many websites abuse this feature...
|
||||
|
||||
# NB. This overrides all the other min/max/policy settings.
|
||||
handler_order = 499
|
||||
|
||||
def __init__(self, cache=None, force_min=None):
|
||||
def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
|
||||
self.cache = cache or default_cache
|
||||
self.force_min = force_min
|
||||
# Servers indicate how long they think their content is "valid".
|
||||
# With this parameter (force_min, expressed in seconds), we can
|
||||
# override the validity period (i.e. bypassing http headers)
|
||||
# Special values:
|
||||
# -1: valid forever, i.e. use the cache no matter what (and fetch
|
||||
# the page online if not present in cache)
|
||||
# 0: valid zero second, i.e. force refresh
|
||||
# -2: same as -1, i.e. use the cache no matter what, but do NOT
|
||||
# fetch the page online if not present in cache, throw an
|
||||
# error instead
|
||||
self.force_max = force_max
|
||||
self.policy = policy # can be cached/refresh/offline/None (default)
|
||||
|
||||
# Servers indicate how long they think their content is "valid". With
|
||||
# this parameter (force_min/max, expressed in seconds), we can override
|
||||
# the validity period (i.e. bypassing http headers)
|
||||
# Special choices, via "policy":
|
||||
# cached: use the cache no matter what (and fetch the page online if
|
||||
# not present in cache)
|
||||
# refresh: valid zero second, i.e. force refresh
|
||||
# offline: same as cached, i.e. use the cache no matter what, but do
|
||||
# NOT fetch the page online if not present in cache, throw an
|
||||
# error instead
|
||||
# None: just follow protocols
|
||||
|
||||
# sanity checks
|
||||
assert self.force_max is None or self.force_max >= 0
|
||||
assert self.force_min is None or self.force_min >= 0
|
||||
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
|
||||
|
||||
def load(self, url):
|
||||
try:
|
||||
out = list(self.cache[url])
|
||||
data = pickle.loads(self.cache[url])
|
||||
|
||||
except KeyError:
|
||||
out = [None, None, unicode(), bytes(), 0]
|
||||
data = None
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
out[2] = email.message_from_string(out[2] or unicode()) # headers
|
||||
else:
|
||||
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
|
||||
data['headers'] = parse_headers(data['headers'] or unicode())
|
||||
|
||||
return out
|
||||
return data
|
||||
|
||||
def save(self, url, code, msg, headers, data, timestamp):
|
||||
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
|
||||
def save(self, key, data):
|
||||
data['headers'] = unicode(data['headers'])
|
||||
self.cache[key] = pickle.dumps(data, 0)
|
||||
|
||||
def cached_response(self, req, fallback=None):
|
||||
req.from_morss_cache = True
|
||||
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if data is not None:
|
||||
# return the cache as a response
|
||||
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
|
||||
resp.msg = data['msg']
|
||||
return resp
|
||||
|
||||
else:
|
||||
return fallback
|
||||
|
||||
def save_response(self, req, resp):
|
||||
if req.from_morss_cache:
|
||||
# do not re-save (would reset the timing)
|
||||
return resp
|
||||
|
||||
data = resp.read()
|
||||
|
||||
self.save(req.get_full_url(), {
|
||||
'code': resp.code,
|
||||
'msg': resp.msg,
|
||||
'headers': resp.headers,
|
||||
'data': data,
|
||||
'timestamp': time.time()
|
||||
})
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
def http_request(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
req.from_morss_cache = False # to track whether it comes from cache
|
||||
|
||||
if 'etag' in headers:
|
||||
req.add_unredirected_header('If-None-Match', headers['etag'])
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if 'last-modified' in headers:
|
||||
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
|
||||
if data is not None:
|
||||
if 'etag' in data['headers']:
|
||||
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
|
||||
|
||||
if 'last-modified' in data['headers']:
|
||||
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
|
||||
|
||||
return req
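
The `policy` values documented in the comments above (`cached`, `refresh`, `offline`, or `None`) are the ones `custom_opener()` and `adv_get()` accept. An illustrative sketch, not taken from the repository:

```python
from morss.crawler import adv_get

url = 'http://feeds.bbci.co.uk/news/rss.xml'

adv_get(url, policy='refresh')  # ignore any cached copy and refetch
adv_get(url, policy='cached')   # use the cache if the url is there, fetch otherwise
adv_get(url, policy='offline')  # cache only: an HTTP 409 error surfaces when the
                                # url was never cached (see error_response above)
adv_get(url)                    # policy=None: just follow the http caching headers
```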
|
||||
|
||||
@@ -470,275 +567,111 @@ class CacheHandler(BaseHandler):
|
||||
# If 'None' is returned, try your chance with the next-available handler
|
||||
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
||||
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
# Here, we try to see whether we want to use data from cache (i.e.
|
||||
# return 'resp'), or whether we want to refresh the content (return
|
||||
# 'None')
|
||||
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(headers.get('pragma', ()))
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
if data is not None:
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
|
||||
cache_control += parse_http_list(data['headers'].get('pragma', ()))
|
||||
|
||||
cache_age = time.time() - timestamp
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
# list in a simple way what to do when
|
||||
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
|
||||
# we're just in the middle of a dirty trick, use cache
|
||||
pass
|
||||
cache_age = time.time() - data['timestamp']
|
||||
|
||||
elif self.force_min == -2:
|
||||
if code is not None:
|
||||
# already in cache, perfect, use cache
|
||||
pass
|
||||
# list in a simple way what to do in special cases
|
||||
|
||||
else:
|
||||
# raise an error, via urllib handlers
|
||||
headers['Morss'] = 'from_cache'
|
||||
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
|
||||
resp.msg = 'Conflict'
|
||||
return resp
|
||||
|
||||
elif code is None:
|
||||
# cache empty, refresh
|
||||
if data is not None and 'private' in cc_list and self.privacy == 'public':
|
||||
# private data but public cache, do not use cache
|
||||
# privacy concern, so handled first and foremost
|
||||
# (and doesn't need to be addressed anymore afterwards)
|
||||
return None
|
||||
|
||||
elif self.force_min == -1:
|
||||
# force use cache
|
||||
pass
|
||||
elif self.policy == 'offline':
|
||||
# use cache, or return an error
|
||||
return self.cached_response(
|
||||
req,
|
||||
error_response(409, 'Conflict', req.get_full_url())
|
||||
)
|
||||
|
||||
elif self.force_min == 0:
|
||||
elif self.policy == 'cached':
|
||||
# use cache, or fetch online
|
||||
return self.cached_response(req, None)
|
||||
|
||||
elif self.policy == 'refresh':
|
||||
# force refresh
|
||||
return None
|
||||
|
||||
elif code == 301 and cache_age < 7*24*3600:
|
||||
elif data is None:
|
||||
# we have already settled all the cases that don't need the cache.
|
||||
# all the following ones need the cached item
|
||||
return None
|
||||
|
||||
elif self.force_max is not None and cache_age > self.force_max:
|
||||
# older than we want, refresh
|
||||
return None
|
||||
|
||||
elif self.force_min is not None and cache_age < self.force_min:
|
||||
# recent enough, use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif data['code'] == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want
|
||||
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
||||
# if you want to bypass this (needed for a proper refresh)
|
||||
pass
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min is None and ('no-cache' in cc_list
|
||||
or 'no-store' in cc_list
|
||||
or ('private' in cc_list and not self.private_cache)):
|
||||
# kindly follow web servers indications, refresh
|
||||
# if the same settings are used all along, this section shouldn't be
|
||||
# of any use, since the page wouldn't be cached in the first place
|
||||
# the check is only performed "just in case"
|
||||
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
|
||||
# kindly follow web servers indications, refresh if the same
|
||||
# settings are used all along, this section shouldn't be of any use,
|
||||
# since the page wouldn't be cached in the first place the check is
|
||||
# only performed "just in case"
|
||||
# NB. NOT respected if force_min is set
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust him, if not, use force_min=0), use cache
|
||||
pass
|
||||
|
||||
elif self.force_min is not None and self.force_min > cache_age:
|
||||
# still recent enough for us, use cache
|
||||
pass
|
||||
# server says it's still fine (and we trust him, if not, use overrides), use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
else:
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
|
||||
# return the cache as a response. This code is reached with 'pass' above
|
||||
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
|
||||
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
|
||||
resp.msg = msg
|
||||
|
||||
return resp
|
||||
|
||||
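For illustration, a minimal sketch of how the policy branches above are exercised; CacheHandler(policy=...) is an assumption inferred from the self.policy checks, the actual constructor is outside this hunk.

from urllib.request import build_opener

opener = build_opener(CacheHandler(policy='cached'))  # assumed keyword, see note above
resp = opener.open('http://example.com/feed')         # served from cache when present, fetched otherwise
# policy='offline' would answer 409 Conflict instead of fetching when the URL is not cached,
# and policy='refresh' always goes back to the network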
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
|
||||
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
|
||||
|
||||
if resp.code == 304:
|
||||
return resp
|
||||
if resp.code == 304 and resp.url in self.cache:
|
||||
# we are hopefully the first after the HTTP handler, so no need
|
||||
# to re-run all the *_response
|
||||
# here: cached page, returning from cache
|
||||
return self.cached_response(req)
|
||||
|
||||
if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
|
||||
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
|
||||
# kindly follow web servers indications
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
|
||||
# kindly follow web servers indications (do not save & return)
|
||||
return resp
|
||||
|
||||
if resp.headers.get('Morss') == 'from_cache':
|
||||
# it comes from cache, so no need to save it again
|
||||
return resp
|
||||
else:
|
||||
# save
|
||||
return self.save_response(req, resp)
|
||||
|
||||
# save to disk
|
||||
data = resp.read()
|
||||
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
|
||||
|
||||
# the below is only needed because of 'resp.read()' above, as we can't
|
||||
# seek(0) on arbitrary file-like objects (e.g. sockets)
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
def http_error_304(self, req, fp, code, msg, headers):
|
||||
cache = list(self.load(req.get_full_url()))
|
||||
|
||||
if cache[0]:
|
||||
cache[-1] = time.time()
|
||||
self.save(req.get_full_url(), *cache)
|
||||
|
||||
new = Request(req.get_full_url(),
|
||||
headers=req.headers,
|
||||
unverifiable=True)
|
||||
|
||||
new.add_unredirected_header('Morss', 'from_304')
|
||||
# create a "fake" new request to just re-run through the various
|
||||
# handlers
|
||||
|
||||
return self.parent.open(new, timeout=req.timeout)
|
||||
|
||||
return None # when returning 'None', the next-available handler is used
|
||||
# the 'HTTPRedirectHandler' has no 'handler_order', i.e.
|
||||
# uses the default of 500, therefore executed after this
|
||||
else:
|
||||
return self.save_response(req, resp)
|
||||
|
||||
https_request = http_request
|
||||
https_open = http_open
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
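For illustration, the smallest cache that honours the "behave like a dict" contract above: __getitem__ must raise KeyError on a miss, which plain dict already does, so __contains__ and trim()/autotrim() from BaseCache work unchanged (a sketch, not taken from the diff).

class DictCache(BaseCache, dict):
    pass

cache = DictCache()
cache['http://example.com/'] = b'...'
'http://example.com/' in cache  # True, via BaseCache.__contains__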
import sqlite3
|
||||
|
||||
|
||||
class SQLiteCache(BaseCache):
|
||||
def __init__(self, filename=':memory:'):
|
||||
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
|
||||
self.con.execute('pragma journal_mode=WAL')
|
||||
|
||||
self.trim()
|
||||
|
||||
def __del__(self):
|
||||
self.con.close()
|
||||
|
||||
def trim(self):
|
||||
with self.con:
|
||||
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, url):
|
||||
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
|
||||
value = list(value)
|
||||
value[3] = sqlite3.Binary(value[3]) # data
|
||||
value = tuple(value)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
|
||||
|
||||
|
||||
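A usage sketch for the class above, keeping the (code, msg, headers, data, timestamp) tuple layout shown in __setitem__; the file path is hypothetical.

cache = SQLiteCache('/tmp/morss-cache.db')
cache['http://example.com/'] = (200, 'OK', 'content-type: text/plain', b'hello', int(time.time()))
code, msg, headers, data, timestamp = cache['http://example.com/']  # raises KeyError for unknown URLs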
import pymysql.cursors
|
||||
|
||||
|
||||
class MySQLCacheHandler(BaseCache):
|
||||
def __init__(self, user, password, database, host='localhost'):
|
||||
self.user = user
|
||||
self.password = password
|
||||
self.database = database
|
||||
self.host = host
|
||||
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
|
||||
|
||||
self.trim()
|
||||
|
||||
def cursor(self):
|
||||
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
||||
|
||||
def trim(self):
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, url):
|
||||
cursor = self.cursor()
|
||||
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
|
||||
(url,) + value + value)
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, value)
|
||||
|
||||
|
||||
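A behaviour sketch, not from the diff: CappedDict keeps at most CACHE_SIZE entries, and trim() evicts the oldest insertions first (OrderedDict order).

d = CappedDict()
for i in range(CACHE_SIZE + 10):
    d[str(i)] = i

d.trim()
assert len(d) == CACHE_SIZE  # the 10 oldest keys were evicted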
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'mysql':
|
||||
default_cache = MySQLCacheHandler(
|
||||
user = os.getenv('MYSQL_USER'),
|
||||
password = os.getenv('MYSQL_PWD'),
|
||||
database = os.getenv('MYSQL_DB'),
|
||||
host = os.getenv('MYSQL_HOST', 'localhost')
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'sqlite':
|
||||
if 'SQLITE_PATH' in os.environ:
|
||||
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
|
||||
|
||||
else:
|
||||
path = ':memory:'
|
||||
|
||||
default_cache = SQLiteCache(path)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
||||
|
||||
|
||||
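An assumed deployment sketch: the backend is chosen once at import time, so the environment has to be set before the module above is imported; the directory is hypothetical.

import os

os.environ['CACHE'] = 'sqlite'
os.environ['SQLITE_PATH'] = '/var/cache/morss'  # hypothetical; falls back to ':memory:' when unset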
if 'IGNORE_SSL' in os.environ:
|
||||
import ssl
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
@@ -73,7 +73,7 @@ item_updated = atom03:updated
|
||||
mode = json
|
||||
|
||||
mimetype = application/json
|
||||
timeformat = %Y-%m-%dT%H:%M:%SZ
|
||||
timeformat = %Y-%m-%dT%H:%M:%S%z
|
||||
base = {}
|
||||
|
||||
title = title
|
||||
@@ -90,9 +90,6 @@ item_updated = updated
|
||||
[html]
|
||||
mode = html
|
||||
|
||||
path =
|
||||
http://localhost/
|
||||
|
||||
title = //div[@id='header']/h1
|
||||
desc = //div[@id='header']/p
|
||||
items = //div[@id='content']/div
|
||||
|
morss/feeds.py (117 lines changed)
@@ -15,35 +15,31 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import re
|
||||
import json
|
||||
import csv
|
||||
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatch
|
||||
|
||||
from lxml import etree
|
||||
from dateutil import tz
|
||||
import dateutil.parser
|
||||
from copy import deepcopy
|
||||
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
from lxml import etree
|
||||
|
||||
from .readabilite import parse as html_parse
|
||||
from .util import *
|
||||
|
||||
json.encoder.c_make_encoder = None
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from StringIO import StringIO
|
||||
from ConfigParser import RawConfigParser
|
||||
from StringIO import StringIO
|
||||
except ImportError:
|
||||
# python 3
|
||||
from io import StringIO
|
||||
from configparser import RawConfigParser
|
||||
from io import StringIO
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -55,7 +51,7 @@ except NameError:
|
||||
|
||||
def parse_rules(filename=None):
|
||||
if not filename:
|
||||
filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
|
||||
filename = pkg_path('feedify.ini')
|
||||
|
||||
config = RawConfigParser()
|
||||
config.read(filename)
|
||||
@@ -69,18 +65,10 @@ def parse_rules(filename=None):
|
||||
# for each rule
|
||||
|
||||
if rules[section][arg].startswith('file:'):
|
||||
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
except IOError:
|
||||
pass
|
||||
path = data_path('www', rules[section][arg][5:])
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
elif '\n' in rules[section][arg]:
|
||||
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
||||
@@ -88,20 +76,25 @@ def parse_rules(filename=None):
|
||||
return rules
|
||||
|
||||
|
||||
def parse(data, url=None, encoding=None):
|
||||
def parse(data, url=None, encoding=None, ruleset=None):
|
||||
" Determine which ruleset to use "
|
||||
|
||||
rulesets = parse_rules()
|
||||
if ruleset is not None:
|
||||
rulesets = [ruleset]
|
||||
|
||||
else:
|
||||
rulesets = parse_rules().values()
|
||||
|
||||
parsers = [FeedXML, FeedHTML, FeedJSON]
|
||||
|
||||
# 1) Look for a ruleset based on path
|
||||
|
||||
if url is not None:
|
||||
for ruleset in rulesets.values():
|
||||
for ruleset in rulesets:
|
||||
if 'path' in ruleset:
|
||||
for path in ruleset['path']:
|
||||
if fnmatch(url, path):
|
||||
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
|
||||
return parser(data, ruleset, encoding=encoding)
|
||||
|
||||
# 2) Try each and every parser
|
||||
@@ -111,9 +104,6 @@ def parse(data, url=None, encoding=None):
|
||||
# 3b) See if .items matches anything
|
||||
|
||||
for parser in parsers:
|
||||
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
|
||||
try:
|
||||
feed = parser(data, encoding=encoding)
|
||||
|
||||
@@ -124,13 +114,17 @@ def parse(data, url=None, encoding=None):
|
||||
else:
|
||||
# parsing worked, now we try the rulesets
|
||||
|
||||
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
# try anyway if no 'mode' specified
|
||||
|
||||
for ruleset in ruleset_candidates:
|
||||
feed.rules = ruleset
|
||||
|
||||
try:
|
||||
feed.items[0]
|
||||
|
||||
except (AttributeError, IndexError):
|
||||
except (AttributeError, IndexError, TypeError):
|
||||
# parsing and or item picking did not work out
|
||||
pass
|
||||
|
||||
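A sketch of the new ruleset= keyword: bypass feedify.ini and scrape an arbitrary HTML page with hand-written XPath rules, mirroring what FeedFetch builds for the :items option; html_bytes is assumed to have been fetched elsewhere.

ruleset = {
    'mode': 'html',
    'title': '//head/title',
    'items': "//div[@id='content']/div",
    'item_title': '.',
    'item_link': '(.|.//a|ancestor::a)/@href',
}
feed = parse(html_bytes, encoding='utf-8', ruleset=ruleset)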
@@ -193,11 +187,12 @@ class ParserBase(object):
|
||||
return self.convert(FeedHTML).tostring(**k)
|
||||
|
||||
def convert(self, TargetParser):
|
||||
if type(self) == TargetParser:
|
||||
return self
|
||||
|
||||
target = TargetParser()
|
||||
|
||||
if type(self) == TargetParser and self.rules == target.rules:
|
||||
# check both type *AND* rules (e.g. when going from freeform xml to rss)
|
||||
return self
|
||||
|
||||
for attr in target.dic:
|
||||
if attr == 'items':
|
||||
for item in self.items:
|
||||
@@ -366,7 +361,13 @@ class ParserXML(ParserBase):
|
||||
|
||||
def rule_search_all(self, rule):
|
||||
try:
|
||||
return self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
match = self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
if isinstance(match, str):
|
||||
# some xpath rules return a single string instead of an array (e.g. concatenate() )
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -429,7 +430,7 @@ class ParserXML(ParserBase):
|
||||
|
||||
match = self.rule_search(rrule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if key is not None:
|
||||
@@ -440,7 +441,7 @@ class ParserXML(ParserBase):
|
||||
self._clean_node(match)
|
||||
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
||||
|
||||
if self.rules['mode'] == 'html':
|
||||
if self.rules.get('mode') == 'html':
|
||||
match.find('div').drop_tag() # not supported by lxml.etree
|
||||
|
||||
else: # i.e. if atom
|
||||
@@ -456,7 +457,7 @@ class ParserXML(ParserBase):
|
||||
def rule_str(self, rule):
|
||||
match = self.rule_search(rule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
html_rich = ('atom' in rule or self.mode == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if isinstance(match, etree._Element):
|
||||
@@ -489,7 +490,14 @@ class ParserHTML(ParserXML):
|
||||
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
|
||||
rule = re.sub(pattern, repl, rule)
|
||||
|
||||
return self.root.xpath(rule)
|
||||
match = self.root.xpath(rule)
|
||||
|
||||
if isinstance(match, str):
|
||||
# for some xpath rules, see XML parser
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -508,24 +516,31 @@ class ParserHTML(ParserXML):
|
||||
|
||||
|
||||
def parse_time(value):
|
||||
# parsing per se
|
||||
if value is None or value == 0:
|
||||
return None
|
||||
time = None
|
||||
|
||||
elif isinstance(value, basestring):
|
||||
if re.match(r'^[0-9]+$', value):
|
||||
return datetime.fromtimestamp(int(value), tz.tzutc())
|
||||
time = datetime.fromtimestamp(int(value))
|
||||
|
||||
else:
|
||||
return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
|
||||
time = dateutil.parser.parse(value)
|
||||
|
||||
elif isinstance(value, int):
|
||||
return datetime.fromtimestamp(value, tz.tzutc())
|
||||
time = datetime.fromtimestamp(value)
|
||||
|
||||
elif isinstance(value, datetime):
|
||||
return value
|
||||
time = value
|
||||
|
||||
else:
|
||||
return None
|
||||
time = None
|
||||
|
||||
# add default time zone if none set
|
||||
if time is not None and time.tzinfo is None:
|
||||
time = time.replace(tzinfo=tz.tzutc())
|
||||
|
||||
return time
|
||||
|
||||
|
||||
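A behaviour sketch of the rewritten parse_time(): every successful branch now falls through to the same final step, which only attaches tzutc() to naive results.

parse_time('2022-01-01T00:00:01+01:00')  # keeps its +01:00 offset
parse_time('2022-01-01T00:00:01')        # naive input, gets tzutc() attached
parse_time(None)                         # stays None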
class ParserJSON(ParserBase):
|
||||
@@ -684,7 +699,7 @@ class Feed(object):
|
||||
try:
|
||||
setattr(item, attr, new[attr])
|
||||
|
||||
except (IndexError, TypeError):
|
||||
except (KeyError, IndexError, TypeError):
|
||||
pass
|
||||
|
||||
return item
|
||||
@@ -800,6 +815,8 @@ class FeedJSON(Feed, ParserJSON):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
|
||||
|
@@ -16,30 +16,26 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from dateutil import tz
|
||||
|
||||
from fnmatch import fnmatch
|
||||
import re
|
||||
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
|
||||
from . import feeds
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
|
||||
from . import caching, crawler, feeds, readabilite
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from httplib import HTTPException
|
||||
from urlparse import urlparse, urljoin, parse_qs
|
||||
from urlparse import parse_qs, urljoin, urlparse
|
||||
except ImportError:
|
||||
# python 3
|
||||
from http.client import HTTPException
|
||||
from urllib.parse import urlparse, urljoin, parse_qs
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
|
||||
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
||||
@@ -64,7 +60,7 @@ def log(txt):
|
||||
|
||||
else:
|
||||
# when using internal server or cli
|
||||
print(repr(txt))
|
||||
print(repr(txt), file=sys.stderr)
|
||||
|
||||
|
||||
def len_html(txt):
|
||||
@@ -91,12 +87,12 @@ class Options:
|
||||
else:
|
||||
self.options = options or {}
|
||||
|
||||
def __getattr__(self, key):
|
||||
def __getattr__(self, key, default=None):
|
||||
if key in self.options:
|
||||
return self.options[key]
|
||||
|
||||
else:
|
||||
return False
|
||||
return default
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.options[key] = value
|
||||
@@ -104,6 +100,8 @@ class Options:
|
||||
def __contains__(self, key):
|
||||
return key in self.options
|
||||
|
||||
get = __getitem__ = __getattr__
|
||||
|
||||
|
||||
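A sketch of the reworked accessors: unknown keys now come back as None instead of False, and .get() and [] are aliases of attribute access.

opts = Options({'order': 'newest'})
opts.order                         # 'newest'
opts.missing                       # None (previously False)
opts.get('title', '//head/title')  # the default is used when the key is absent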
def ItemFix(item, options, feedurl='/'):
|
||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||
@@ -197,21 +195,20 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
log(item.link)
|
||||
|
||||
# download
|
||||
delay = -1
|
||||
|
||||
if fast or options.fast:
|
||||
if fast or options.cache:
|
||||
# force cache, don't fetch
|
||||
delay = -2
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
# force refresh
|
||||
delay = 0
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
delay = 24*60*60 # 24h
|
||||
policy = None
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
|
||||
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException) as e:
|
||||
log('http error')
|
||||
@@ -221,7 +218,11 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
log('non-text page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
if not req['data']:
|
||||
log('empty page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
@@ -265,33 +266,43 @@ def FeedFetch(url, options):
|
||||
# fetch feed
|
||||
delay = DELAY
|
||||
|
||||
if options.force:
|
||||
delay = 0
|
||||
if options.cache:
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
policy = None
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=url, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
|
||||
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException):
|
||||
raise MorssException('Error downloading feed')
|
||||
|
||||
if options.items:
|
||||
# using custom rules
|
||||
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
|
||||
ruleset = {}
|
||||
|
||||
rss.rules['title'] = options.title if options.title else '//head/title'
|
||||
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
|
||||
ruleset['items'] = options.items
|
||||
|
||||
rss.rules['items'] = options.items
|
||||
if options.mode:
|
||||
ruleset['mode'] = options.mode
|
||||
|
||||
rss.rules['item_title'] = options.item_title if options.item_title else '.'
|
||||
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
|
||||
ruleset['title'] = options.get('title', '//head/title')
|
||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
||||
|
||||
ruleset['item_title'] = options.get('item_title', '.')
|
||||
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
|
||||
|
||||
if options.item_content:
|
||||
rss.rules['item_content'] = options.item_content
|
||||
ruleset['item_content'] = options.item_content
|
||||
|
||||
if options.item_time:
|
||||
rss.rules['item_time'] = options.item_time
|
||||
ruleset['item_time'] = options.item_time
|
||||
|
||||
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
|
||||
rss = rss.convert(feeds.FeedXML)
|
||||
|
||||
else:
|
||||
@@ -321,16 +332,23 @@ def FeedGather(rss, url, options):
|
||||
if options.cache:
|
||||
max_time = 0
|
||||
|
||||
if options.newest:
|
||||
# :newest take the newest items
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
|
||||
# sort
|
||||
sorted_items = list(rss.items)
|
||||
|
||||
else:
|
||||
# default behavior, take the first items (in appearing order)
|
||||
sorted_items = list(rss.items)
|
||||
if options.order == 'last':
|
||||
# `first` does nothing from a practical standpoint, so only `last` needs
|
||||
# to be addressed
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
elif options.order in ['newest', 'oldest']:
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
|
||||
|
||||
if options.order == 'newest':
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
for i, item in enumerate(sorted_items):
|
||||
# hard cap
|
||||
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
|
||||
log('dropped')
|
||||
item.remove()
|
||||
@@ -343,6 +361,7 @@ def FeedGather(rss, url, options):
|
||||
|
||||
item = ItemFix(item, options, url)
|
||||
|
||||
# soft cap
|
||||
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
||||
if not options.proxy:
|
||||
if ItemFill(item, options, url, True) is False:
|
||||
@@ -409,7 +428,7 @@ def process(url, cache=None, options=None):
|
||||
options = Options(options)
|
||||
|
||||
if cache:
|
||||
crawler.default_cache = crawler.SQLiteCache(cache)
|
||||
caching.default_cache = caching.DiskCacheHandler(cache)
|
||||
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
|
@@ -15,22 +15,22 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
import bs4.builder._lxml
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import lxml.html.soupparser
|
||||
|
||||
|
||||
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
|
||||
def default_parser(self, encoding):
|
||||
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
|
||||
|
||||
|
||||
def parse(data, encoding=None):
|
||||
if encoding:
|
||||
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
|
||||
|
||||
else:
|
||||
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
||||
|
||||
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
|
||||
|
||||
return lxml.html.fromstring(data, parser=parser)
|
||||
kwargs = {'from_encoding': encoding} if encoding else {}
|
||||
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs)
|
||||
|
||||
|
||||
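A sketch of the rewritten parse(): BeautifulSoup's encoding detection now feeds lxml directly through soupparser instead of going through a prettify('utf-8') round-trip.

root = parse(b'<html><body><p>succ\xe8s</p></body></html>', encoding='iso-8859-1')
root.findtext('.//p')  # 'succès'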
def count_words(string):
|
||||
@@ -43,6 +43,8 @@ def count_words(string):
|
||||
if string is None:
|
||||
return 0
|
||||
|
||||
string = string.strip()
|
||||
|
||||
i = 0
|
||||
count = 0
|
||||
|
||||
@@ -152,15 +154,20 @@ def score_all(node):
|
||||
|
||||
for child in node:
|
||||
score = score_node(child)
|
||||
child.attrib['morss_own_score'] = str(float(score))
|
||||
set_score(child, score, 'morss_own_score')
|
||||
|
||||
if score > 0 or len(list(child.iterancestors())) <= 2:
|
||||
spread_score(child, score)
|
||||
score_all(child)
|
||||
|
||||
|
||||
def set_score(node, value):
|
||||
node.attrib['morss_score'] = str(float(value))
|
||||
def set_score(node, value, label='morss_score'):
|
||||
try:
|
||||
node.attrib[label] = str(float(value))
|
||||
|
||||
except KeyError:
|
||||
# catch issues with e.g. html comments
|
||||
pass
|
||||
|
||||
|
||||
def get_score(node):
|
||||
@@ -200,6 +207,12 @@ def clean_root(root, keep_threshold=None):
|
||||
def clean_node(node, keep_threshold=None):
|
||||
parent = node.getparent()
|
||||
|
||||
# remove comments
|
||||
if (isinstance(node, lxml.html.HtmlComment)
|
||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
if parent is None:
|
||||
# this is <html/> (or a removed element waiting for GC)
|
||||
return
|
||||
@@ -210,7 +223,7 @@ def clean_node(node, keep_threshold=None):
|
||||
return
|
||||
|
||||
# high score, so keep
|
||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
||||
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||
return
|
||||
|
||||
gdparent = parent.getparent()
|
||||
@@ -231,11 +244,6 @@ def clean_node(node, keep_threshold=None):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove comments
|
||||
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove if too many kids & too high link density
|
||||
wc = count_words(node.text_content())
|
||||
if wc != 0 and len(list(node.iter())) > 3:
|
||||
@@ -293,28 +301,26 @@ def clean_node(node, keep_threshold=None):
|
||||
gdparent.insert(gdparent.index(parent)+1, new_node)
|
||||
|
||||
|
||||
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
||||
ancestorsA = list(nodeA.iterancestors())
|
||||
ancestorsB = list(nodeB.iterancestors())
|
||||
def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
||||
ancestors_a = list(node_a.iterancestors())
|
||||
ancestors_b = list(node_b.iterancestors())
|
||||
|
||||
if max_depth is not None:
|
||||
ancestorsA = ancestorsA[:max_depth]
|
||||
ancestorsB = ancestorsB[:max_depth]
|
||||
ancestors_a = ancestors_a[:max_depth]
|
||||
ancestors_b = ancestors_b[:max_depth]
|
||||
|
||||
ancestorsA.insert(0, nodeA)
|
||||
ancestorsB.insert(0, nodeB)
|
||||
ancestors_a.insert(0, node_a)
|
||||
ancestors_b.insert(0, node_b)
|
||||
|
||||
for ancestorA in ancestorsA:
|
||||
if ancestorA in ancestorsB:
|
||||
return ancestorA
|
||||
for ancestor_a in ancestors_a:
|
||||
if ancestor_a in ancestors_b:
|
||||
return ancestor_a
|
||||
|
||||
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
|
||||
return node_a # should always find one tho, at least <html/>, but needed for max_depth
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
def get_best_node(html, threshold=5):
|
||||
# score all nodes
|
||||
score_all(html)
|
||||
|
||||
# rank all nodes (largest to smallest)
|
||||
@@ -331,9 +337,33 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||
else:
|
||||
best = ranked_nodes[0]
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
|
||||
if xpath is not None:
|
||||
xpath_match = html.xpath(xpath)
|
||||
|
||||
if len(xpath_match):
|
||||
best = xpath_match[0]
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
if best is None:
|
||||
# if threshold not met
|
||||
return None
|
||||
|
||||
# clean up
|
||||
if not debug:
|
||||
keep_threshold = get_score(ranked_nodes[0]) * 3/4
|
||||
keep_threshold = get_score(best) * 3/4
|
||||
clean_root(best, keep_threshold)
|
||||
|
||||
# check for spammy content (links only)
|
||||
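A sketch of the new xpath= escape hatch: when the caller already knows where the article lives, node scoring is skipped and the first match is used directly; raw_html is assumed to have been fetched elsewhere.

article = get_article(raw_html, url='http://example.com/post',
                      encoding_in='utf-8', encoding_out='unicode',
                      xpath='//div[@id="content"]')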
@@ -352,6 +382,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
|
morss/util.py (new file, 57 lines)
@@ -0,0 +1,57 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
|
||||
def pkg_path(*path_elements):
|
||||
return os.path.join(os.path.dirname(__file__), *path_elements)
|
||||
|
||||
|
||||
data_path_base = None
|
||||
|
||||
|
||||
def data_path(*path_elements):
|
||||
global data_path_base
|
||||
|
||||
path = os.path.join(*path_elements)
|
||||
|
||||
if data_path_base is not None:
|
||||
return os.path.join(data_path_base, path)
|
||||
|
||||
bases = [
|
||||
os.path.join(sys.prefix, 'share/morss'), # when installed as root
|
||||
pkg_path('../../../share/morss'),
|
||||
pkg_path('../../../../share/morss'),
|
||||
pkg_path('../share/morss'), # for `pip install --target=dir morss`
|
||||
pkg_path('..'), # when running from source tree
|
||||
]
|
||||
|
||||
if 'DATA_PATH' in os.environ:
|
||||
bases.append(os.environ['DATA_PATH'])
|
||||
|
||||
for base in bases:
|
||||
full_path = os.path.join(base, path)
|
||||
|
||||
if os.path.isfile(full_path):
|
||||
data_path_base = os.path.abspath(base)
|
||||
return data_path(path)
|
||||
|
||||
else:
|
||||
raise IOError()
|
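A usage sketch for the two helpers above: pkg_path() resolves inside the installed package, while data_path() walks the share/morss candidates (or DATA_PATH) and remembers the first base that actually contains the requested file.

feedify_ini = pkg_path('feedify.ini')        # .../morss/feedify.ini
index_html = data_path('www', 'index.html')  # raises IOError if no candidate base has it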
morss/wsgi.py (141 lines changed)
@@ -15,16 +15,16 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import cgitb
|
||||
import mimetypes
|
||||
import os.path
|
||||
import re
|
||||
import lxml.etree
|
||||
|
||||
import cgitb
|
||||
import wsgiref.util
|
||||
import wsgiref.simple_server
|
||||
import sys
|
||||
import wsgiref.handlers
|
||||
import mimetypes
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.util
|
||||
|
||||
import lxml.etree
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -33,13 +33,12 @@ except ImportError:
|
||||
# python 3
|
||||
from urllib.parse import unquote
|
||||
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
||||
from .morss import Options, log, TIMEOUT, DELAY, MorssException
|
||||
from . import caching, crawler, readabilite
|
||||
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
|
||||
MorssException, Options, log)
|
||||
from .util import data_path
|
||||
|
||||
|
||||
PORT = int(os.getenv('PORT', 8080))
|
||||
PORT = int(os.getenv('PORT', 8000))
|
||||
|
||||
|
||||
def parse_options(options):
|
||||
@@ -50,7 +49,7 @@ def parse_options(options):
|
||||
split = option.split('=', 1)
|
||||
|
||||
if len(split) > 1:
|
||||
out[split[0]] = split[1]
|
||||
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
|
||||
|
||||
else:
|
||||
out[split[0]] = True
|
||||
@@ -58,14 +57,18 @@ def parse_options(options):
|
||||
return out
|
||||
|
||||
|
||||
def get_path(environ):
|
||||
def request_uri(environ):
|
||||
if 'REQUEST_URI' in environ:
|
||||
# when running on Apache
|
||||
url = unquote(environ['REQUEST_URI'][1:])
|
||||
# when running on Apache/uwsgi
|
||||
url = environ['REQUEST_URI']
|
||||
|
||||
elif 'RAW_URI' in environ:
|
||||
# gunicorn
|
||||
url = environ['RAW_URI']
|
||||
|
||||
else:
|
||||
# when using internal server
|
||||
url = environ['PATH_INFO'][1:]
|
||||
# when using other servers
|
||||
url = environ['PATH_INFO']
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
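A sketch of what request_uri() returns for the three server families handled above, using minimal fake environs rather than full WSGI dictionaries.

request_uri({'REQUEST_URI': '/:order=newest/http://x', 'QUERY_STRING': ''})  # Apache / uwsgi
request_uri({'RAW_URI': '/:get=page/http://x', 'QUERY_STRING': ''})          # gunicorn
request_uri({'PATH_INFO': '/http://x', 'QUERY_STRING': 'a=b'})               # wsgiref and others -> '/http://x?a=b'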
@@ -76,19 +79,13 @@ def get_path(environ):
|
||||
def cgi_parse_environ(environ):
|
||||
# get options
|
||||
|
||||
url = get_path(environ)
|
||||
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
|
||||
url = request_uri(environ)[1:]
|
||||
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
split = url.split('/', 1)
|
||||
|
||||
raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
|
||||
|
||||
if len(split) > 1:
|
||||
url = split[1]
|
||||
|
||||
else:
|
||||
url = ''
|
||||
parts = url.split('/', 1)
|
||||
raw_options = parts[0].split(':')[1:]
|
||||
url = parts[1] if len(parts) > 1 else ''
|
||||
|
||||
else:
|
||||
raw_options = []
|
||||
@@ -164,33 +161,28 @@ def middleware(func):
|
||||
def cgi_file_handler(environ, start_response, app):
|
||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||
|
||||
url = get_path(environ)
|
||||
url = request_uri(environ)[1:]
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
|
||||
# if it is a legitimate url (no funny relative paths)
|
||||
paths = [
|
||||
os.path.join(sys.prefix, 'share/morss/www', url),
|
||||
os.path.join(os.path.dirname(__file__), '../www', url)
|
||||
]
|
||||
try:
|
||||
path = data_path('www', url)
|
||||
f = open(path, 'rb')
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
f = open(path, 'rb')
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
pass
|
||||
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
continue
|
||||
|
||||
else:
|
||||
# file successfully open
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
else:
|
||||
# file successfully open
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
|
||||
# regex didn't validate or no file found
|
||||
return app(environ, start_response)
|
||||
@@ -200,32 +192,36 @@ def cgi_get(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
# get page
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
if options['get'] in ('page', 'article'):
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
|
||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||
if options.get == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
if req['contenttype'] in crawler.MIMETYPE['html']:
|
||||
if options['get'] == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
|
||||
elif options.get == 'article':
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
else: # i.e. options['get'] == 'article'
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
|
||||
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
|
||||
output = req['data']
|
||||
|
||||
else:
|
||||
raise MorssException('no :get option passed')
|
||||
raise MorssException('unsupported mimetype')
|
||||
|
||||
else:
|
||||
output = req['data']
|
||||
raise MorssException('no :get option passed')
|
||||
|
||||
# return html page
|
||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [output]
|
||||
|
||||
@@ -255,9 +251,9 @@ def cgi_error_handler(environ, start_response, app):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
headers = {'status': '500 Oops', 'content-type': 'text/html'}
|
||||
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR: %s' % repr(e), force=True)
|
||||
log('ERROR: %s' % repr(e))
|
||||
return [cgitb.html(sys.exc_info())]
|
||||
|
||||
|
||||
@@ -283,13 +279,20 @@ def cgi_handle_request():
|
||||
wsgiref.handlers.CGIHandler().run(app)
|
||||
|
||||
|
||||
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
|
||||
def get_environ(self):
|
||||
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
|
||||
env['REQUEST_URI'] = self.path
|
||||
return env
|
||||
|
||||
|
||||
def cgi_start_server():
|
||||
crawler.default_cache.autotrim()
|
||||
caching.default_cache.autotrim()
|
||||
|
||||
print('Serving http://localhost:%s/' % PORT)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
||||
httpd.serve_forever()
|
||||
|
||||
|
||||
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
||||
crawler.default_cache.autotrim()
|
||||
caching.default_cache.autotrim()
|
||||
|
setup.py (52 lines changed)
@@ -1,24 +1,60 @@
|
||||
from setuptools import setup
|
||||
from datetime import datetime
|
||||
from glob import glob
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def get_version():
|
||||
with open('morss/__init__.py', 'r+') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# look for hard coded version number
|
||||
for i in range(len(lines)):
|
||||
if lines[i].startswith('__version__'):
|
||||
version = lines[i].split('"')[1]
|
||||
break
|
||||
|
||||
# create (& save) one if none found
|
||||
if version == '':
|
||||
version = datetime.now().strftime('%Y%m%d.%H%M')
|
||||
lines[i] = '__version__ = "' + version + '"\n'
|
||||
|
||||
file.seek(0)
|
||||
file.writelines(lines)
|
||||
|
||||
# return version number
|
||||
return version
|
||||
|
||||
package_name = 'morss'
|
||||
|
||||
setup(
|
||||
name = package_name,
|
||||
version = get_version(),
|
||||
description = 'Get full-text RSS feeds',
|
||||
author = 'pictuga, Samuel Marks',
|
||||
author_email = 'contact at pictuga dot com',
|
||||
long_description = open('README.md').read(),
|
||||
long_description_content_type = 'text/markdown',
|
||||
author = 'pictuga',
|
||||
author_email = 'contact@pictuga.com',
|
||||
url = 'http://morss.it/',
|
||||
download_url = 'https://git.pictuga.com/pictuga/morss',
|
||||
project_urls = {
|
||||
'Source': 'https://git.pictuga.com/pictuga/morss',
|
||||
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
|
||||
},
|
||||
license = 'AGPL v3',
|
||||
packages = [package_name],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||
extras_require = {
|
||||
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
|
||||
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
|
||||
},
|
||||
python_requires = '>=2.7',
|
||||
package_data = {package_name: ['feedify.ini']},
|
||||
data_files = [
|
||||
('share/' + package_name, ['README.md', 'LICENSE']),
|
||||
('share/' + package_name + '/www', glob('www/*.*')),
|
||||
('share/' + package_name + '/www/cgi', [])
|
||||
],
|
||||
entry_points = {
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
|
||||
})
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main'],
|
||||
},
|
||||
scripts = ['morss-helper'],
|
||||
)
|
||||
|
tests/conftest.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
import os
|
||||
import os.path
|
||||
import threading
|
||||
|
||||
import pytest
|
||||
|
||||
try:
|
||||
# python2
|
||||
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
|
||||
from SimpleHTTPServer import SimpleHTTPRequestHandler
|
||||
except:
|
||||
# python3
|
||||
from http.server import (BaseHTTPRequestHandler, HTTPServer,
|
||||
SimpleHTTPRequestHandler)
|
||||
|
||||
class HTTPReplayHandler(SimpleHTTPRequestHandler):
|
||||
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
|
||||
|
||||
directory = os.path.join(os.path.dirname(__file__), './samples/')
|
||||
|
||||
__init__ = BaseHTTPRequestHandler.__init__
|
||||
|
||||
def do_GET(self):
|
||||
path = self.translate_path(self.path)
|
||||
|
||||
if os.path.isdir(path):
|
||||
f = self.list_directory(path)
|
||||
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
try:
|
||||
self.copyfile(f, self.wfile)
|
||||
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
class MuteHTTPServer(HTTPServer):
|
||||
def handle_error(self, request, client_address):
|
||||
# mute errors
|
||||
pass
|
||||
|
||||
def make_server(port=8888):
|
||||
print('Serving http://localhost:%s/' % port)
|
||||
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
|
||||
|
||||
@pytest.fixture
|
||||
def replay_server():
|
||||
httpd = make_server()
|
||||
thread = threading.Thread(target=httpd.serve_forever)
|
||||
thread.start()
|
||||
|
||||
yield
|
||||
|
||||
httpd.shutdown()
|
||||
thread.join()
|
||||
|
||||
if __name__ == '__main__':
|
||||
httpd = make_server()
|
||||
httpd.serve_forever()
|
tests/samples/200-ok.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
success
|
tests/samples/301-redirect-abs.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: /200-ok.txt
|
||||
|
tests/samples/301-redirect-rel.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: ./200-ok.txt
|
||||
|
tests/samples/301-redirect-url.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: http://localhost:8888/200-ok.txt
|
||||
|
tests/samples/308-redirect.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 308 Permanent Redirect
|
||||
location: /200-ok.txt
|
||||
|
||||
/200-ok.txt
|
tests/samples/alternate-abs.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/enc-gb2312-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=gb2312
|
||||
|
||||
成功
|
tests/samples/enc-gb2312-meta.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta charset="gb2312"/></head>
|
||||
<body>
|
||||
成功
|
||||
</body></html>
|
tests/samples/enc-iso-8859-1-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=iso-8859-1
|
||||
|
||||
succès
|
tests/samples/enc-iso-8859-1-missing.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
succès
|
tests/samples/enc-utf-8-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=UTF-8
|
||||
|
||||
succès
|
tests/samples/feed-atom-utf-8.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content type="html">!ITEM_CONTENT!</content>
|
||||
<link href="!ITEM_LINK!"/>
|
||||
<updated>2022-01-01T00:00:01+01:00</updated>
|
||||
<published>2022-01-01T00:00:02+01:00</published>
|
||||
</entry>
|
||||
</feed>
|
tests/samples/feed-atom03-utf-8.txt (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: application/xml
|
||||
|
||||
<?xml version='1.0' encoding='utf-8' ?>
|
||||
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content>!ITEM_CONTENT!</content>
|
||||
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
|
||||
</entry>
|
||||
</feed>
|
tests/samples/feed-html-utf-8.txt (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/html; charset=utf-8
|
||||
|
||||
<html>
|
||||
<head></head>
|
||||
|
||||
<body>
|
||||
<div id="header">
|
||||
<h1>!TITLE!</h1>
|
||||
<p>!DESC!</p>
|
||||
</div>
|
||||
|
||||
<div id="content">
|
||||
<div class="item">
|
||||
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
|
||||
<div class="desc">!ITEM_DESC!</div>
|
||||
<div class="content">!ITEM_CONTENT!</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
tests/samples/feed-json-utf-8.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json; charset=utf-8
|
||||
|
||||
{
|
||||
"title": "!TITLE!",
|
||||
"desc": "!DESC!",
|
||||
"items": [
|
||||
{
|
||||
"title": "!ITEM_TITLE!",
|
||||
"time": "2022-01-01T00:00:01+0100",
|
||||
"url": "!ITEM_LINK!",
|
||||
"desc": "!ITEM_DESC!",
|
||||
"content": "!ITEM_CONTENT!"
|
||||
}
|
||||
]
|
||||
}
|
tests/samples/feed-rss-channel-utf-8.txt (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
<channel>
|
||||
<title>!TITLE!</title>
|
||||
<description>!DESC!</description>
|
||||
<item>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
|
||||
<link>!ITEM_LINK!</link>
|
||||
<description>!ITEM_DESC!</description>
|
||||
<content:encoded>!ITEM_CONTENT!</content:encoded>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
tests/samples/gzip.txt (new binary file, content not shown)
tests/samples/header-refresh.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 200 OK
|
||||
refresh: 0;url=/200-ok.txt
|
||||
|
tests/samples/meta-redirect-abs.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/meta-redirect-rel.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/meta-redirect-url.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/size-1MiB.txt (new file, 9220 lines; diff suppressed, file too large)
tests/test_crawler.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+import pytest
+
+from morss.crawler import *
+
+
+def test_get(replay_server):
+    assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
+
+def test_adv_get(replay_server):
+    assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
+
+@pytest.mark.parametrize('before,after', [
+    (b'http://localhost:8888/', 'http://localhost:8888/'),
+    ('localhost:8888/', 'http://localhost:8888/'),
+    ('http:/localhost:8888/', 'http://localhost:8888/'),
+    ('http://localhost:8888/&/', 'http://localhost:8888/&/'),
+    ('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
+    ('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
+    ('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
+    ])
+def test_sanitize_url(before, after):
+    assert sanitize_url(before) == after
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
+def test_size_limit_handler(replay_server, opener):
+    assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
+def test_gzip_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
+@pytest.mark.parametrize('url', [
+    'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
+    'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
+    'enc-utf-8-header.txt',
+    ])
+def test_encoding_fix_handler(replay_server, opener, url):
+    out = adv_get('http://localhost:8888/%s' % url)
+    out = out['data'].decode(out['encoding'])
+    assert 'succes' in out or 'succès' in out or '成功' in out
+
+@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
+def test_alternate_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
+def test_http_equiv_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
+def test_http_all_redirect_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
+def test_http_refresh_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
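Note: the tests above also serve as a usage reference for morss.crawler. Below is a minimal sketch based only on the behaviour visible in this diff (sanitize_url() normalises a URL string, adv_get() returns a dict with 'data' bytes and the detected 'encoding'); the replay_server fixture itself is defined outside this excerpt and is not needed for plain use.

    # Sketch based solely on the calls exercised in tests/test_crawler.py above.
    from morss.crawler import adv_get, sanitize_url

    url = sanitize_url('localhost:8888/200-ok.txt')  # -> 'http://localhost:8888/200-ok.txt'
    out = adv_get(url)                               # returns a dict with 'data' and 'encoding', as asserted above
    print(out['data'].decode(out['encoding']))       # decode the body with the detected encoding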
108
tests/test_feeds.py
Normal file
@@ -0,0 +1,108 @@
+import pytest
+
+from morss.crawler import adv_get
+from morss.feeds import *
+
+
+def get_feed(url):
+    url = 'http://localhost:8888/%s' % url
+    out = adv_get(url)
+    feed = parse(out['data'], url=url, encoding=out['encoding'])
+    return feed
+
+def check_feed(feed):
+    # NB. time and updated not covered
+    assert feed.title == '!TITLE!'
+    assert feed.desc == '!DESC!'
+    assert feed[0] == feed.items[0]
+    assert feed[0].title == '!ITEM_TITLE!'
+    assert feed[0].link == '!ITEM_LINK!'
+    assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
+    assert '!ITEM_CONTENT!' in feed[0].content
+
+def check_output(feed):
+    output = feed.tostring()
+    assert '!TITLE!' in output
+    assert '!DESC!' in output
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
+
+def check_change(feed):
+    feed.title = '!TITLE2!'
+    feed.desc = '!DESC2!'
+    feed[0].title = '!ITEM_TITLE2!'
+    feed[0].link = '!ITEM_LINK2!'
+    feed[0].desc = '!ITEM_DESC2!'
+    feed[0].content = '!ITEM_CONTENT2!'
+
+    assert feed.title == '!TITLE2!'
+    assert feed.desc == '!DESC2!'
+    assert feed[0].title == '!ITEM_TITLE2!'
+    assert feed[0].link == '!ITEM_LINK2!'
+    assert '!ITEM_DESC2!' in feed[0].desc
+    assert '!ITEM_CONTENT2!' in feed[0].content
+
+def check_add(feed):
+    feed.append({
+        'title': '!ITEM_TITLE3!',
+        'link': '!ITEM_LINK3!',
+        'desc': '!ITEM_DESC3!',
+        'content': '!ITEM_CONTENT3!',
+        })
+
+    assert feed[1].title == '!ITEM_TITLE3!'
+    assert feed[1].link == '!ITEM_LINK3!'
+    assert '!ITEM_DESC3!' in feed[1].desc
+    assert '!ITEM_CONTENT3!' in feed[1].content
+
+each_format = pytest.mark.parametrize('url', [
+    'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
+    'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
+    ])
+
+each_check = pytest.mark.parametrize('check', [
+    check_feed, check_output, check_change, check_add,
+    ])
+
+@each_format
+@each_check
+def test_parse(replay_server, url, check):
+    feed = get_feed(url)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_rss(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedXML)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_json(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedJSON)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_html(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedHTML)
+    if len(feed) > 1:
+        # remove the 'blank' default html item
+        del feed[0]
+    check(feed)
+
+@each_format
+def test_convert_csv(replay_server, url):
+    # only csv output, not csv feed, check therefore different
+    feed = get_feed(url)
+    output = feed.tocsv()
+
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
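Note: similarly, the checks above outline the morss.feeds API. A minimal sketch, assuming only what the tests show (parse() builds a feed from bytes, feeds expose title/desc and indexable items, convert() re-types a feed, tostring() serialises it); the sample URL is the replay-server file added earlier in this diff.

    # Sketch based solely on tests/test_feeds.py above.
    from morss.crawler import adv_get
    from morss.feeds import parse, FeedJSON

    url = 'http://localhost:8888/feed-rss-channel-utf-8.txt'
    out = adv_get(url)
    feed = parse(out['data'], url=url, encoding=out['encoding'])

    print(feed.title, feed[0].link)           # fields asserted in check_feed()
    print(feed.convert(FeedJSON).tostring())  # re-serialise the same feed as JSON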
@@ -1,15 +0,0 @@
|
||||
Options -Indexes
|
||||
|
||||
ErrorDocument 403 "Access forbidden"
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
ErrorDocument 500 "A very nasty bug found his way onto this very server"
|
||||
|
||||
# Uncomment below line to turn debug on for all requests
|
||||
#SetEnv DEBUG 1
|
||||
|
||||
# Uncomment below line to turn debug on for requests with :debug in the url
|
||||
#SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
@@ -1,9 +0,0 @@
|
||||
order allow,deny
|
||||
|
||||
deny from all
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
@@ -16,6 +16,7 @@
 <title>RSS feed by morss</title>
 <meta name="viewport" content="width=device-width; initial-scale=1.0;" />
 <meta name="robots" content="noindex" />
+<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
 
 <style type="text/css">
 body * {
@@ -191,9 +192,9 @@
 feed as
 <select>
 <option value="">RSS</option>
-<option value=":json:cors">JSON</option>
-<option value=":html">HTML</option>
-<option value=":csv">CSV</option>
+<option value=":format=json:cors">JSON</option>
+<option value=":format=html">HTML</option>
+<option value=":format=csv">CSV</option>
 </select>
 using the
 <select>
@@ -203,7 +204,9 @@
 link of the
 <select>
 <option value="">first</option>
-<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+<option value=":order=last">last</option>
+<option value=":order=oldest">oldest</option>
 </select>
 items and
 <select>