Compare commits


1 Commits

Author SHA1 Message Date
52a6d447b7 o 2020-04-19 13:14:13 +02:00
51 changed files with 1110 additions and 11969 deletions


@@ -1,78 +0,0 @@
name: default

on:
  push:
    branches:
      - master

jobs:
  test-lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
      - name: Install dependencies
        run: pip3 install .[full] .[dev]
      - run: isort --check-only --diff .
      - run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
      - run: pytest --cov=morss tests

  python-publish:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip python3-build
      - name: Build package
        run: python3 -m build
      - name: Publish package
        uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.pypi_api_token }}

  docker-publish-deploy:
    runs-on: ubuntu-latest
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up QEMU
        uses: https://github.com/docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: https://github.com/docker/setup-buildx-action@v2
      - name: Login to Docker Hub
        uses: https://github.com/docker/login-action@v2
        with:
          username: ${{ secrets.docker_user }}
          password: ${{ secrets.docker_pwd }}
      - name: Build and push
        uses: https://github.com/docker/build-push-action@v4
        with:
          context: .
          platforms: linux/amd64,linux/arm64,linux/arm/v7
          push: true
          tags: ${{ secrets.docker_repo }}
      - name: Deploy on server
        uses: https://github.com/appleboy/ssh-action@v0.1.10
        with:
          host: ${{ secrets.ssh_host }}
          username: ${{ secrets.ssh_user }}
          key: ${{ secrets.ssh_key }}
          script: morss-update


@@ -1,50 +0,0 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree
[MESSAGES CONTROL]
disable=missing-function-docstring,
        missing-class-docstring,
        missing-module-docstring,
        wrong-spelling-in-comment,
[REPORTS]
reports=yes
score=yes
[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss
[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes
[VARIABLES]
allow-global-unused-variables=no
init-import=no
[FORMAT]
expected-line-ending-format=LF
indent-string=' '
max-line-length=120
max-module-lines=1000
[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case
include-naming-hint=yes
bad-names=foo, bar
good-names=i, j, k


@@ -1,16 +1,8 @@
-FROM alpine:edge
-ADD . /app
-RUN set -ex; \
-    apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
-    apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
-    pip3 install --no-cache-dir /app[full]; \
-    apk del .build-deps
-USER 1000:1000
-ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
-CMD ["run"]
-HEALTHCHECK CMD /bin/sh /app/morss-helper check
+FROM alpine:latest
+RUN apk add python3 py3-lxml py3-gunicorn py3-pip git
+ADD . /
+RUN pip3 install /
+CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss:cgi_standalone_app

README.md

@@ -1,14 +1,10 @@
# Morss - Get full-text RSS feeds
-[Homepage](https://morss.it/) • _GNU AGPLv3 code_
-[Upstream source code](https://git.pictuga.com/pictuga/morss) •
-[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
-[![Build Status](https://ci.pictuga.com/api/badges/pictuga/morss/status.svg)](https://ci.pictuga.com/pictuga/morss)
-[![Github Stars](https://img.shields.io/github/stars/pictuga/morss?logo=github)](https://github.com/pictuga/morss/stargazers)
-[![Github Forks](https://img.shields.io/github/forks/pictuga/morss?logo=github)](https://github.com/pictuga/morss/network/members)
-[![GNU AGPLv3 code](https://img.shields.io/static/v1?label=license&message=AGPLv3)](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
-[![Logo is CC BY-NC-SA 4.0](https://img.shields.io/static/v1?label=CC&message=BY-NC-SA%204.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
+Upstream source code: https://git.pictuga.com/pictuga/morss
+Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
+Homepage: https://morss.it/
This tool's goal is to get full-text RSS feeds out of stripped RSS feeds,
commonly available on internet. Indeed most newspapers only make a small
@@ -22,7 +18,7 @@ Morss also provides additional features, such as: .csv and json export, extended
control over output. A strength of morss is its ability to deal with broken
feeds, and to replace tracking links with direct links to the actual content.
-Morss can also generate feeds from html and json files (see `feeds.py`), which
+Morss can also generate feeds from html and json files (see `feedify.py`), which
for instance makes it possible to get feeds for Facebook or Twitter, using
hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
@@ -33,7 +29,6 @@ Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Export feeds as RSS/JSON/CSV/HTML
@@ -41,213 +36,73 @@ Some features of morss:
- Follow 301/meta redirects
- Recover xml feeds with corrupt encoding
- Supports gzip-compressed http content
-- HTTP caching with different backends (in-memory/redis/diskcache)
+- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
- Works as server/cli tool
- Deobfuscate various tracking links
## Install ## Dependencies
### Python package You do need:
![Build Python](https://img.shields.io/badge/dynamic/json?label=build%20python&query=$.stages[?(@.name=='python')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest) - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
[![PyPI](https://img.shields.io/pypi/v/morss)](https://pypi.org/project/morss/) - [lxml](http://lxml.de/) for xml parsing
[![PyPI Downloads](https://img.shields.io/pypi/dm/morss)](https://pypistats.org/packages/morss) - [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
- pymysql
Simple install (without optional dependencies) Simplest way to get these:
From pip
```shell ```shell
pip install morss pip install git+https://git.pictuga.com/pictuga/morss.git@master
``` ```
From git You may also need:
```shell - Apache, with python-cgi support, to run on a server
pip install git+https://git.pictuga.com/pictuga/morss.git - a fast internet connection
```
Full installation (including optional dependencies) ## Arguments
From pip morss accepts some arguments, to lightly alter the output of morss. Arguments
may need to have a value (usually a string or a number). In the different "Use
cases" below is detailed how to pass those arguments to morss.
```shell The arguments are:
pip install morss[full]
```
From git - Change what morss does
- `json`: output as JSON
- `html`: output as HTML
- `csv`: output as CSV
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
- `csv`: export to csv
- `indent`: returns indented XML or JSON, takes more place, but human-readable
- `nolink`: drop links, but keeps links' inner text
- `noref`: drop items' link
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
- `debug`: to have some feedback from the script execution. Useful for debugging
- `theforce`: force download the rss feed and ignore cached http errors
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into an RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
- `item_link`: xpath rule relative to `items` to point to the entry's link
- `item_title`: entry's title
- `item_content`: entry's description
- `item_time`: entry's date & time (accepts a wide range of time formats)
```shell ## Use cases
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
```
The full install includes all the cache backends. Otherwise, only in-memory
cache is available. The full install also includes gunicorn (for more efficient
HTTP handling).
The dependency `lxml` takes fairly long to install (especially on Raspberry Pi, as
C code needs to be compiled). If possible on your distribution, try installing
it with the system package manager.
### Docker
![Build Docker](https://img.shields.io/badge/dynamic/json?label=build%20docker&query=$.stages[?(@.name=='docker')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
[![Docker Hub](https://img.shields.io/docker/pulls/pictuga/morss)](https://hub.docker.com/r/pictuga/morss)
[![Docker Arch](https://img.shields.io/badge/dynamic/json?color=blue&label=docker%20arch&query=$.results[0].images[*].architecture&url=https://hub.docker.com/v2/repositories/pictuga/morss/tags)](https://hub.docker.com/r/pictuga/morss/tags)
From docker hub
With cli
```shell
docker pull pictuga/morss
```
With docker-compose **(recommended)**
```yml
services:
app:
image: pictuga/morss
ports:
- '8000:8000'
```
Build from source
With cli
```shell
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
```
With docker-compose
```yml
services:
app:
build: https://git.pictuga.com/pictuga/morss.git
image: morss
ports:
- '8000:8000'
```
Then execute
```shell
docker-compose build --no-cache --pull
```
### Cloud providers
One-click deployment:
[![Heroku](https://img.shields.io/static/v1?label=deploy%20to&message=heroku&logo=heroku&color=79589F)](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
[![Google Cloud](https://img.shields.io/static/v1?label=deploy%20to&message=google&logo=google&color=4285F4)](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
``` yml
#cloud-config
packages:
- python3-pip
- python3-wheel
- python3-lxml
- python3-setproctitle
- ca-certificates
write_files:
- path: /etc/environment
append: true
content: |
DEBUG=1
CACHE=diskcache
CACHE_SIZE=1073741824 # 1GiB
- path: /var/lib/cloud/scripts/per-boot/morss.sh
permissions: 744
content: |
#!/bin/sh
/usr/local/bin/morss-helper daemon
runcmd:
- source /etc/environment
- update-ca-certificates
- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
- netfilter-persistent save
- pip install morss[full]
```
## Run
morss will auto-detect what "mode" to use.
-### Running on/as a server
+### Running on a server
Set up the server as indicated below, then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache
redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
other clients.
#### Using Docker
From docker hub
```shell
docker run -p 8000:8000 pictuga/morss
```
From source
```shell
docker run -p 8000:8000 morss
```
With docker-compose **(recommended)**
```shell
docker-compose up
```
#### Using Gunicorn
```shell
gunicorn --preload morss
```
#### Using uWSGI
Running this command should do:
```shell
uwsgi --http :8000 --plugin python --wsgi-file main.py
```
#### Using morss' internal HTTP server
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
The latter should start when you run morss without any argument, on port 8000.
I'd highly recommend using gunicorn or something similar for better
performance.
```shell
morss
```
You can change the port using environment variables like this `PORT=9000 morss`.
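If neither gunicorn nor uWSGI is available, the same WSGI application (exposed as `morss.wsgi.application` in the newer code, see `main.py` further down in this diff) can also be served with the standard library alone. A minimal sketch, not an officially documented mode:

```python
# Minimal sketch: serving morss' WSGI application with the standard library only.
# Assumes the newer code layout where morss.wsgi exposes `application`.
from wsgiref.simple_server import make_server

from morss.wsgi import application

httpd = make_server('0.0.0.0', 8000, application)  # same default port as above
httpd.serve_forever()
```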
#### Via mod_cgi/FastCGI with Apache/nginx
For this, you'll want to change a bit the architecture of the files, for example
@@ -276,49 +131,73 @@ For this, you need to make sure your host allows python script execution. This
method uses HTTP calls to fetch the RSS feeds, which will be handled through
`mod_cgi` for example on Apache servers.
-Please pay attention to `main.py` permissions for it to be executable. See below
-some tips for the `.htaccess` file.
+Please pay attention to `main.py` permissions for it to be executable. Also
+ensure that the provided `/www/.htaccess` works well with your server.
```htaccess #### Using uWSGI
Options -Indexes
ErrorDocument 404 /cgi/main.py Running this command should do:
# Turn debug on for all requests ```shell
SetEnv DEBUG 1 uwsgi --http :8080 --plugin python --wsgi-file main.py
# Turn debug on for requests with :debug in the url
SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>
<Files main.py>
allow from all
AddHandler cgi-script .py
Options +ExecCGI
</Files>
``` ```
#### Using Gunicorn
```shell
gunicorn morss:cgi_standalone_app
```
#### Using docker
Build & run
```shell
docker build https://git.pictuga.com/pictuga/morss.git -t morss
docker run -p 8080:8080 morss
```
In one line
```shell
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
```
#### Using morss' internal HTTP server
Morss can run its own HTTP server. The later should start when you run morss
without any argument, on port 8080.
```shell
morss
```
You can change the port like this `morss 9000`.
#### Passing arguments
Then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other clients.
### As a CLI application
Run:
```
-morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
+morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
```
-For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
+For example: `morss debug http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
If using Docker:
```shell
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
```
### As a newsreader hook
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
@@ -326,13 +205,10 @@ To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
scripts can be run on top of the RSS feed, using its
[output](http://lzone.de/liferea/scraping.htm) as an RSS feed.
-To use this script, you have to enable "(Unix) command" in liferea feed
-settings, and use the command:
+To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
```
-morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
+morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
```
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
@@ -340,7 +216,6 @@ For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
### As a python library
Quickly get a full-text feed:
```python
>>> import morss
>>> xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
@@ -349,11 +224,10 @@ Quickly get a full-text feed:
```
Using cache and passing arguments:
```python
>>> import morss
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
->>> cache = '/tmp/morss-cache' # diskcache cache location
+>>> cache = '/tmp/morss-cache.db' # sqlite cache location
>>> options = {'csv':True}
>>> xml_string = morss.process(url, cache, options)
>>> xml_string[:50]
@@ -365,165 +239,53 @@ possible to call the simpler functions, to have more control on what's happening
under the hood.
Doing it step-by-step:
```python
-import morss
+import morss, morss.crawler
url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True) # arguments
-url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
+morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
+rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
-output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
+output = morss.Format(rss, options) # formats final feed
```
## Arguments and settings ## Cache information
### Arguments morss uses caching to make loading faster. There are 3 possible cache backends
(visible in `morss/crawler.py`):
morss accepts some arguments, to lightly alter the output of morss. Arguments - `{}`: a simple python in-memory dict() object
may need to have a value (usually a string or a number). How to pass those - `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
arguments to morss is explained in Run above. be cleared every time the program is run
- `MySQLCacheHandler`
The list of arguments can be obtained by running `morss --help` ## Configuration
### Length limitation
```
usage: morss [-h] [--post STRING] [--xpath XPATH]
[--format {rss,json,html,csv}] [--search STRING] [--clip]
[--indent] [--cache] [--force] [--proxy]
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
[--item_content XPATH] [--item_time XPATH]
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
url
Get full-text RSS feeds
positional arguments:
url feed url
options:
-h, --help show this help message and exit
--post STRING POST request
--xpath XPATH xpath rule to manually detect the article
output:
--format {rss,json,html,csv}
output format
--search STRING does a basic case-sensitive search in the feed
--clip stick the full article content under the original feed
content (useful for twitter)
--indent returns indented XML or JSON, takes more place, but
human-readable
action:
--cache only take articles from the cache (ie. don't grab new
articles' content), so as to save time
--force force refetch the rss feed and articles
--proxy doesn't fill the articles
--order {first,last,newest,oldest}
order in which to process items (which are however NOT
sorted in the output)
--firstlink pull the first article mentioned in the description
instead of the default link
--resolve replace tracking links with direct links to articles
(not compatible with --proxy)
custom feeds:
--items XPATH (mandatory to activate the custom feeds function)
xpath rule to match all the RSS entries
--item_link XPATH xpath rule relative to items to point to the entry's
link
--item_title XPATH entry's title
--item_content XPATH entry's content
--item_time XPATH entry's date & time (accepts a wide range of time
formats)
--mode {xml,html,json}
parser to use for the custom feeds
misc:
--nolink drop links, but keeps links' inner text
--noref drop items' link
--silent don't output the final RSS (useless on its own, but
can be nice when debugging)
GNU AGPLv3 code
```
Further HTTP-only options:
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other
servers)
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
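For illustration, these HTTP-only options can be combined with the argument syntax described above to consume a feed programmatically. A minimal sketch (with `morss.example` as the placeholder host used throughout this README, and assuming the `format=json` argument maps to the URL syntax as shown):

```python
# Sketch: requesting JSON output (with CORS enabled) from a morss instance.
# 'morss.example' is a placeholder host, not a real deployment.
import json
from urllib.request import urlopen

url = 'http://morss.example/:format=json:cors/https://www.bbc.co.uk/news'
with urlopen(url, timeout=30) as resp:
    feed = json.loads(resp.read().decode('utf-8'))

print(list(feed))  # inspect the top-level keys of the returned feed
```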
### Environment variables
To pass environment variables:
- Docker-cli: `docker run -p 8000:8000 --env KEY=value morss`
- docker-compose: add an `environment:` section in the .yml file
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
- cloud-init: in the `/etc/environment` file
Generic:
- `DEBUG=1`: to have some feedback from the script execution. Useful for
debugging.
- `IGNORE_SSL=1`: to ignore SSL certs when fetching feeds and articles
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
- `DATA_PATH`: to set custom file location for the `www` folder
When parsing long feeds, with a lot of items (100+), morss might take a lot of When parsing long feeds, with a lot of items (100+), morss might take a lot of
time to parse it, or might even run into a memory overflow on some shared time to parse it, or might even run into a memory overflow on some shared
hosting plans (limits around 10Mb), in which case you might want to adjust the hosting plans (limits around 10Mb), in which case you might want to adjust the
below settings via environment variables. different values at the top of the script.
Also, if the request takes too long to process, the http request might be - `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
discarded. See relevant config for - `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the nexts settings.
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or - `LIM_TIME` sets the maximum amount of time spent working on the feed (whether or not it's already cached). Articles beyond that limit will be dropped from the feed. `-1` for unlimited.
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout). - `LIM_ITEM` sets the maximum number of article checked, limiting both the number of articles fetched and taken from cache. Articles beyond that limit will be dropped from the feed, even if they're cached. `-1` for unlimited.
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching* ### Other settings
articles, more time might be spent taking older articles from cache. `-1` for
unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
More articles will be taken from cache following the next settings.
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
(whether or not it's already cached). Articles beyond that limit will be dropped
from the feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the
number of articles fetched and taken from cache. Articles beyond that limit will
be dropped from the feed, even if they're cached. `-1` for unlimited.
morss uses caching to make loading faster. There are 3 possible cache backends: - `DELAY` sets the browser cache delay, only for HTTP clients
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
- `(nothing/default)`: a simple python in-memory dict-like object.
- `CACHE=redis`: Redis cache. Connection can be defined with the following
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
`DISKCACHE_DIR`.
To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
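These variables are read when morss starts, so from Python they can also be set before the import, which has the same effect as exporting them in the shell. A minimal sketch (the directory path is a hypothetical example):

```python
# Sketch: selecting the diskcache backend and sizing it from the environment
# before morss is imported (equivalent to exporting the variables in a shell).
import os

os.environ['CACHE'] = 'diskcache'
os.environ['DISKCACHE_DIR'] = '/var/cache/morss'   # hypothetical target directory
os.environ['CACHE_SIZE'] = str(1073741824)         # 1 GiB, in bytes for diskcache

import morss  # the cache backend is picked when the morss modules load
```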
Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
environment variable.
### Content matching
The content of articles is grabbed with our own readability fork. This means
that most of the time the right content is matched. However sometimes it fails,
therefore some tweaking is required. Most of the time, what has to be done is to
-add some "rules" in the main script file in `readabilite.py` (not in morss).
+add some "rules" in the main script file in *readability* (not in morss).
Most of the time when hardly anything is matched, it means that the main content
of the article is made of images, videos, pictures, etc., which readability
@@ -534,3 +296,14 @@ morss will also try to figure out whether the full content is already in place
(for those websites which understood the whole point of RSS feeds). However this
detection is very simple, and only works if the actual content is put in the
"content" section in the feed and not in the "summary" section.
***
## Todo
You can contribute to this project. If you're not sure what to do, you can pick
from this list:
- Add ability to run morss.py as an update daemon
- Add ability to use custom xpath rule instead of readability
- More ideas here <https://github.com/pictuga/morss/issues/15>


@@ -1,21 +0,0 @@
{
    "stack": "container",
    "env": {
        "DEBUG": {
            "value": 1,
            "required": false
        },
        "GUNICORN_CMD_ARGS": {
            "value": "",
            "required": false
        },
        "CACHE": {
            "value": "diskcache",
            "required": false
        },
        "CACHE_SIZE": {
            "value": 1073741824,
            "required": false
        }
    }
}


@@ -1,3 +0,0 @@
build:
  docker:
    web: Dockerfile

main.py Executable file → Normal file

@@ -1,24 +1,6 @@
#!/usr/bin/env python
-# This file is part of morss
-#
-# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Affero General Public License as published by the Free
-# Software Foundation, either version 3 of the License, or (at your option) any
-# later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Affero General Public License along
-# with this program. If not, see <https://www.gnu.org/licenses/>.
-from morss.__main__ import main
-from morss.wsgi import application
+from morss import main, cgi_standalone_app as application

if __name__ == '__main__':
    main()


@@ -1,47 +0,0 @@
#! /bin/sh

set -ex

if ! command -v python && command -v python3 ; then
    alias python='python3'
fi

run() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
}

daemon() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
}

reload() {
    pid=$(pidof 'gunicorn: master [morss]' || true)
    # NB. requires python-setproctitle
    # `|| true` due to `set -e`

    if [ -z "$pid" ]; then
        # if gunicorn is not currently running
        daemon

    else
        kill -s USR2 $pid
        kill -s WINCH $pid
        sleep 1 # give gunicorn some time to reload
        kill -s TERM $pid
    fi
}

check() {
    python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
}

if [ -z "$1" ]; then
    run

elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
    $@

else
    python -m morss $@
fi
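The `check` target above probes the local server with morss' own crawler module. A rough Python equivalent, using the `get` helper defined in the newer `morss/crawler.py` shown further down in this diff:

```python
# Rough Python equivalent of the `check` health probe above.
# In the newer API, crawler.get() returns the response body as bytes.
from morss import crawler

data = crawler.get('http://localhost:8000/', timeout=5)
print('OK, received %i bytes' % len(data))
```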


@@ -1,13 +0,0 @@
[Unit]
Description=morss server (gunicorn)
After=network.target
[Service]
ExecStart=/usr/local/bin/morss-helper run
ExecReload=/usr/local/bin/morss-helper reload
KillMode=process
Restart=always
User=http
[Install]
WantedBy=multi-user.target


@@ -1,25 +1,2 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `import morss`
# pylint: disable=unused-import,unused-variable
__version__ = ""
from .morss import *
from .wsgi import application


@@ -1,48 +1,5 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `python -m morss`
from .morss import main
import os
import sys
from . import cli, wsgi
from .morss import MorssException
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi (w/o file handler)
wsgi.cgi_handle_request()
elif len(sys.argv) <= 1:
# start internal (basic) http server (w/ file handler)
wsgi.cgi_start_server()
else:
# as a CLI app
try:
cli.cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
    main()


@@ -1,122 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import threading
import time

from collections import OrderedDict

CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000))       # max number of items in cache (default: 1k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)


class BaseCache:
    """ Subclasses must behave like a dict """

    def trim(self):
        pass

    def autotrim(self, delay=CACHE_LIFESPAN):
        # trim the cache every so often
        self.trim()

        t = threading.Timer(delay, self.autotrim)
        t.daemon = True
        t.start()

    def __contains__(self, url):
        try:
            self[url]

        except KeyError:
            return False

        else:
            return True


class CappedDict(OrderedDict, BaseCache):
    def trim(self):
        if CACHE_SIZE >= 0:
            for i in range( max( len(self) - CACHE_SIZE , 0 )):
                self.popitem(False)

    def __setitem__(self, key, data):
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        if key in self:
            del self[key]
        OrderedDict.__setitem__(self, key, data)


try:
    import redis # isort:skip
except ImportError:
    pass


class RedisCacheHandler(BaseCache):
    def __init__(self, host='localhost', port=6379, db=0, password=None):
        self.r = redis.Redis(host=host, port=port, db=db, password=password)

    def __getitem__(self, key):
        return self.r.get(key)

    def __setitem__(self, key, data):
        self.r.set(key, data)


try:
    import diskcache # isort:skip
except ImportError:
    pass


class DiskCacheHandler(BaseCache):
    def __init__(self, directory=None, **kwargs):
        self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)

    def __del__(self):
        self.cache.close()

    def trim(self):
        self.cache.cull()

    def __getitem__(self, key):
        return self.cache[key]

    def __setitem__(self, key, data):
        self.cache.set(key, data)


if 'CACHE' in os.environ:
    if os.environ['CACHE'] == 'redis':
        default_cache = RedisCacheHandler(
            host = os.getenv('REDIS_HOST', 'localhost'),
            port = int(os.getenv('REDIS_PORT', 6379)),
            db = int(os.getenv('REDIS_DB', 0)),
            password = os.getenv('REDIS_PWD', None)
        )

    elif os.environ['CACHE'] == 'diskcache':
        default_cache = DiskCacheHandler(
            directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
            size_limit = CACHE_SIZE # in Bytes
        )

else:
    default_cache = CappedDict()
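For reference, a minimal usage sketch for the classes above (the module path `morss.caching` is assumed from the `from .caching import default_cache` import seen elsewhere in this diff):

```python
# Minimal usage sketch for the cache backends above. With no CACHE environment
# variable set, default_cache is the in-memory CappedDict defined in this file.
from morss import caching

cache = caching.default_cache
cache['http://example.com/feed'] = b'<rss>...</rss>'  # behaves like a dict

print('http://example.com/feed' in cache)  # True, standard dict-style membership test

cache.autotrim()  # trim down to CACHE_SIZE items every CACHE_LIFESPAN seconds
```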


@@ -1,72 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import os.path
import sys

from .morss import FeedFetch, FeedFormat, FeedGather, Options


def cli_app():
    parser = argparse.ArgumentParser(
        prog='morss',
        description='Get full-text RSS feeds',
        epilog='GNU AGPLv3 code'
    )

    parser.add_argument('url', help='feed url')

    parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')

    group = parser.add_argument_group('output')
    group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
    group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
    group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
    group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more place, but human-readable')

    group = parser.add_argument_group('action')
    group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
    group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
    group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
    group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
    group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
    group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')

    group = parser.add_argument_group('custom feeds')
    group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
    group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
    group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
    group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
    group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
    group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')

    group = parser.add_argument_group('misc')
    group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
    group.add_argument('--noref', action='store_true', help='drop items\' link')
    group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')

    options = Options(vars(parser.parse_args()))
    url = options.url

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options, 'unicode')

    if not options.silent:
        print(out)
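For comparison, the same pipeline can be driven without going through argparse. A minimal sketch reusing the functions imported at the top of this file, with options passed as a plain dict (mirroring `Options(vars(parser.parse_args()))` above):

```python
# Sketch: calling the FeedFetch/FeedGather/FeedFormat pipeline directly.
from morss.morss import FeedFetch, FeedFormat, FeedGather, Options

options = Options({'format': 'json', 'indent': True})

url, rss = FeedFetch('http://feeds.bbci.co.uk/news/rss.xml', options)
rss = FeedGather(rss, url, options)
print(FeedFormat(rss, options, 'unicode'))
```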


@@ -1,52 +1,21 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import pickle
import random
import re
import sys import sys
import time
import zlib import zlib
from cgi import parse_header
from collections import OrderedDict
from io import BytesIO, StringIO from io import BytesIO, StringIO
import re
import chardet import chardet
from cgi import parse_header
from .caching import default_cache import lxml.html
import time
try: try:
# python 2 # python 2
from urllib import quote from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
import mimetools
from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list)
from urlparse import urlsplit
except ImportError: except ImportError:
# python 3 # python 3
from email import message_from_string from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from http.client import HTTPMessage import email
from urllib.parse import quote, urlsplit
from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list)
try: try:
# python 2 # python 2
@@ -59,60 +28,34 @@ except NameError:
MIMETYPE = { MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'], 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], 'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml'], 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
'json': ['application/json'],
}
DEFAULT_UAS = [ DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
PROTOCOL = ['http', 'https']
def get(*args, **kwargs): def get(*args, **kwargs):
return adv_get(*args, **kwargs)['data'] return adv_get(*args, **kwargs)[0]
def adv_get(url, post=None, timeout=None, *args, **kwargs): def adv_get(url, timeout=None, *args, **kwargs):
url = sanitize_url(url)
if post is not None:
post = post.encode('utf-8')
if timeout is None: if timeout is None:
con = custom_opener(*args, **kwargs).open(url, data=post) con = custom_handler(*args, **kwargs).open(url)
else: else:
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout) con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
data = con.read() data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0] contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding= detect_encoding(data, con) encoding= detect_encoding(data, con)
return { return data, con, contenttype, encoding
'data': data,
'url': con.geturl(),
'con': con,
'contenttype': contenttype,
'encoding': encoding
}
def custom_opener(follow=None, policy=None, force_min=None, force_max=None): def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
# as per urllib2 source code, these Handlers are added first # *unless* one of the custom handlers inherits from one of them
# *unless* one of the custom handlers inherits from one of them # *unless* one of the custom handlers inherits from one of them
# #
@@ -120,135 +63,25 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
# HTTPDefaultErrorHandler, HTTPRedirectHandler, # HTTPDefaultErrorHandler, HTTPRedirectHandler,
# FTPHandler, FileHandler, HTTPErrorProcessor] # FTPHandler, FileHandler, HTTPErrorProcessor]
# & HTTPSHandler # & HTTPSHandler
#
# when processing a request:
# (1) all the *_request are run
# (2) the *_open are run until sth is returned (other than None)
# (3) all the *_response are run
#
# During (3), if an http error occurs (i.e. not a 2XX response code), the
# http_error_* are run until sth is returned (other than None). If they all
# return nothing, a python error is raised
handlers = [ #handlers.append(DebugHandler())
#DebugHandler(), handlers.append(SizeLimitHandler(100*1024)) # 100KiB
SizeLimitHandler(500*1024), # 500KiB handlers.append(HTTPCookieProcessor())
HTTPCookieProcessor(), handlers.append(GZIPHandler())
GZIPHandler(), handlers.append(HTTPEquivHandler())
HTTPAllRedirectHandler(), handlers.append(HTTPRefreshHandler())
HTTPEquivHandler(), handlers.append(UAHandler(DEFAULT_UA))
HTTPRefreshHandler(), handlers.append(BrowserlyHeaderHandler())
UAHandler(random.choice(DEFAULT_UAS)), handlers.append(EncodingFixHandler(encoding))
BrowserlyHeaderHandler(),
EncodingFixHandler(),
]
if follow: if follow:
handlers.append(AlternateHandler(MIMETYPE[follow])) handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max)) handlers.append(CacheHandler(force_min=delay))
return build_opener(*handlers) return build_opener(*handlers)
def is_ascii(string):
# there's a native function in py3, but home-made fix for backward compatibility
try:
string.encode('ascii')
except UnicodeError:
return False
else:
return True
def soft_quote(string):
" url-quote only when not a valid ascii string "
if is_ascii(string):
return string
else:
return quote(string.encode('utf-8'))
def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes):
url = url.decode('utf-8')
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
parts = urlsplit(url)
parts = parts._replace(
netloc=parts.netloc.replace(
parts.hostname,
parts.hostname.encode('idna').decode('ascii')
),
path=soft_quote(parts.path),
query=soft_quote(parts.query),
fragment=soft_quote(parts.fragment),
)
return parts.geturl()
class RespDataHandler(BaseHandler):
" Make it easier to use the response body "
def data_reponse(self, req, resp, data):
pass
def http_response(self, req, resp):
# read data
data = resp.read()
# process data and use returned content (if any)
data = self.data_response(req, resp, data) or data
# reformat the stuff
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class RespStrHandler(RespDataHandler):
" Make it easier to use the _decoded_ response body "
def str_reponse(self, req, resp, data_str):
pass
def data_response(self, req, resp, data):
#decode
enc = detect_encoding(data, resp)
data_str = data.decode(enc, 'replace')
#process
data_str = self.str_response(req, resp, data_str)
# return
data = data_str.encode(enc) if data_str is not None else data
#return
return data
class DebugHandler(BaseHandler): class DebugHandler(BaseHandler):
handler_order = 2000 handler_order = 2000
@@ -269,7 +102,7 @@ class SizeLimitHandler(BaseHandler):
handler_order = 450 handler_order = 450
def __init__(self, limit=5*1024**2): def __init__(self, limit=5*1024^2):
self.limit = limit self.limit = limit
def http_response(self, req, resp): def http_response(self, req, resp):
@@ -290,29 +123,32 @@ def UnGzip(data):
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data) return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
class GZIPHandler(RespDataHandler): class GZIPHandler(BaseHandler):
def http_request(self, req): def http_request(self, req):
req.add_unredirected_header('Accept-Encoding', 'gzip') req.add_unredirected_header('Accept-Encoding', 'gzip')
return req return req
def data_response(self, req, resp, data): def http_response(self, req, resp):
if 200 <= resp.code < 300: if 200 <= resp.code < 300:
if resp.headers.get('Content-Encoding') == 'gzip': if resp.headers.get('Content-Encoding') == 'gzip':
data = resp.read()
data = UnGzip(data)
resp.headers['Content-Encoding'] = 'identity' resp.headers['Content-Encoding'] = 'identity'
return UnGzip(data) fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
https_request = http_request
def detect_encoding(data, resp=None): def detect_encoding(data, resp=None):
enc = detect_raw_encoding(data, resp)
if enc.lower() == 'gb2312':
enc = 'gbk'
return enc
def detect_raw_encoding(data, resp=None):
if resp is not None: if resp is not None:
enc = resp.headers.get('charset') enc = resp.headers.get('charset')
if enc is not None: if enc is not None:
@@ -337,9 +173,32 @@ def detect_raw_encoding(data, resp=None):
return 'utf-8' return 'utf-8'
class EncodingFixHandler(RespStrHandler): class EncodingFixHandler(BaseHandler):
def str_response(self, req, resp, data_str): def __init__(self, encoding=None):
return data_str self.encoding = encoding
def http_response(self, req, resp):
maintype = resp.info().get('Content-Type', '').split('/')[0]
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
if not self.encoding:
enc = detect_encoding(data, resp)
else:
enc = self.encoding
if enc:
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class UAHandler(BaseHandler): class UAHandler(BaseHandler):
@@ -365,58 +224,59 @@ class BrowserlyHeaderHandler(BaseHandler):
https_request = http_request https_request = http_request
def iter_html_tag(html_str, tag_name): class AlternateHandler(BaseHandler):
" To avoid parsing whole pages when looking for a simple tag "
re_tag = r'<%s\s+[^>]+>' % tag_name
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
for tag_match in re.finditer(re_tag, html_str):
attr_match = re.findall(re_attr, tag_match.group(0))
if attr_match is not None:
yield dict(attr_match)
class AlternateHandler(RespStrHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> " " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, follow=None): def __init__(self, follow=None):
self.follow = follow or [] self.follow = follow or []
def str_response(self, req, resp, data_str): def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0] contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow: if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
for link in iter_html_tag(data_str[:10000], 'link'): data = resp.read()
if (link.get('rel') == 'alternate' links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
and link.get('type') in self.follow
and 'href' in link): for link in links:
if link.get('type', '') in self.follow:
resp.code = 302 resp.code = 302
resp.msg = 'Moved Temporarily' resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href') resp.headers['location'] = link.get('href')
break
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPEquivHandler(RespStrHandler): class HTTPEquivHandler(BaseHandler):
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers " " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
handler_order = 600 handler_order = 600
def str_response(self, req, resp, data_str): def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0] contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']: if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
for meta in iter_html_tag(data_str[:10000], 'meta'): headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
if 'http-equiv' in meta and 'content' in meta:
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
class HTTPAllRedirectHandler(HTTPRedirectHandler): fp = BytesIO(data)
def http_error_308(self, req, fp, code, msg, headers): old_resp = resp
return self.http_error_301(req, fp, 301, msg, headers) resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPRefreshHandler(BaseHandler): class HTTPRefreshHandler(BaseHandler):
@@ -425,7 +285,7 @@ class HTTPRefreshHandler(BaseHandler):
def http_response(self, req, resp): def http_response(self, req, resp):
if 200 <= resp.code < 300: if 200 <= resp.code < 300:
if resp.headers.get('refresh'): if resp.headers.get('refresh'):
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$' regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
match = re.search(regex, resp.headers.get('refresh')) match = re.search(regex, resp.headers.get('refresh'))
if match: if match:
@@ -441,105 +301,139 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response https_response = http_response
def parse_headers(text=u'\n\n'): default_cache = {}
if sys.version_info[0] >= 3:
# python 3
return message_from_string(text, _class=HTTPMessage)
else:
# python 2
return HTTPMessage(StringIO(text))
def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg
return resp
class CacheHandler(BaseHandler): class CacheHandler(BaseHandler):
" Cache based on etags/last-modified " " Cache based on etags/last-modified "
privacy = 'private' # Websites can indicate whether the page should be cached private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
# by CDNs (e.g. it shouldn't be the case for
# private/confidential/user-specific pages). With this
# setting, decide whether you want the cache to behave
# like a CDN (i.e. don't cache private pages, 'public'),
# or to behave like an end-user and cache private pages
# ('private'). If unsure, 'public' is the safest bet,
# but many websites abuse this feature...
# NB. This overrides all the other min/max/policy settings.
handler_order = 499 handler_order = 499
def __init__(self, cache=None, force_min=None, force_max=None, policy=None): def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache self.cache = cache or default_cache
self.force_min = force_min self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
self.force_max = force_max
self.policy = policy # can be cached/refresh/offline/None (default)
# Servers indicate how long they think their content is "valid". With
# this parameter (force_min/max, expressed in seconds), we can override
# the validity period (i.e. bypassing http headers)
# Special choices, via "policy":
# cached: use the cache no matter what (and fetch the page online if
# not present in cache)
# refresh: valid zero second, i.e. force refresh
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead
# None: just follow protocols
# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
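A sketch of how these knobs are meant to be combined, going by the constructor and comments above (the import path is an assumption and may differ between the two versions compared here):

    # from morss.crawler import CacheHandler  # hypothetical import path, adjust to your tree
    try:
        from urllib.request import build_opener  # python 3
    except ImportError:
        from urllib2 import build_opener  # python 2

    opener = build_opener(CacheHandler(policy='cached'))                # cache first, fetch only if missing
    # opener = build_opener(CacheHandler(policy='offline'))             # cache only, never go online
    # opener = build_opener(CacheHandler(force_min=300, force_max=3600))  # trust the cache for 5 to 60 minutes
    # opener.open('https://morss.it')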
def load(self, url): def load(self, url):
try: try:
data = pickle.loads(self.cache[url]) out = list(self.cache[url])
except KeyError: except KeyError:
data = None out = [None, None, unicode(), bytes(), 0]
if sys.version_info[0] >= 3:
out[2] = email.message_from_string(out[2] or unicode()) # headers
else:
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
return out
def save(self, url, code, msg, headers, data, timestamp):
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
def http_request(self, req):
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
if 'etag' in headers:
req.add_unredirected_header('If-None-Match', headers['etag'])
if 'last-modified' in headers:
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
return req
def http_open(self, req):
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# some info needed to process everything
cache_control = parse_http_list(headers.get('cache-control', ()))
cache_control += parse_http_list(headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - timestamp
# list in a simple way what to do when
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
# we're just in the middle of a dirty trick, use cache
pass
elif self.force_min == -2:
if code is not None:
# already in cache, perfect, use cache
pass
else:
headers['Morss'] = 'from_cache'
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
return resp
elif code is None:
# cache empty, refresh
return None
elif self.force_min == -1:
# force use cache
pass
elif self.force_min == 0:
# force refresh
return None
elif code == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
# use force_min=0 if you want to bypass this (needed for a proper refresh)
pass
elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list
or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust it; if not, use force_min=0), use cache
pass
elif self.force_min is not None and self.force_min > cache_age:
# still recent enough for us, use cache
pass
else: else:
data['headers'] = parse_headers(data['headers'] or unicode()) # according to the www, we have to refresh when nothing is said
return None
return data # return the cache as a response
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
def save(self, key, data): return resp
data['headers'] = unicode(data['headers'])
self.cache[key] = pickle.dumps(data, 0)
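load() and save() boil down to pickling a small dict into whatever mapping backs the cache; the same round-trip in isolation:

    import pickle
    import time

    cache = {}  # any dict-like backend (in-memory, SQLite wrapper, ...) works the same way

    entry = {'code': 200, 'msg': 'OK', 'headers': 'content-type: text/html',
             'data': b'<html/>', 'timestamp': time.time()}
    cache['https://example.com/'] = pickle.dumps(entry, 0)          # save()
    print(pickle.loads(cache['https://example.com/'])['code'])      # load() -> 200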
def cached_response(self, req, fallback=None): def http_response(self, req, resp):
req.from_morss_cache = True # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
data = self.load(req.get_full_url()) if resp.code == 304:
if data is not None:
# return the cache as a response
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
resp.msg = data['msg']
return resp return resp
else: if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
return fallback cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ()))
def save_response(self, req, resp): cc_list = [x for x in cache_control if '=' not in x]
if req.from_morss_cache:
# do not re-save (would reset the timing) if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications
return resp
if resp.headers.get('Morss') == 'from_cache':
# it comes from cache, so no need to save it again
return resp return resp
# save to disk
data = resp.read() data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
self.save(req.get_full_url(), {
'code': resp.code,
'msg': resp.msg,
'headers': resp.headers,
'data': data,
'timestamp': time.time()
})
fp = BytesIO(data) fp = BytesIO(data)
old_resp = resp old_resp = resp
@@ -548,140 +442,111 @@ class CacheHandler(BaseHandler):
return resp return resp
def http_request(self, req): def http_error_304(self, req, fp, code, msg, headers):
req.from_morss_cache = False # to track whether it comes from cache cache = list(self.load(req.get_full_url()))
data = self.load(req.get_full_url()) if cache[0]:
cache[-1] = time.time()
self.save(req.get_full_url(), *cache)
if data is not None: new = Request(req.get_full_url(),
if 'etag' in data['headers']: headers=req.headers,
req.add_unredirected_header('If-None-Match', data['headers']['etag']) unverifiable=True)
if 'last-modified' in data['headers']: new.add_unredirected_header('Morss', 'from_304')
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
return req return self.parent.open(new, timeout=req.timeout)
def http_open(self, req): return None
# Reminder of how/when this function is called by urllib2:
# If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response'
# Here, we try to see whether we want to use data from cache (i.e.
# return 'resp'), or whether we want to refresh the content (return
# 'None')
data = self.load(req.get_full_url())
if data is not None:
# some info needed to process everything
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
cache_control += parse_http_list(data['headers'].get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - data['timestamp']
# list in a simple way what to do in special cases
if data is not None and 'private' in cc_list and self.privacy == 'public':
# private data but public cache, do not use cache
# privacy concern, so handled first and foremost
# (and doesn't need to be addressed anymore afterwards)
return None
elif self.policy == 'offline':
# use cache, or return an error
return self.cached_response(
req,
error_response(409, 'Conflict', req.get_full_url())
)
elif self.policy == 'cached':
# use cache, or fetch online
return self.cached_response(req, None)
elif self.policy == 'refresh':
# force refresh
return None
elif data is None:
# we have already settled all the cases that don't need the cache.
# all the following ones need the cached item
return None
elif self.force_max is not None and cache_age > self.force_max:
# older than we want, refresh
return None
elif self.force_min is not None and cache_age < self.force_min:
# recent enough, use cache
return self.cached_response(req)
elif data['code'] == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh)
return self.cached_response(req)
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
# kindly follow web servers' indications and refresh. If the same
# settings are used all along, this section shouldn't be of any use,
# since the page wouldn't have been cached in the first place; the check
# is only performed "just in case"
# NB. NOT respected if force_min is set
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust it; if not, use overrides), use cache
return self.cached_response(req)
else:
# according to the www, we have to refresh when nothing is said
return None
def http_response(self, req, resp):
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
if resp.code == 304 and resp.url in self.cache:
# we are hopefully the first after the HTTP handler, so no need
# to re-run all the *_response
# here: cached page, returning from cache
return self.cached_response(req)
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
# kindly follow web servers indications (do not save & return)
return resp
else:
# save
return self.save_response(req, resp)
else:
return self.save_response(req, resp)
https_request = http_request https_request = http_request
https_open = http_open https_open = http_open
https_response = http_response https_response = http_response
if 'IGNORE_SSL' in os.environ: class BaseCache:
import ssl """ Subclasses must behave like a dict """
ssl._create_default_https_context = ssl._create_unverified_context
def __contains__(self, url):
try:
self[url]
except KeyError:
return False
else:
return True
if __name__ == '__main__': import sqlite3
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive:
print('>>> Interactive shell: try using `req`')
else: class SQLiteCache(BaseCache):
print(req['data'].decode(req['encoding'])) def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
def __del__(self):
self.con.close()
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
value = list(value)
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
if url in self:
with self.con:
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
value + (url,))
else:
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
def __init__(self, user, password, database, host='localhost'):
self.user = user
self.password = password
self.database = database
self.host = host
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def __getitem__(self, url):
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
if url in self:
with self.cursor() as cursor:
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
value + (url,))
else:
with self.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
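Both cache backends expose the same mapping interface, so they can be swapped freely; a quick check with the SQLite one defined above (values follow the (code, msg, headers, data, timestamp) layout):

    cache = SQLiteCache(':memory:')  # or SQLiteCache('/path/to/morss-cache.db')

    cache['https://example.com/'] = (200, 'OK', 'content-type: text/html', b'<html/>', 0)
    print('https://example.com/' in cache)   # True, via BaseCache.__contains__
    print(cache['https://example.com/'][0])  # 200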

@@ -73,7 +73,7 @@ item_updated = atom03:updated
mode = json mode = json
mimetype = application/json mimetype = application/json
timeformat = %Y-%m-%dT%H:%M:%S%z timeformat = %Y-%m-%dT%H:%M:%SZ
base = {} base = {}
title = title title = title
@@ -90,6 +90,9 @@ item_updated = updated
[html] [html]
mode = html mode = html
path =
http://localhost/
title = //div[@id='header']/h1 title = //div[@id='header']/h1
desc = //div[@id='header']/p desc = //div[@id='header']/p
items = //div[@id='content']/div items = //div[@id='content']/div

@@ -1,45 +1,32 @@
# This file is part of morss import sys
# import os.path
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import csv
import json
import re
from copy import deepcopy
from datetime import datetime from datetime import datetime
import re
import json
import csv
from fnmatch import fnmatch from fnmatch import fnmatch
import dateutil.parser
import lxml.html
from dateutil import tz
from lxml import etree from lxml import etree
from dateutil import tz
import dateutil.parser
from copy import deepcopy
import lxml.html
from .readabilite import parse as html_parse from .readabilite import parse as html_parse
from .util import *
json.encoder.c_make_encoder = None json.encoder.c_make_encoder = None
try: try:
# python 2 # python 2
from ConfigParser import RawConfigParser
from StringIO import StringIO from StringIO import StringIO
from ConfigParser import RawConfigParser
except ImportError: except ImportError:
# python 3 # python 3
from configparser import RawConfigParser
from io import StringIO from io import StringIO
from configparser import RawConfigParser
try: try:
# python 2 # python 2
@@ -51,7 +38,7 @@ except NameError:
def parse_rules(filename=None): def parse_rules(filename=None):
if not filename: if not filename:
filename = pkg_path('feedify.ini') filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
config = RawConfigParser() config = RawConfigParser()
config.read(filename) config.read(filename)
@@ -65,10 +52,18 @@ def parse_rules(filename=None):
# for each rule # for each rule
if rules[section][arg].startswith('file:'): if rules[section][arg].startswith('file:'):
path = data_path('www', rules[section][arg][5:]) paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
file_raw = open(path).read() os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw) os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
rules[section][arg] = file_clean
for path in paths:
try:
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
except IOError:
pass
elif '\n' in rules[section][arg]: elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:] rules[section][arg] = rules[section][arg].split('\n')[1:]
@@ -76,25 +71,20 @@ def parse_rules(filename=None):
return rules return rules
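Roughly what parse_rules() produces for one section (the ini content below is illustrative, not the shipped feedify.ini):

    from configparser import RawConfigParser  # python 3 shown; python 2 uses ConfigParser

    ini = u"""
    [example]
    mode = html
    path =
        https://example.com/*
    items = //div[@id='content']/div
    """

    config = RawConfigParser()
    config.read_string(ini)
    rules = {s: dict(config.items(s)) for s in config.sections()}
    rules['example']['path'] = rules['example']['path'].split('\n')[1:]  # multi-line value -> list
    print(rules['example'])
    # {'mode': 'html', 'path': ['https://example.com/*'], 'items': "//div[@id='content']/div"}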
def parse(data, url=None, encoding=None, ruleset=None): def parse(data, url=None, mimetype=None, encoding=None):
" Determine which ruleset to use " " Determine which ruleset to use "
if ruleset is not None: rulesets = parse_rules()
rulesets = [ruleset]
else:
rulesets = parse_rules().values()
parsers = [FeedXML, FeedHTML, FeedJSON] parsers = [FeedXML, FeedHTML, FeedJSON]
# 1) Look for a ruleset based on path # 1) Look for a ruleset based on path
if url is not None: if url is not None:
for ruleset in rulesets: for ruleset in rulesets.values():
if 'path' in ruleset: if 'path' in ruleset:
for path in ruleset['path']: for path in ruleset['path']:
if fnmatch(url, path): if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified? parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset, encoding=encoding) return parser(data, ruleset, encoding=encoding)
# 2) Try each and every parser # 2) Try each and every parser
@@ -104,27 +94,26 @@ def parse(data, url=None, encoding=None, ruleset=None):
# 3b) See if .items matches anything # 3b) See if .items matches anything
for parser in parsers: for parser in parsers:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# 'path' as they should have been caught beforehand
try: try:
feed = parser(data, encoding=encoding) feed = parser(data, encoding=encoding)
except (ValueError, SyntaxError): except (ValueError):
# parsing did not work # parsing did not work
pass pass
else: else:
# parsing worked, now we try the rulesets # parsing worked, now we try the rulesets
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
# 'path' as they should have been caught beforehand
# try anyway if no 'mode' specified
for ruleset in ruleset_candidates: for ruleset in ruleset_candidates:
feed.rules = ruleset feed.rules = ruleset
try: try:
feed.items[0] feed.items[0]
except (AttributeError, IndexError, TypeError): except (AttributeError, IndexError):
# parsing and or item picking did not work out # parsing and or item picking did not work out
pass pass
@@ -187,12 +176,11 @@ class ParserBase(object):
return self.convert(FeedHTML).tostring(**k) return self.convert(FeedHTML).tostring(**k)
def convert(self, TargetParser): def convert(self, TargetParser):
target = TargetParser() if type(self) == TargetParser:
if type(self) == TargetParser and self.rules == target.rules:
# check both type *AND* rules (e.g. when going from freeform xml to rss)
return self return self
target = TargetParser()
for attr in target.dic: for attr in target.dic:
if attr == 'items': if attr == 'items':
for item in self.items: for item in self.items:
@@ -331,7 +319,7 @@ class ParserXML(ParserBase):
return self.root.getparent().remove(self.root) return self.root.getparent().remove(self.root)
def tostring(self, encoding='unicode', **k): def tostring(self, encoding='unicode', **k):
return etree.tostring(self.root, encoding=encoding, method='xml', **k) return etree.tostring(self.root, encoding=encoding, **k)
def _rule_parse(self, rule): def _rule_parse(self, rule):
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -361,13 +349,7 @@ class ParserXML(ParserBase):
def rule_search_all(self, rule): def rule_search_all(self, rule):
try: try:
match = self.root.xpath(rule, namespaces=self.NSMAP) return self.root.xpath(rule, namespaces=self.NSMAP)
if isinstance(match, str):
# some xpath rules return a single string instead of an array (e.g. concatenate() )
return [match,]
else:
return match
except etree.XPathEvalError: except etree.XPathEvalError:
return [] return []
@@ -419,8 +401,7 @@ class ParserXML(ParserBase):
return return
elif key is not None: elif key is not None:
if key in match.attrib: del x.attrib[key]
del match.attrib[key]
else: else:
match.getparent().remove(match) match.getparent().remove(match)
@@ -430,7 +411,7 @@ class ParserXML(ParserBase):
match = self.rule_search(rrule) match = self.rule_search(rrule)
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \ html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')] and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if key is not None: if key is not None:
@@ -441,7 +422,7 @@ class ParserXML(ParserBase):
self._clean_node(match) self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div')) match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
if self.rules.get('mode') == 'html': if self.rules['mode'] == 'html':
match.find('div').drop_tag() # not supported by lxml.etree match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom else: # i.e. if atom
@@ -457,7 +438,7 @@ class ParserXML(ParserBase):
def rule_str(self, rule): def rule_str(self, rule):
match = self.rule_search(rule) match = self.rule_search(rule)
html_rich = ('atom' in rule or self.mode == 'html') \ html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')] and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if isinstance(match, etree._Element): if isinstance(match, etree._Element):
@@ -481,7 +462,7 @@ class ParserHTML(ParserXML):
return html_parse(raw, encoding=self.encoding) return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k): def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k) return lxml.html.tostring(self.root, encoding=encoding, **k)
def rule_search_all(self, rule): def rule_search_all(self, rule):
try: try:
@@ -490,14 +471,7 @@ class ParserHTML(ParserXML):
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]' repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
rule = re.sub(pattern, repl, rule) rule = re.sub(pattern, repl, rule)
match = self.root.xpath(rule) return self.root.xpath(rule)
if isinstance(match, str):
# for some xpath rules, see XML parser
return [match,]
else:
return match
except etree.XPathEvalError: except etree.XPathEvalError:
return [] return []
@@ -516,31 +490,24 @@ class ParserHTML(ParserXML):
def parse_time(value): def parse_time(value):
# parsing per se
if value is None or value == 0: if value is None or value == 0:
time = None return None
elif isinstance(value, basestring): elif isinstance(value, basestring):
if re.match(r'^[0-9]+$', value): if re.match(r'^[0-9]+$', value):
time = datetime.fromtimestamp(int(value)) return datetime.fromtimestamp(int(value), tz.tzutc())
else: else:
time = dateutil.parser.parse(value) return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
elif isinstance(value, int): elif isinstance(value, int):
time = datetime.fromtimestamp(value) return datetime.fromtimestamp(value, tz.tzutc())
elif isinstance(value, datetime): elif isinstance(value, datetime):
time = value return value
else: else:
time = None return None
# add default time zone if none set
if time is not None and time.tzinfo is None:
time = time.replace(tzinfo=tz.tzutc())
return time
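parse_time() accepts epoch integers, epoch strings, free-form date strings and datetime objects; the same conversions done by hand with dateutil:

    from datetime import datetime
    import dateutil.parser
    from dateutil import tz

    print(datetime.fromtimestamp(1587294853, tz.tzutc()))            # epoch int -> aware datetime (UTC)
    print(dateutil.parser.parse('2020-04-19T13:14:13+02:00'))        # ISO string keeps its own offset
    print(dateutil.parser.parse('19 Apr 2020').replace(tzinfo=tz.tzutc()))  # naive date -> default to UTC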
class ParserJSON(ParserBase): class ParserJSON(ParserBase):
@@ -641,41 +608,34 @@ class ParserJSON(ParserBase):
return out.replace('\n', '<br/>') if out else out return out.replace('\n', '<br/>') if out else out
def wrap_uniq(wrapper_fn_name): class Uniq(object):
" Wraps the output of the function with the specified function " _map = {}
# This is called when parsing "wrap_uniq('wrap_item')" _id = None
def decorator(func): def __new__(cls, *args, **kwargs):
# This is called when parsing "@wrap_uniq('wrap_item')" # check if a wrapper was already created for it
# if so, reuse it
# if not, create a new one
# note that the item itself (the tree node) is created beforehands
def wrapped_func(self, *args, **kwargs): tmp_id = cls._gen_id(*args, **kwargs)
# This is called when the wrapped function is called if tmp_id in cls._map:
return cls._map[tmp_id]
output = func(self, *args, **kwargs) else:
output_id = id(output) obj = object.__new__(cls) #, *args, **kwargs)
cls._map[tmp_id] = obj
try: return obj
return self._map[output_id]
except (KeyError, AttributeError):
if not hasattr(self, '_map'):
self._map = {}
wrapper_fn = getattr(self, wrapper_fn_name)
obj = wrapper_fn(output)
self._map[output_id] = obj
return obj
return wrapped_func
return decorator
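A hypothetical standalone use of wrap_uniq(): the wrapper built for a given raw value is created once and reused, so two lookups of the same item return the same object.

    class Bag(object):
        def __init__(self):
            self.raw = ['alpha', 'beta']  # stand-ins for tree nodes

        def wrap_item(self, value):
            return {'wrapped': value}     # stand-in for building an Item around a node

        @wrap_uniq('wrap_item')
        def __getitem__(self, key):
            return self.raw[key]

    bag = Bag()
    print(bag[0] is bag[0])  # True: the memoised wrapper is returned both times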
class Feed(object): class Feed(object):
itemsClass = property(lambda x: Item) # because Item is defined below, i.e. afterwards itemsClass = 'Item'
dic = ('title', 'desc', 'items') dic = ('title', 'desc', 'items')
def wrap_items(self, items):
itemsClass = globals()[self.itemsClass]
return [itemsClass(x, self.rules, self) for x in items]
title = property( title = property(
lambda f: f.get('title'), lambda f: f.get('title'),
lambda f,x: f.set('title', x), lambda f,x: f.set('title', x),
@@ -691,7 +651,10 @@ class Feed(object):
self.rule_create(self.rules['items']) self.rule_create(self.rules['items'])
item = self.items[-1] item = self.items[-1]
for attr in self.itemsClass.dic: if new is None:
return
for attr in globals()[self.itemsClass].dic:
try: try:
setattr(item, attr, getattr(new, attr)) setattr(item, attr, getattr(new, attr))
@@ -699,17 +662,11 @@ class Feed(object):
try: try:
setattr(item, attr, new[attr]) setattr(item, attr, new[attr])
except (KeyError, IndexError, TypeError): except (IndexError, TypeError):
pass pass
return item
def wrap_item(self, item):
return self.itemsClass(item, self.rules, self)
@wrap_uniq('wrap_item')
def __getitem__(self, key): def __getitem__(self, key):
return self.get_raw('items')[key] return self.wrap_items(self.get_raw('items'))[key]
def __delitem__(self, key): def __delitem__(self, key):
self[key].remove() self[key].remove()
@@ -718,7 +675,7 @@ class Feed(object):
return len(self.get_raw('items')) return len(self.get_raw('items'))
class Item(object): class Item(Uniq):
dic = ('title', 'link', 'desc', 'content', 'time', 'updated') dic = ('title', 'link', 'desc', 'content', 'time', 'updated')
def __init__(self, xml=None, rules=None, parent=None): def __init__(self, xml=None, rules=None, parent=None):
@@ -757,45 +714,32 @@ class Item(object):
lambda f: f.rmv('item_updated') ) lambda f: f.rmv('item_updated') )
class FeedXML(Feed, ParserXML):
itemsClass = 'ItemXML'
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
if self.root.getprevious() is None:
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
class ItemXML(Item, ParserXML): class ItemXML(Item, ParserXML):
pass pass
class FeedXML(Feed, ParserXML): class FeedHTML(Feed, ParserHTML):
itemsClass = ItemXML itemsClass = 'ItemHTML'
def root_siblings(self):
out = []
current = self.root.getprevious()
while current is not None:
out.append(current)
current = current.getprevious()
return out
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
# and to add stylesheet
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
for stylesheet in stylesheets:
# remove all stylesheets present (be that ours or others')
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
self.root.remove(stylesheet)
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
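The xml-stylesheet trick used by FeedXML.tostring(), reduced to a standalone snippet:

    from lxml import etree

    root = etree.fromstring('<rss version="2.0"><channel/></rss>')
    root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
    print(etree.tostring(root.getroottree(), encoding='unicode'))
    # prints the <?xml-stylesheet ...?> processing instruction followed by the feed itself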
class ItemHTML(Item, ParserHTML): class ItemHTML(Item, ParserHTML):
pass pass
class FeedHTML(Feed, ParserHTML): class FeedJSON(Feed, ParserJSON):
itemsClass = ItemHTML itemsClass = 'ItemJSON'
class ItemJSON(Item, ParserJSON): class ItemJSON(Item, ParserJSON):
@@ -809,22 +753,3 @@ class ItemJSON(Item, ParserJSON):
return return
cur = cur[node] cur = cur[node]
class FeedJSON(Feed, ParserJSON):
itemsClass = ItemJSON
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
if sys.flags.interactive:
print('>>> Interactive shell: try using `feed`')
else:
for item in feed.items:
print(item.title, item.link)

@@ -1,66 +1,74 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import re
import sys import sys
import os
import os.path
import time import time
from datetime import datetime from datetime import datetime
from dateutil import tz
from fnmatch import fnmatch from fnmatch import fnmatch
import re
import lxml.etree import lxml.etree
import lxml.html import lxml.html
from dateutil import tz
from . import caching, crawler, feeds, readabilite from . import feeds
from . import crawler
from . import readabilite
import wsgiref.simple_server
import wsgiref.handlers
import cgitb
try: try:
# python 2 # python 2
from httplib import HTTPException from httplib import HTTPException
from urlparse import parse_qs, urljoin, urlparse from urllib import unquote
from urlparse import urlparse, urljoin, parse_qs
except ImportError: except ImportError:
# python 3 # python 3
from http.client import HTTPException from http.client import HTTPException
from urllib.parse import parse_qs, urljoin, urlparse from urllib.parse import unquote
from urllib.parse import urlparse, urljoin, parse_qs
MAX_ITEM = 5 # cache-only beyond
MAX_TIME = 2 # cache-only after (in sec)
LIM_ITEM = 10 # deletes what's beyond
LIM_TIME = 2.5 # deletes what's after
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec)
DEBUG = False
PORT = 8080
PROTOCOL = ['http', 'https']
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond def filterOptions(options):
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec) return options
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond # example of filtering code below
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec) #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec) #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered
class MorssException(Exception): class MorssException(Exception):
pass pass
def log(txt): def log(txt, force=False):
if 'DEBUG' in os.environ: if DEBUG or force:
if 'REQUEST_URI' in os.environ: if 'REQUEST_URI' in os.environ:
# when running on Apache
open('morss.log', 'a').write("%s\n" % repr(txt)) open('morss.log', 'a').write("%s\n" % repr(txt))
else: else:
# when using internal server or cli print(repr(txt))
print(repr(txt), file=sys.stderr)
def len_html(txt): def len_html(txt):
@@ -87,12 +95,12 @@ class Options:
else: else:
self.options = options or {} self.options = options or {}
def __getattr__(self, key, default=None): def __getattr__(self, key):
if key in self.options: if key in self.options:
return self.options[key] return self.options[key]
else: else:
return default return False
def __setitem__(self, key, value): def __setitem__(self, key, value):
self.options[key] = value self.options[key] = value
@@ -100,10 +108,31 @@ class Options:
def __contains__(self, key): def __contains__(self, key):
return key in self.options return key in self.options
get = __getitem__ = __getattr__
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
if split[1].lower() == 'true':
out[split[0]] = True
elif split[1].lower() == 'false':
out[split[0]] = False
else:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
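For instance, with parseOptions() as defined above:

    print(parseOptions(['proxy', 'items=//article', 'order=newest']))
    # -> {'proxy': True, 'items': '//article', 'order': 'newest'}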
def ItemFix(item, options, feedurl='/'): def ItemFix(item, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """ """ Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title # check unwanted uppercase title
@@ -122,13 +151,6 @@ def ItemFix(item, options, feedurl='/'):
item.link = match[0] item.link = match[0]
log(item.link) log(item.link)
# at user's election, use first <a>
if options.firstlink and (item.desc or item.content):
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
if len(match):
item.link = match[0]
log(item.link)
# check relative urls # check relative urls
item.link = urljoin(feedurl, item.link) item.link = urljoin(feedurl, item.link)
@@ -190,46 +212,59 @@ def ItemFill(item, options, feedurl='/', fast=False):
if not item.link: if not item.link:
log('no link') log('no link')
return True return item
log(item.link) log(item.link)
link = item.link
# twitter
if urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# download # download
delay = -1
if fast or options.cache: if fast:
# force cache, don't fetch # super-fast mode
policy = 'offline' delay = -2
elif options.force:
# force refresh
policy = 'refresh'
else:
policy = None
try: try:
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT) data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e: except (IOError, HTTPException) as e:
log('http error') log('http error')
return False # let's just delete errors stuff when in cache mode return False # let's just delete errors stuff when in cache mode
if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain': if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
log('non-text page') log('non-text page')
return True return True
if not req['data']: out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
log('empty page')
return True
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
if out is not None: if out is not None:
item.content = out item.content = out
if options.resolve:
item.link = req['url']
return True return True
@@ -246,7 +281,7 @@ def ItemBefore(item, options):
def ItemAfter(item, options): def ItemAfter(item, options):
if options.clip and item.desc and item.content: if options.clip and item.desc and item.content:
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc del item.desc
if options.nolink and item.content: if options.nolink and item.content:
@@ -254,7 +289,7 @@ def ItemAfter(item, options):
for link in content.xpath('//a'): for link in content.xpath('//a'):
log(link.text_content()) log(link.text_content())
link.drop_tag() link.drop_tag()
item.content = lxml.etree.tostring(content, method='html') item.content = lxml.etree.tostring(content)
if options.noref: if options.noref:
item.link = '' item.link = ''
@@ -262,61 +297,67 @@ def ItemAfter(item, options):
return item return item
def UrlFix(url):
if url is None:
raise MorssException('No url provided')
if isinstance(url, bytes):
url = url.decode()
if urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
return url
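So UrlFix() normalises scheme-less and space-containing URLs, e.g.:

    print(UrlFix('morss.it/some feed'))  # -> http://morss.it/some%20feed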
def FeedFetch(url, options): def FeedFetch(url, options):
# fetch feed # fetch feed
delay = DELAY delay = DELAY
if options.cache: if options.theforce:
policy = 'offline' delay = 0
elif options.force:
policy = 'refresh'
else:
policy = None
try: try:
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT) xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException): except (IOError, HTTPException):
raise MorssException('Error downloading feed') raise MorssException('Error downloading feed')
if options.items: if options.items:
# using custom rules # using custom rules
ruleset = {} rss = feeds.FeedHTML(xml, encoding=encoding)
ruleset['items'] = options.items rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
if options.mode: rss.rules['items'] = options.items
ruleset['mode'] = options.mode
ruleset['title'] = options.get('title', '//head/title') rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content') rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
ruleset['item_title'] = options.get('item_title', '.')
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
if options.item_content: if options.item_content:
ruleset['item_content'] = options.item_content rss.rules['item_content'] = options.item_content
if options.item_time: if options.item_time:
ruleset['item_time'] = options.item_time rss.rules['item_time'] = options.item_time
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
rss = rss.convert(feeds.FeedXML) rss = rss.convert(feeds.FeedXML)
else: else:
try: try:
rss = feeds.parse(req['data'], url=url, encoding=req['encoding']) rss = feeds.parse(xml, url, contenttype, encoding=encoding)
rss = rss.convert(feeds.FeedXML) rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost # contains all fields, otherwise much-needed data can be lost
except TypeError: except TypeError:
log('random page') log('random page')
log(req['contenttype']) log(contenttype)
raise MorssException('Link provided is not a valid feed') raise MorssException('Link provided is not a valid feed')
return req['url'], rss return rss
def FeedGather(rss, url, options): def FeedGather(rss, url, options):
@@ -332,23 +373,9 @@ def FeedGather(rss, url, options):
if options.cache: if options.cache:
max_time = 0 max_time = 0
# sort now = datetime.now(tz.tzutc())
sorted_items = list(rss.items) sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
if options.order == 'last':
# `first` does nothing from a practical standpoint, so only `last` needs
# to be addressed
sorted_items = reversed(sorted_items)
elif options.order in ['newest', 'oldest']:
now = datetime.now(tz.tzutc())
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
if options.order == 'newest':
sorted_items = reversed(sorted_items)
for i, item in enumerate(sorted_items): for i, item in enumerate(sorted_items):
# hard cap
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0: if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped') log('dropped')
item.remove() item.remove()
@@ -359,9 +386,8 @@ def FeedGather(rss, url, options):
if item is None: if item is None:
continue continue
item = ItemFix(item, options, url) item = ItemFix(item, url)
# soft cap
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0: if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy: if not options.proxy:
if ItemFill(item, options, url, True) is False: if ItemFill(item, options, url, True) is False:
@@ -396,24 +422,24 @@ def FeedFormat(rss, options, encoding='utf-8'):
else: else:
raise MorssException('Invalid callback var name') raise MorssException('Invalid callback var name')
elif options.format == 'json': elif options.json:
if options.indent: if options.indent:
return rss.tojson(encoding=encoding, indent=4) return rss.tojson(encoding=encoding, indent=4)
else: else:
return rss.tojson(encoding=encoding) return rss.tojson(encoding=encoding)
elif options.format == 'csv': elif options.csv:
return rss.tocsv(encoding=encoding) return rss.tocsv(encoding=encoding)
elif options.format == 'html': elif options.html:
if options.indent: if options.indent:
return rss.tohtml(encoding=encoding, pretty_print=True) return rss.tohtml(encoding=encoding, pretty_print=True)
else: else:
return rss.tohtml(encoding=encoding) return rss.tohtml(encoding=encoding)
else: # i.e. format == 'rss' else:
if options.indent: if options.indent:
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True) return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
@@ -428,9 +454,315 @@ def process(url, cache=None, options=None):
options = Options(options) options = Options(options)
if cache: if cache:
caching.default_cache = caching.DiskCacheHandler(cache) crawler.default_cache = crawler.SQLiteCache(cache)
url, rss = FeedFetch(url, options) url = UrlFix(url)
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options) rss = FeedGather(rss, url, options)
return FeedFormat(rss, options, 'unicode') return FeedFormat(rss, options)
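A sketch of the library-style entry point (assuming process() is re-exported at the package root; network access required):

    from morss import process  # assumed re-export; otherwise use morss.morss.process

    out = process('https://www.nytimes.com/', options={'proxy': True})
    print(out[:300])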
def cgi_parse_environ(environ):
# get options
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
raw_options = []
# init
options = Options(filterOptions(parseOptions(raw_options)))
global DEBUG
DEBUG = options.debug
return (url, options)
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.json:
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.csv:
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done
url = UrlFix(url)
rss = FeedFetch(url, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = rss.mimetype[0]
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
else:
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
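The same decorator can wrap any WSGI app; a hypothetical middleware that only adds a response header, chained in the same style as cgi_standalone_app below:

    @middleware
    def add_generator_header(environ, start_response, app):
        def patched_start_response(status, headers, exc_info=None):
            # forward the call, appending one extra header
            return start_response(status, headers + [('X-Generator', 'morss')], exc_info)

        return app(environ, patched_start_response)

    # application = add_generator_header(cgi_file_handler(cgi_app))  # same chaining style as cgi_standalone_app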
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
files = {
'': 'text/html',
'index.html': 'text/html',
'sheet.xsl': 'text/xsl'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if url in files:
headers = {}
if url == '':
url = 'index.html'
paths = [os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)]
for path in paths:
try:
body = open(path, 'rb').read()
headers['status'] = '200 OK'
headers['content-type'] = files[url]
start_response(headers['status'], list(headers.items()))
return [body]
except IOError:
continue
else:
# the for loop did not return, so here we are, i.e. no file found
headers['status'] = '404 Not found'
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
else:
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
PROTOCOL = ['http', 'https']
if urlparse(url).scheme not in ['http', 'https']:
url = 'http://' + url
data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page':
html = readabilite.parse(data, encoding=encoding)
html.make_links_absolute(con.geturl())
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
elif options.get == 'article':
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
else:
raise MorssException('no :get option passed')
else:
output = data
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e), force=True)
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
cgi_standalone_app = cgi_encode(cgi_error_handler(cgi_dispatcher(cgi_file_handler(cgi_app))))
def cli_app():
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
url = sys.argv[-1]
global DEBUG
DEBUG = options.debug
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
url = UrlFix(url)
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options, 'unicode')
if not options.silent:
print(out)
log('done')
def isInt(string):
try:
int(string)
return True
except ValueError:
return False
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
elif len(sys.argv) <= 1 or isInt(sys.argv[1]):
# start internal (basic) http server
if len(sys.argv) > 1 and isInt(sys.argv[1]):
argPort = int(sys.argv[1])
if argPort > 0:
port = argPort
else:
raise MorssException('Port must be positive integer')
else:
port = PORT
app = cgi_app
app = cgi_file_handler(app)
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
print('Serving http://localhost:%s/' % port)
httpd = wsgiref.simple_server.make_server('', port, app)
httpd.serve_forever()
else:
# as a CLI app
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
main()

@@ -1,36 +1,19 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import bs4.builder._lxml
import lxml.etree import lxml.etree
import lxml.html import lxml.html
import lxml.html.soupparser from bs4 import BeautifulSoup
import re
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
def default_parser(self, encoding):
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
def parse(data, encoding=None): def parse(data, encoding=None):
kwargs = {'from_encoding': encoding} if encoding else {} if encoding:
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs) data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
def count_words(string): def count_words(string):
@@ -43,8 +26,6 @@ def count_words(string):
if string is None: if string is None:
return 0 return 0
string = string.strip()
i = 0 i = 0
count = 0 count = 0
@@ -66,6 +47,12 @@ def count_content(node):
return count_words(node.text_content()) + len(node.findall('.//img')) return count_words(node.text_content()) + len(node.findall('.//img'))
def percentile(N, P):
# https://stackoverflow.com/a/7464107
n = max(int(round(P * len(N) + 0.5)), 2)
return N[n-2]
class_bad = ['comment', 'community', 'extra', 'foot', class_bad = ['comment', 'community', 'extra', 'foot',
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead', 'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about', 'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
@@ -83,10 +70,9 @@ class_good = ['and', 'article', 'body', 'column', 'main',
regex_good = re.compile('|'.join(class_good), re.I) regex_good = re.compile('|'.join(class_good), re.I)
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta'] tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet', 'button', 'footer', 'link', 'meta']
'form', 'input', 'textarea', 'button', 'footer']
tags_bad = tags_junk + ['a', 'aside'] tags_bad = tags_junk + ['a', 'aside']
@@ -114,15 +100,12 @@ def score_node(node):
" Score individual node " " Score individual node "
score = 0 score = 0
class_id = (node.get('class') or '') + (node.get('id') or '') class_id = node.get('class', '') + node.get('id', '')
if (isinstance(node, lxml.html.HtmlComment) if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)): or isinstance(node, lxml.html.HtmlProcessingInstruction)):
return 0 return 0
if node.tag in tags_dangerous:
return 0
if node.tag in tags_junk: if node.tag in tags_junk:
score += -1 # actually -2, as tags_junk is included in tags_bad score += -1 # actually -2, as tags_junk is included in tags_bad
@@ -144,7 +127,7 @@ def score_node(node):
if wc != 0: if wc != 0:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')])) wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
score = score * ( 1 - 2 * float(wca)/wc ) score = score * ( 1 - float(wca)/wc )
return score return score
@@ -154,20 +137,15 @@ def score_all(node):
for child in node: for child in node:
score = score_node(child) score = score_node(child)
set_score(child, score, 'morss_own_score') child.attrib['morss_own_score'] = str(float(score))
if score > 0 or len(list(child.iterancestors())) <= 2: if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score) spread_score(child, score)
score_all(child) score_all(child)
def set_score(node, value, label='morss_score'): def set_score(node, value):
try: node.attrib['morss_score'] = str(float(value))
node.attrib[label] = str(float(value))
except KeyError:
# catch issues with e.g. html comments
pass
def get_score(node): def get_score(node):
@@ -207,23 +185,12 @@ def clean_root(root, keep_threshold=None):
def clean_node(node, keep_threshold=None): def clean_node(node, keep_threshold=None):
parent = node.getparent() parent = node.getparent()
# remove comments
if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
parent.remove(node)
return
if parent is None: if parent is None:
# this is <html/> (or a removed element waiting for GC) # this is <html/> (or a removed element waiting for GC)
return return
# remove dangerous tags, no matter what if keep_threshold is not None and get_score(node) >= keep_threshold:
if node.tag in tags_dangerous: # high score, so keep
parent.remove(node)
return
# high score, so keep
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
return return
gdparent = parent.getparent() gdparent = parent.getparent()
@@ -244,6 +211,11 @@ def clean_node(node, keep_threshold=None):
parent.remove(node) parent.remove(node)
return return
# remove comments
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
parent.remove(node)
return
# remove if too many kids & too high link density # remove if too many kids & too high link density
wc = count_words(node.text_content()) wc = count_words(node.text_content())
if wc != 0 and len(list(node.iter())) > 3: if wc != 0 and len(list(node.iter())) > 3:
@@ -301,95 +273,63 @@ def clean_node(node, keep_threshold=None):
gdparent.insert(gdparent.index(parent)+1, new_node) gdparent.insert(gdparent.index(parent)+1, new_node)
def lowest_common_ancestor(node_a, node_b, max_depth=None): def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
ancestors_a = list(node_a.iterancestors()) ancestorsA = list(nodeA.iterancestors())
ancestors_b = list(node_b.iterancestors()) ancestorsB = list(nodeB.iterancestors())
if max_depth is not None: if max_depth is not None:
ancestors_a = ancestors_a[:max_depth] ancestorsA = ancestorsA[:max_depth]
ancestors_b = ancestors_b[:max_depth] ancestorsB = ancestorsB[:max_depth]
ancestors_a.insert(0, node_a) ancestorsA.insert(0, nodeA)
ancestors_b.insert(0, node_b) ancestorsB.insert(0, nodeB)
for ancestor_a in ancestors_a: for ancestorA in ancestorsA:
if ancestor_a in ancestors_b: if ancestorA in ancestorsB:
return ancestor_a return ancestorA
return node_a # should always find one though, at least <html/>, but needed for max_depth return nodeA # should always find one though, at least <html/>, but needed for max_depth
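lowest_common_ancestor() on a throwaway lxml tree:

    import lxml.html

    doc = lxml.html.fromstring('<div><p id="a">one</p><span><p id="b">two</p></span></div>')
    a = doc.get_element_by_id('a')
    b = doc.get_element_by_id('b')
    print(lowest_common_ancestor(a, b).tag)  # -> div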
def get_best_node(html, threshold=5): def rank_grades(grades):
# score all nodes # largest score to smallest
score_all(html) return sorted(grades.items(), key=lambda x: x[1], reverse=True)
# rank all nodes (largest to smallest)
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
# minimum threshold
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
return None
# take common ancestor or the two highest rated nodes
if len(ranked_nodes) > 1:
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
else:
best = ranked_nodes[0]
return best
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None): def get_best_node(ranked_grades):
" To pick the best (raw) node. Another function will clean it "
if len(ranked_grades) == 1:
return ranked_grades[0]
lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
return lowest
def get_article(data, url=None, encoding=None, debug=False):
" Input a raw html string, returns a raw html string of the article " " Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in) html = parse(data, encoding)
score_all(html)
scores = rank_grades(get_all_scores(html))
if xpath is not None: if not len(scores):
xpath_match = html.xpath(xpath)
if len(xpath_match):
best = xpath_match[0]
else:
best = get_best_node(html, threshold)
else:
best = get_best_node(html, threshold)
if best is None:
# if threshold not met
return None return None
# clean up best = get_best_node(scores)
if not debug: if not debug:
keep_threshold = get_score(best) * 3/4 keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold) clean_root(best, keep_threshold)
# check for spammy content (links only)
wc = count_words(best.text_content()) wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3): if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None return None
# fix urls
if url: if url:
best.make_links_absolute(url) best.make_links_absolute(url)
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out) return lxml.etree.tostring(best if not debug else html, pretty_print=True)
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if sys.flags.interactive:
print('>>> Interactive shell: try using `article`')
else:
print(article)
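
A hedged usage sketch of the richer get_article() signature on the left-hand side of the diff; the URL and the xpath below are placeholders, not values taken from the code.

from morss import crawler, readabilite

req = crawler.adv_get('https://example.com/some-article')  # placeholder URL

# let the scorer pick the best node, requiring a minimum score of 5
article = readabilite.get_article(req['data'], url=req['url'],
                                  encoding_in=req['encoding'],
                                  encoding_out='unicode', threshold=5)

# or point at a specific container (hypothetical xpath); scoring is used
# as a fallback when the xpath matches nothing
article = readabilite.get_article(req['data'], url=req['url'],
                                  encoding_in=req['encoding'],
                                  xpath='//div[@id="content"]')
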

View File

@@ -1,57 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import os.path
import sys
def pkg_path(*path_elements):
return os.path.join(os.path.dirname(__file__), *path_elements)
data_path_base = None
def data_path(*path_elements):
global data_path_base
path = os.path.join(*path_elements)
if data_path_base is not None:
return os.path.join(data_path_base, path)
bases = [
os.path.join(sys.prefix, 'share/morss'), # when installed as root
pkg_path('../../../share/morss'),
pkg_path('../../../../share/morss'),
pkg_path('../share/morss'), # for `pip install --target=dir morss`
pkg_path('..'), # when running from source tree
]
if 'DATA_PATH' in os.environ:
bases.append(os.environ['DATA_PATH'])
for base in bases:
full_path = os.path.join(base, path)
if os.path.isfile(full_path):
data_path_base = os.path.abspath(base)
return data_path(path)
else:
raise IOError()
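
A minimal sketch of how the removed helper above is meant to be used, assuming the module lives at morss/util.py (as the `from .util import data_path` import further down suggests) and that the www files are installed:

from morss import util

# tries the bases listed above in order and remembers the first one
# that actually contains the requested file
index_html = util.data_path('www', 'index.html')

# an extra search location can be added via the DATA_PATH environment
# variable; note it is appended to the list, so it is checked last
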

View File

@@ -1,298 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import cgitb
import mimetypes
import os.path
import re
import sys
import wsgiref.handlers
import wsgiref.simple_server
import wsgiref.util
import lxml.etree
try:
# python 2
from urllib import unquote
except ImportError:
# python 3
from urllib.parse import unquote
from . import caching, crawler, readabilite
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
MorssException, Options, log)
from .util import data_path
PORT = int(os.getenv('PORT', 8000))
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
else:
out[split[0]] = True
return out
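
For illustration (the option names are arbitrary, not a list of supported options), the parser above flattens the colon-separated URL options into a plain dict:

>>> parse_options(['proxy', 'format=json', 'search=foo%20bar'])
{'proxy': True, 'format': 'json', 'search': 'foo bar'}
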
def request_uri(environ):
if 'REQUEST_URI' in environ:
# when running on Apache/uwsgi
url = environ['REQUEST_URI']
elif 'RAW_URI' in environ:
# gunicorn
url = environ['RAW_URI']
else:
# when using other servers
url = environ['PATH_INFO']
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
return url
def cgi_parse_environ(environ):
# get options
url = request_uri(environ)[1:]
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
parts = url.split('/', 1)
raw_options = parts[0].split(':')[1:]
url = parts[1] if len(parts) > 1 else ''
else:
raw_options = []
# init
options = Options(parse_options(raw_options))
return (url, options)
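
Putting request_uri() and parse_options() together, a request path such as the following hypothetical one is split into its option prefix and the target feed URL:

environ = {'PATH_INFO': '/:format=json:cors/https://example.com/feed.xml',
           'QUERY_STRING': ''}

url, options = cgi_parse_environ(environ)
# url     -> 'https://example.com/feed.xml'
# options -> Options wrapping {'format': 'json', 'cors': True}
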
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
headers['x-content-type-options'] = 'nosniff' # safari work around
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.format == 'html':
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.format == 'json':
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.format == 'csv':
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
# get the work done
url, rss = FeedFetch(url, options)
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
else:
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
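
The decorator above is what every wrapper below is built on; a minimal hypothetical middleware written with it could look like this (the extra header is purely illustrative):

@middleware
def add_server_header(environ, start_response, app):
    def patched_start_response(status, headers, exc_info=None):
        # inject one extra response header, then defer to the real callable
        return start_response(status, headers + [('X-Powered-By', 'morss')], exc_info)

    return app(environ, patched_start_response)

# wrapped the same way as the real middlewares below:
# application = add_server_header(application)
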
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
url = request_uri(environ)[1:]
if url == '':
url = 'index.html'
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths)
try:
path = data_path('www', url)
f = open(path, 'rb')
except IOError:
# problem with file (cannot open or not found)
pass
else:
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
# regex didn't validate or no file found
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
if options['get'] in ('page', 'article'):
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in crawler.MIMETYPE['html']:
if options['get'] == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
else: # i.e. options['get'] == 'article'
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
output = req['data']
else:
raise MorssException('unsupported mimetype')
else:
raise MorssException('no :get option passed')
# return html page
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
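
As a sketch of how the dispatcher routes requests (the page URL is a placeholder): option names, not values, are matched against dispatch_table, so any request carrying a :get option is answered by cgi_get() instead of the feed pipeline.

environ = {'PATH_INFO': '/:get=article/https://example.com/some-page',
           'QUERY_STRING': ''}

url, options = cgi_parse_environ(environ)

assert 'get' in options                    # -> routed by cgi_dispatcher
assert dispatch_table['get'] is cgi_get    # -> handled by cgi_get
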
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
def get_environ(self):
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
env['REQUEST_URI'] = self.path
return env
def cgi_start_server():
caching.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
caching.default_cache.autotrim()

View File

@@ -1,60 +1,24 @@
from datetime import datetime
from glob import glob
from setuptools import setup from setuptools import setup
from glob import glob
def get_version():
with open('morss/__init__.py', 'r+') as file:
lines = file.readlines()
# look for hard coded version number
for i in range(len(lines)):
if lines[i].startswith('__version__'):
version = lines[i].split('"')[1]
break
# create (& save) one if none found
if version == '':
version = datetime.now().strftime('%Y%m%d.%H%M')
lines[i] = '__version__ = "' + version + '"\n'
file.seek(0)
file.writelines(lines)
# return version number
return version
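
When no hard-coded version is found, the helper above falls back to a date-based string and writes it back into morss/__init__.py; the value below is only an example of the shape:

>>> from datetime import datetime
>>> datetime.now().strftime('%Y%m%d.%H%M')
'20200419.1314'
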
package_name = 'morss' package_name = 'morss'
setup( setup(
name = package_name, name = package_name,
version = get_version(),
description = 'Get full-text RSS feeds', description = 'Get full-text RSS feeds',
long_description = open('README.md').read(), author = 'pictuga, Samuel Marks',
long_description_content_type = 'text/markdown', author_email = 'contact at pictuga dot com',
author = 'pictuga',
author_email = 'contact@pictuga.com',
url = 'http://morss.it/', url = 'http://morss.it/',
project_urls = { download_url = 'https://git.pictuga.com/pictuga/morss',
'Source': 'https://git.pictuga.com/pictuga/morss',
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
},
license = 'AGPL v3', license = 'AGPL v3',
packages = [package_name], packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'], install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
extras_require = {
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
},
python_requires = '>=2.7',
package_data = {package_name: ['feedify.ini']}, package_data = {package_name: ['feedify.ini']},
data_files = [ data_files = [
('share/' + package_name, ['README.md', 'LICENSE']), ('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')), ('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
], ],
entry_points = { entry_points = {
'console_scripts': [package_name + '=' + package_name + '.__main__:main'], 'console_scripts': [package_name + '=' + package_name + ':main']
}, })
scripts = ['morss-helper'],
)

View File

@@ -1,60 +0,0 @@
import os
import os.path
import threading
import pytest
try:
# python2
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from SimpleHTTPServer import SimpleHTTPRequestHandler
except:
# python3
from http.server import (BaseHTTPRequestHandler, HTTPServer,
SimpleHTTPRequestHandler)
class HTTPReplayHandler(SimpleHTTPRequestHandler):
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
directory = os.path.join(os.path.dirname(__file__), './samples/')
__init__ = BaseHTTPRequestHandler.__init__
def do_GET(self):
path = self.translate_path(self.path)
if os.path.isdir(path):
f = self.list_directory(path)
else:
f = open(path, 'rb')
try:
self.copyfile(f, self.wfile)
finally:
f.close()
class MuteHTTPServer(HTTPServer):
def handle_error(self, request, client_address):
# mute errors
pass
def make_server(port=8888):
print('Serving http://localhost:%s/' % port)
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
@pytest.fixture
def replay_server():
httpd = make_server()
thread = threading.Thread(target=httpd.serve_forever)
thread.start()
yield
httpd.shutdown()
thread.join()
if __name__ == '__main__':
httpd = make_server()
httpd.serve_forever()
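
A short sketch of how the fixture above is consumed (it mirrors the crawler tests further down): each file under tests/samples/ is a raw HTTP exchange, status line and headers included, served back verbatim on port 8888.

def test_replay_roundtrip(replay_server):
    from morss.crawler import get

    # tests/samples/200-ok.txt contains:
    #   HTTP/1.1 200 OK
    #   content-type: text/plain
    #
    #   success
    assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
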

View File

@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain
success

View File

@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: /200-ok.txt

View File

@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: ./200-ok.txt

View File

@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: http://localhost:8888/200-ok.txt

View File

@@ -1,4 +0,0 @@
HTTP/1.1 308 Permanent Redirect
location: /200-ok.txt
/200-ok.txt

View File

@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=gb2312
成功

View File

@@ -1,10 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html
<!DOCTYPE html>
<html>
<head><meta charset="gb2312"/></head>
<body>
成功
</body></html>

View File

@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=iso-8859-1
succès

View File

@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain
succès

View File

@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=UTF-8
succès

View File

@@ -1,16 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<summary>!ITEM_DESC!</summary>
<content type="html">!ITEM_CONTENT!</content>
<link href="!ITEM_LINK!"/>
<updated>2022-01-01T00:00:01+01:00</updated>
<published>2022-01-01T00:00:02+01:00</published>
</entry>
</feed>

View File

@@ -1,15 +0,0 @@
HTTP/1.1 200 OK
content-type: application/xml
<?xml version='1.0' encoding='utf-8' ?>
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
<summary>!ITEM_DESC!</summary>
<content>!ITEM_CONTENT!</content>
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
</entry>
</feed>

View File

@@ -1,22 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/html; charset=utf-8
<html>
<head></head>
<body>
<div id="header">
<h1>!TITLE!</h1>
<p>!DESC!</p>
</div>
<div id="content">
<div class="item">
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
<div class="desc">!ITEM_DESC!</div>
<div class="content">!ITEM_CONTENT!</div>
</div>
</div>
</body>
</html>

View File

@@ -1,16 +0,0 @@
HTTP/1.1 200 OK
Content-Type: application/json; charset=utf-8
{
"title": "!TITLE!",
"desc": "!DESC!",
"items": [
{
"title": "!ITEM_TITLE!",
"time": "2022-01-01T00:00:01+0100",
"url": "!ITEM_LINK!",
"desc": "!ITEM_DESC!",
"content": "!ITEM_CONTENT!"
}
]
}

View File

@@ -1,17 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>!TITLE!</title>
<description>!DESC!</description>
<item>
<title>!ITEM_TITLE!</title>
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
<link>!ITEM_LINK!</link>
<description>!ITEM_DESC!</description>
<content:encoded>!ITEM_CONTENT!</content:encoded>
</item>
</channel>
</rss>

Binary file not shown.

View File

@@ -1,3 +0,0 @@
HTTP/1.1 200 OK
refresh: 0;url=/200-ok.txt

View File

@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

File diff suppressed because it is too large

View File

@@ -1,62 +0,0 @@
import pytest
from morss.crawler import *
def test_get(replay_server):
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
def test_adv_get(replay_server):
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
@pytest.mark.parametrize('before,after', [
(b'http://localhost:8888/', 'http://localhost:8888/'),
('localhost:8888/', 'http://localhost:8888/'),
('http:/localhost:8888/', 'http://localhost:8888/'),
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
assert sanitize_url(before) == after
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
def test_size_limit_handler(replay_server, opener):
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
def test_gzip_handler(replay_server, opener):
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
@pytest.mark.parametrize('url', [
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
out = adv_get('http://localhost:8888/%s' % url)
out = out['data'].decode(out['encoding'])
assert 'succes' in out or 'succès' in out or '成功' in out
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
def test_alternate_handler(replay_server, opener):
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
def test_http_equiv_handler(replay_server, opener):
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
def test_http_all_redirect_handler(replay_server, opener):
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
def test_http_refresh_handler(replay_server, opener):
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'

View File

@@ -1,108 +0,0 @@
import pytest
from morss.crawler import adv_get
from morss.feeds import *
def get_feed(url):
url = 'http://localhost:8888/%s' % url
out = adv_get(url)
feed = parse(out['data'], url=url, encoding=out['encoding'])
return feed
def check_feed(feed):
# NB. time and updated not covered
assert feed.title == '!TITLE!'
assert feed.desc == '!DESC!'
assert feed[0] == feed.items[0]
assert feed[0].title == '!ITEM_TITLE!'
assert feed[0].link == '!ITEM_LINK!'
assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
assert '!ITEM_CONTENT!' in feed[0].content
def check_output(feed):
output = feed.tostring()
assert '!TITLE!' in output
assert '!DESC!' in output
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output
def check_change(feed):
feed.title = '!TITLE2!'
feed.desc = '!DESC2!'
feed[0].title = '!ITEM_TITLE2!'
feed[0].link = '!ITEM_LINK2!'
feed[0].desc = '!ITEM_DESC2!'
feed[0].content = '!ITEM_CONTENT2!'
assert feed.title == '!TITLE2!'
assert feed.desc == '!DESC2!'
assert feed[0].title == '!ITEM_TITLE2!'
assert feed[0].link == '!ITEM_LINK2!'
assert '!ITEM_DESC2!' in feed[0].desc
assert '!ITEM_CONTENT2!' in feed[0].content
def check_add(feed):
feed.append({
'title': '!ITEM_TITLE3!',
'link': '!ITEM_LINK3!',
'desc': '!ITEM_DESC3!',
'content': '!ITEM_CONTENT3!',
})
assert feed[1].title == '!ITEM_TITLE3!'
assert feed[1].link == '!ITEM_LINK3!'
assert '!ITEM_DESC3!' in feed[1].desc
assert '!ITEM_CONTENT3!' in feed[1].content
each_format = pytest.mark.parametrize('url', [
'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
])
each_check = pytest.mark.parametrize('check', [
check_feed, check_output, check_change, check_add,
])
@each_format
@each_check
def test_parse(replay_server, url, check):
feed = get_feed(url)
check(feed)
@each_format
@each_check
def test_convert_rss(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedXML)
check(feed)
@each_format
@each_check
def test_convert_json(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedJSON)
check(feed)
@each_format
@each_check
def test_convert_html(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedHTML)
if len(feed) > 1:
# remove the 'blank' default html item
del feed[0]
check(feed)
@each_format
def test_convert_csv(replay_server, url):
# only csv output, not csv feed, therefore the check is different
feed = get_feed(url)
output = feed.tocsv()
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output
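
Condensed, the conversions exercised above boil down to the following sketch (it assumes the replay server from conftest.py is running, as in the tests):

feed = get_feed('feed-rss-channel-utf-8.txt')   # any of the sample formats

as_rss  = feed.convert(FeedXML).tostring()      # serialise as RSS/XML
as_json = feed.convert(FeedJSON).tostring()
as_html = feed.convert(FeedHTML).tostring()
as_csv  = feed.tocsv()                          # CSV is output-only
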

www/.htaccess Normal file
View File

@@ -0,0 +1,9 @@
Options -Indexes
ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>

www/cgi/.htaccess Normal file
View File

@@ -0,0 +1,9 @@
order allow,deny
deny from all
<Files main.py>
allow from all
AddHandler cgi-script .py
Options +ExecCGI
</Files>

View File

@@ -4,7 +4,6 @@
<title>morss</title> <title>morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css"> <style type="text/css">
body body
{ {

View File

@@ -1,17 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="16" height="16" viewBox="0 0 16 16" shape-rendering="crispEdges" fill="black" version="1.1" xmlns="http://www.w3.org/2000/svg">
<rect x="2" y="4" width="2" height="2" />
<rect x="5" y="4" width="6" height="2" />
<rect x="12" y="4" width="2" height="2" />
<rect x="2" y="7" width="2" height="2" />
<rect x="7" y="7" width="2" height="2" />
<rect x="12" y="7" width="2" height="2" />
<rect x="2" y="10" width="2" height="2" />
<rect x="7" y="10" width="2" height="2" />
<rect x="12" y="10" width="2" height="2" />
</svg>
<!-- This work by pictuga is licensed under CC BY-NC-SA 4.0. To view a copy of
this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0 -->


View File

@@ -14,119 +14,21 @@
<html> <html>
<head> <head>
<title>RSS feed by morss</title> <title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0;" /> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta name="robots" content="noindex" /> <meta name="robots" content="noindex" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css"> <style type="text/css">
body * {
box-sizing: border-box;
}
body { body {
overflow-wrap: anywhere; overflow-wrap: anywhere;
word-wrap: anywhere; word-wrap: anywhere;
word-break: break-word; font-family: sans;
font-family: sans-serif;
-webkit-tap-highlight-color: transparent; /* safari work around */
} }
input, select { #url {
font-family: inherit; background-color: rgba(255, 165, 0, 0.25);
font-size: inherit; padding: 1% 5%;
text-align: inherit;
}
header {
text-align: justify;
text-align-last: center;
border-bottom: 1px solid silver;
}
.input-combo {
display: flex;
flex-flow: row;
align-items: stretch;
width: 800px;
max-width: 100%;
margin: auto;
border: 1px solid grey;
padding: .5em .5em;
background-color: #FFFAF4;
}
.input-combo * {
display: inline-block; display: inline-block;
line-height: 2em;
border: 0;
background: transparent;
}
.input-combo > :not(.button) {
max-width: 100%; max-width: 100%;
flex-grow: 1;
flex-shrink: 0;
white-space: nowrap;
text-overflow: ellipsis;
overflow: hidden;
}
.input-combo .button {
flex-grow: 0;
flex-shrink: 1;
cursor: pointer;
min-width: 2em;
text-align: center;
border-left: 1px solid silver;
color: #06f;
}
[onclick_title] {
cursor: pointer;
position: relative;
}
[onclick_title]::before {
opacity: 0;
content: attr(onclick_title);
font-weight: normal;
position: absolute;
left: -300%;
z-index: 1;
background: grey;
color: white;
border-radius: 0.5em;
padding: 0 1em;
}
[onclick_title]:not(:active)::before {
transition: opacity 1s ease-in-out;
}
[onclick_title]:active::before {
opacity: 1;
}
header > form {
margin: 1%;
}
header a {
text-decoration: inherit;
color: #FF7B0A;
font-weight: bold;
} }
.item { .item {
@@ -140,10 +42,6 @@
padding: 1%; padding: 1%;
} }
.item > *:empty {
display: none;
}
.item > :not(:last-child) { .item > :not(:last-child) {
border-bottom: 1px solid silver; border-bottom: 1px solid silver;
} }
@@ -155,73 +53,23 @@
font-size: 1.5em; font-size: 1.5em;
} }
.desc, .content { .content * {
overflow: hidden;
}
.desc *, .content * {
max-width: 100%; max-width: 100%;
} }
</style> </style>
</head> </head>
<body> <body>
<header> <h1>RSS feed by morss</h1>
<h1>RSS feed by morss</h1>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You <p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p> can enter the following url in your newsreader:</p>
<div class="input-combo"> <div id="url"></div>
<input id="url" readonly="readonly"/>
<span class="button" onclick="copy_link()" title="Copy" onclick_title="Copied">
<svg width="16px" height="16px" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" d="M4 1.5H3a2 2 0 00-2 2V14a2 2 0 002 2h10a2 2 0 002-2V3.5a2 2 0 00-2-2h-1v1h1a1 1 0 011 1V14a1 1 0 01-1 1H3a1 1 0 01-1-1V3.5a1 1 0 011-1h1v-1z" clip-rule="evenodd"/>
<path fill-rule="evenodd" d="M9.5 1h-3a.5.5 0 00-.5.5v1a.5.5 0 00.5.5h3a.5.5 0 00.5-.5v-1a.5.5 0 00-.5-.5zm-3-1A1.5 1.5 0 005 1.5v1A1.5 1.5 0 006.5 4h3A1.5 1.5 0 0011 2.5v-1A1.5 1.5 0 009.5 0h-3z" clip-rule="evenodd"/>
</svg>
</span>
</div>
<form onchange="open_feed()"> <hr/>
More options: Output the
<select>
<option value="">full-text</option>
<option value=":proxy">original</option>
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
</select>
feed as
<select>
<option value="">RSS</option>
<option value=":format=json:cors">JSON</option>
<option value=":format=html">HTML</option>
<option value=":format=csv">CSV</option>
</select>
using the
<select>
<option value="">standard</option>
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
</select>
link of the
<select>
<option value="">first</option>
<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
<option value=":order=last">last</option>
<option value=":order=oldest">oldest</option>
</select>
items and
<select>
<option value="">keep</option>
<option value=":nolink:noref">remove</option>
</select>
links
<input type="hidden" value="" name="extra_options"/>
</form>
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p> <div id="header">
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
</header>
<div id="header" dir="auto">
<h1> <h1>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/> <xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
</h1> </h1>
@@ -233,8 +81,8 @@
<div id="content"> <div id="content">
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry"> <xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
<div class="item" dir="auto"> <div class="item">
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute> <a href="/" target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/> <xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
</a> </a>
@@ -250,53 +98,11 @@
</div> </div>
<script> <script>
//<![CDATA[ document.getElementById("url").innerHTML = window.location.href.replace(/:html\/?/, '')
document.getElementById("url").value = window.location.href
if (!/:html/.test(window.location.href)) if (!/:html/.test(window.location.href))
for (var content of document.querySelectorAll(".desc,.content")) for (var content of document.querySelectorAll(".desc,.content"))
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML content.innerHTML = content.children.children ? content.innerHTML : content.innerText
var options = parse_location()[0]
if (options) {
for (var select of document.forms[0].elements)
if (select.tagName == 'SELECT')
for (var option of select)
if (option.value && options.match(option.value)) {
select.value = option.value
options = options.replace(option.value, '')
break
}
document.forms[0]['extra_options'].value = options
}
function copy_content(input) {
input.focus()
input.select()
document.execCommand('copy')
input.blur()
}
function copy_link() {
copy_content(document.getElementById("url"))
}
function parse_location() {
return (window.location.pathname + window.location.search).match(/^\/(?:(:[^\/]+)\/)?(.*$)$/).slice(1)
}
function open_feed() {
var url = parse_location()[1]
var options = Array.from(document.forms[0].elements).map(x=>x.value).join('')
var target = '/' + (options ? options + '/' : '') + url
if (target != window.location.pathname)
window.location.href = target
}
//]]>
</script> </script>
</body> </body>
</html> </html>