Compare commits


1 commit

Author SHA1 Message Date
920988ac74 Dockerfile: pull gunicorn from pip
alpine's package might not be that much up-to-date
2020-10-03 21:48:07 +02:00
49 changed files with 704 additions and 10874 deletions


@@ -1,78 +0,0 @@
name: default

on:
  push:
    branches:
      - master

jobs:
  test-lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
      - name: Install dependencies
        run: pip3 install .[full] .[dev]
      - run: isort --check-only --diff .
      - run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
      - run: pytest --cov=morss tests

  python-publish:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip python3-build
      - name: Build package
        run: python3 -m build
      - name: Publish package
        uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.pypi_api_token }}

  docker-publish-deploy:
    runs-on: ubuntu-latest
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up QEMU
        uses: https://github.com/docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: https://github.com/docker/setup-buildx-action@v2
      - name: Login to Docker Hub
        uses: https://github.com/docker/login-action@v2
        with:
          username: ${{ secrets.docker_user }}
          password: ${{ secrets.docker_pwd }}
      - name: Build and push
        uses: https://github.com/docker/build-push-action@v4
        with:
          context: .
          platforms: linux/amd64,linux/arm64,linux/arm/v7
          push: true
          tags: ${{ secrets.docker_repo }}
      - name: Deploy on server
        uses: https://github.com/appleboy/ssh-action@v0.1.10
        with:
          host: ${{ secrets.ssh_host }}
          username: ${{ secrets.ssh_user }}
          key: ${{ secrets.ssh_key }}
          script: morss-update


@@ -1,50 +0,0 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree
[MESSAGES CONTROL]
disable=missing-function-docstring,
missing-class-docstring,
missing-module-docstring,
wrong-spelling-in-comment,
[REPORTS]
reports=yes
score=yes
[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss
[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes
[VARIABLES]
allow-global-unused-variables=no
init-import=no
[FORMAT]
expected-line-ending-format=LF
indent-string=' '
max-line-length=120
max-module-lines=1000
[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case
include-naming-hint=yes
bad-names=foo, bar
good-names=i, j, k


@@ -1,16 +1,8 @@
-FROM alpine:edge
+FROM alpine:latest
+
+RUN apk add --no-cache python3 py3-lxml py3-pip py3-wheel git

 ADD . /app

-RUN set -ex; \
-    apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
-    apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
-    pip3 install --no-cache-dir /app[full]; \
-    apk del .build-deps
-
-USER 1000:1000
-
-ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
-CMD ["run"]
-
-HEALTHCHECK CMD /bin/sh /app/morss-helper check
+RUN pip3 install /app
+
+CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss

README.md

@@ -1,14 +1,11 @@
 # Morss - Get full-text RSS feeds

-[Homepage](https://morss.it/) •
-[Upstream source code](https://git.pictuga.com/pictuga/morss) •
-[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
+_GNU AGPLv3 code_
+_Provided logo is CC BY-NC-SA 4.0_

-[![Build Status](https://ci.pictuga.com/api/badges/pictuga/morss/status.svg)](https://ci.pictuga.com/pictuga/morss)
-[![Github Stars](https://img.shields.io/github/stars/pictuga/morss?logo=github)](https://github.com/pictuga/morss/stargazers)
-[![Github Forks](https://img.shields.io/github/forks/pictuga/morss?logo=github)](https://github.com/pictuga/morss/network/members)
-[![GNU AGPLv3 code](https://img.shields.io/static/v1?label=license&message=AGPLv3)](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
-[![Logo is CC BY-NC-SA 4.0](https://img.shields.io/static/v1?label=CC&message=BY-NC-SA%204.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)
+Upstream source code: https://git.pictuga.com/pictuga/morss
+Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
+Homepage: https://morss.it/

 This tool's goal is to get full-text RSS feeds out of striped RSS feeds,
 commonly available on internet. Indeed most newspapers only make a small
@@ -41,7 +38,7 @@ Some features of morss:
 - Follow 301/meta redirects
 - Recover xml feeds with corrupt encoding
 - Supports gzip-compressed http content
-- HTTP caching with different backends (in-memory/redis/diskcache)
+- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
 - Works as server/cli tool
 - Deobfuscate various tracking links
@@ -49,79 +46,38 @@ Some features of morss:

 ### Python package

-![Build Python](https://img.shields.io/badge/dynamic/json?label=build%20python&query=$.stages[?(@.name=='python')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
-[![PyPI](https://img.shields.io/pypi/v/morss)](https://pypi.org/project/morss/)
-[![PyPI Downloads](https://img.shields.io/pypi/dm/morss)](https://pypistats.org/packages/morss)
-
-Simple install (without optional dependencies)
-
-From pip
-
-```shell
-pip install morss
-```
-
-From git
-
 ```shell
 pip install git+https://git.pictuga.com/pictuga/morss.git
 ```

-Full installation (including optional dependencies)
-
-From pip
-
-```shell
-pip install morss[full]
-```
-
-From git
-
-```shell
-pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
-```
-
-The full install includes all the cache backends. Otherwise, only in-memory
-cache is available. The full install also includes gunicorn (for more efficient
-HTTP handling).
-
 The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
 C code needs to be compiled). If possible on your distribution, try installing
 it with the system package manager.

+Dependencies:
+
+- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
+- [lxml](http://lxml.de/) for xml parsing
+- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
+- [dateutil](http://labix.org/python-dateutil) to parse feed dates
+- [chardet](https://pypi.python.org/pypi/chardet)
+- [six](https://pypi.python.org/pypi/six), a dependency of chardet
+- pymysql
+
+You may also need:
+
+- Apache, with python-cgi support, to run on a server
+- a fast internet connection
+
 ### Docker

-![Build Docker](https://img.shields.io/badge/dynamic/json?label=build%20docker&query=$.stages[?(@.name=='docker')].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
-[![Docker Hub](https://img.shields.io/docker/pulls/pictuga/morss)](https://hub.docker.com/r/pictuga/morss)
-[![Docker Arch](https://img.shields.io/badge/dynamic/json?color=blue&label=docker%20arch&query=$.results[0].images[*].architecture&url=https://hub.docker.com/v2/repositories/pictuga/morss/tags)](https://hub.docker.com/r/pictuga/morss/tags)
-
-From docker hub
-
-With cli
+Build & run

 ```shell
-docker pull pictuga/morss
+docker build --tag morss https://git.pictuga.com/pictuga/morss.git
+docker run -p 8080:8080 morss
 ```

-With docker-compose **(recommended)**
-
-```yml
-services:
-    app:
-        image: pictuga/morss
-        ports:
-            - '8000:8000'
-```
-
-Build from source
-
-With cli
-
-```shell
-docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
-```
-
-With docker-compose
+With docker-compose:

 ```yml
 services:
@@ -129,54 +85,21 @@ services:
         build: https://git.pictuga.com/pictuga/morss.git
         image: morss
         ports:
-            - '8000:8000'
+            - '8080:8080'
 ```

 Then execute

 ```shell
-docker-compose build --no-cache --pull
+docker-compose build
 docker-compose up
 ```

-### Cloud providers
-
-One-click deployment:
-
-[![Heroku](https://img.shields.io/static/v1?label=deploy%20to&message=heroku&logo=heroku&color=79589F)](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
-[![Google Cloud](https://img.shields.io/static/v1?label=deploy%20to&message=google&logo=google&color=4285F4)](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
-
-Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
-
-``` yml
-#cloud-config
-
-packages:
-  - python3-pip
-  - python3-wheel
-  - python3-lxml
-  - python3-setproctitle
-  - ca-certificates
-
-write_files:
-- path: /etc/environment
-  append: true
-  content: |
-    DEBUG=1
-    CACHE=diskcache
-    CACHE_SIZE=1073741824 # 1GiB
-- path: /var/lib/cloud/scripts/per-boot/morss.sh
-  permissions: 744
-  content: |
-    #!/bin/sh
-    /usr/local/bin/morss-helper daemon
-
-runcmd:
-  - source /etc/environment
-  - update-ca-certificates
-  - iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
-  - netfilter-persistent save
-  - pip install morss[full]
-```
+To update:
+
+- To get the latest code from the git repository, add `--no-cache` to the build
+commands
+- To update the base image (`alpine:latest`), add `--pull` to the build commands
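As an illustration of the two update tips above, a full refresh of a docker-compose deployment would look roughly like the following sketch (the `-d` detached-mode flag is an assumption of the example, not something the README prescribes):

```shell
# rebuild, ignoring cached layers (--no-cache) and re-pulling the alpine:latest base (--pull)
docker-compose build --no-cache --pull

# recreate the container with the freshly built image
docker-compose up -d
```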
 ## Run
@@ -197,29 +120,14 @@ For example: `http://morss.example/:clip/https://twitter.com/pictuga`
 The `main.py` part is only needed if your server doesn't support the Apache
 redirect rule set in the provided `.htaccess`.

-Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
-other clients.
+Works like a charm with [Tiny Tiny
+RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other
+clients.

-#### Using Docker
+#### Via Docker

-From docker hub
-
-```shell
-docker run -p 8000:8000 pictuga/morss
-```
-
-From source
-
-```shell
-docker run -p 8000:8000 morss
-```
-
-With docker-compose **(recommended)**
-
-```shell
-docker-compose up
-```
+See above (in Install)

 #### Using Gunicorn
@@ -232,13 +140,13 @@ gunicorn --preload morss
 Running this command should do:

 ```shell
-uwsgi --http :8000 --plugin python --wsgi-file main.py
+uwsgi --http :8080 --plugin python --wsgi-file main.py
 ```

 #### Using morss' internal HTTP server

 Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
-The latter should start when you run morss without any argument, on port 8000.
+The latter should start when you run morss without any argument, on port 8080.
 I'd highly recommend you to use gunicorn or something similar for better
 performance.
@@ -276,30 +184,8 @@ For this, you need to make sure your host allows python script execution. This
 method uses HTTP calls to fetch the RSS feeds, which will be handled through
 `mod_cgi` for example on Apache severs.

-Please pay attention to `main.py` permissions for it to be executable. See below
-some tips for the `.htaccess` file.
-
-```htaccess
-Options -Indexes
-
-ErrorDocument 404 /cgi/main.py
-
-# Turn debug on for all requests
-SetEnv DEBUG 1
-
-# Turn debug on for requests with :debug in the url
-SetEnvIf Request_URI :debug DEBUG=1
-
-<Files ~ "\.(py|pyc|db|log)$">
-    deny from all
-</Files>
-
-<Files main.py>
-    allow from all
-    AddHandler cgi-script .py
-    Options +ExecCGI
-</Files>
-```
+Please pay attention to `main.py` permissions for it to be executable. Also
+ensure that the provided `/www/.htaccess` works well with your server.

 ### As a CLI application
@@ -313,12 +199,6 @@ For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
 *(Brackets indicate optional text)*

-If using Docker:
-
-```shell
-docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
-```
-
 ### As a newsreader hook

 To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
@@ -330,7 +210,7 @@ To use this script, you have to enable "(Unix) command" in liferea feed
 settings, and use the command:

 ```
-morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
+morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
 ```

 For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
@@ -353,7 +233,7 @@ Using cache and passing arguments:
 ```python
 >>> import morss
 >>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
->>> cache = '/tmp/morss-cache' # diskcache cache location
+>>> cache = '/tmp/morss-cache.db' # sqlite cache location
 >>> options = {'csv':True}
 >>> xml_string = morss.process(url, cache, options)
 >>> xml_string[:50]
@@ -367,10 +247,11 @@ under the hood.
 Doing it step-by-step:

 ```python
-import morss
+import morss, morss.crawler

 url = 'http://newspaper.example/feed.xml'
 options = morss.Options(csv=True) # arguments
+morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location

 url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
 rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
@@ -389,13 +270,11 @@ arguments to morss is explained in Run above.
 The list of arguments can be obtained by running `morss --help`

 ```
-usage: morss [-h] [--post STRING] [--xpath XPATH]
-             [--format {rss,json,html,csv}] [--search STRING] [--clip]
-             [--indent] [--cache] [--force] [--proxy]
-             [--order {first,last,newest,oldest}] [--firstlink] [--resolve]
-             [--items XPATH] [--item_link XPATH] [--item_title XPATH]
-             [--item_content XPATH] [--item_time XPATH]
-             [--mode {xml,html,json}] [--nolink] [--noref] [--silent]
+usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip]
+             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
+             [--resolve] [--items XPATH] [--item_link XPATH]
+             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
+             [--nolink] [--noref] [--silent]
              url

 Get full-text RSS feeds
@@ -403,10 +282,8 @@ Get full-text RSS feeds
 positional arguments:
   url                   feed url

-options:
+optional arguments:
   -h, --help            show this help message and exit
-  --post STRING         POST request
-  --xpath XPATH         xpath rule to manually detect the article

 output:
   --format {rss,json,html,csv}
@@ -422,9 +299,8 @@ action:
                         articles' content), so as to save time
   --force               force refetch the rss feed and articles
   --proxy               doesn't fill the articles
-  --order {first,last,newest,oldest}
-                        order in which to process items (which are however NOT
-                        sorted in the output)
+  --newest              return the feed items in chronological order (morss
+                        ohterwise shows the items by appearing order)
   --firstlink           pull the first article mentioned in the description
                         instead of the default link
   --resolve             replace tracking links with direct links to articles
@@ -439,8 +315,6 @@ custom feeds:
   --item_content XPATH  entry's content
   --item_time XPATH     entry's date & time (accepts a wide range of time
                         formats)
-  --mode {xml,html,json}
-                        parser to use for the custom feeds

 misc:
   --nolink              drop links, but keeps links' inner text
@@ -462,39 +336,31 @@ servers)
 To pass environment variables:

-- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
+- Docker-cli: `docker run -p 8080:8080 morss --env KEY=value`
 - docker-compose: add an `environment:` section in the .yml file
 - Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
 - Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
-- cloud-init: in the `/etc/environment` file

 Generic:

 - `DEBUG=1`: to have some feedback from the script execution. Useful for
 debugging.
 - `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
-- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
-- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
-- `DATA_PATH`: to set custom file location for the `www` folder
+- `DELAY` sets the browser cache delay, only for HTTP clients
+- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles

 When parsing long feeds, with a lot of items (100+), morss might take a lot of
 time to parse it, or might even run into a memory overflow on some shared
 hosting plans (limits around 10Mb), in which case you might want to adjust the
 below settings via environment variables.

-Also, if the request takes too long to process, the http request might be
-discarded. See relevant config for
-[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
-[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
-
-- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
-articles, more time might be spent taking older articles from cache. `-1` for
-unlimited.
+- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more
+time might be spent taking older articles from cache. `-1` for unlimited.
 - `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
 More articles will be taken from cache following the nexts settings.
-- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
-(whether or not it's already cached). Articles beyond that limit will be dropped
-from the feed. `-1` for unlimited.
+- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether
+or not it's already cached). Articles beyond that limit will be dropped from the
+feed. `-1` for unlimited.
 - `LIM_ITEM` sets the maximum number of article checked, limiting both the
 number of articles fetched and taken from cache. Articles beyond that limit will
 be dropped from the feed, even if they're cached. `-1` for unlimited.
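To make the limits above concrete, the variables are simply prepended to the serving command. A sketch with arbitrary example values (the gunicorn invocation and port follow the rest of this README, the numbers are illustrative only):

```shell
# sketch: verbose output, at most 30 items fetched, 5 seconds of fetching, no per-feed item cap
DEBUG=1 MAX_ITEM=30 MAX_TIME=5 LIM_ITEM=-1 gunicorn --bind 0.0.0.0:8080 --preload morss
```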
@@ -502,21 +368,19 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
 morss uses caching to make loading faster. There are 3 possible cache backends:

 - `(nothing/default)`: a simple python in-memory dict-like object.
-- `CACHE=redis`: Redis cache. Connection can be defined with the following
-environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
-- `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
-`DISKCACHE_DIR`.
+- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
+will be cleared every time the program is run). Path can be defined with
+`SQLITE_PATH`.
+- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
+environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`

 To limit the size of the cache:

 - `CACHE_SIZE` sets the target number of items in the cache (further items will
 be deleted but the cache might be temporarily bigger than that). Defaults to 1k
-entries. NB. When using `diskcache`, this is the cache max size in Bytes.
-- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
-down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
-
-Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
-environment variable.
+entries.
+- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
+the number of items set in `CACHE_SIZE`). Defaults to 1min.
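A concrete sketch of the sqlite backend described on the `+` side of this hunk (the file path and trim values are arbitrary examples, not defaults):

```shell
# sketch: persistent sqlite cache, trimmed to ~2000 entries roughly every 5 minutes
CACHE=sqlite SQLITE_PATH=/tmp/morss-cache.db CACHE_SIZE=2000 CACHE_LIFESPAN=300 gunicorn --bind 0.0.0.0:8080 --preload morss
```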
 ### Content matching


@@ -1,21 +0,0 @@
{
    "stack": "container",
    "env": {
        "DEBUG": {
            "value": 1,
            "required": false
        },
        "GUNICORN_CMD_ARGS": {
            "value": "",
            "required": false
        },
        "CACHE": {
            "value": "diskcache",
            "required": false
        },
        "CACHE_SIZE": {
            "value": 1073741824,
            "required": false
        }
    }
}


@@ -1,3 +0,0 @@
build:
  docker:
    web: Dockerfile

main.py Executable file → Normal file


@@ -1,47 +0,0 @@
#! /bin/sh

set -ex

if ! command -v python && command -v python3 ; then
    alias python='python3'
fi

run() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
}

daemon() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
}

reload() {
    pid=$(pidof 'gunicorn: master [morss]' || true)
    # NB. requires python-setproctitle
    # `|| true` due to `set -e`

    if [ -z "$pid" ]; then
        # if gunicorn is not currently running
        daemon

    else
        kill -s USR2 $pid
        kill -s WINCH $pid
        sleep 1 # give gunicorn some time to reload
        kill -s TERM $pid
    fi
}

check() {
    python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
}

if [ -z "$1" ]; then
    run

elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
    $@

else
    python -m morss $@
fi
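The helper's entry point dispatches on its first argument (falling back to `python -m morss` for anything that is not a known command), so typical invocations would be along these lines. This is a sketch; it assumes the script is installed on PATH, e.g. as /usr/local/bin/morss-helper as the systemd unit below expects:

```shell
morss-helper                   # no argument: serve in the foreground via gunicorn
PORT=8080 morss-helper daemon  # run gunicorn in the background on a chosen port
morss-helper reload            # graceful reload of the running gunicorn master
morss-helper check             # exit status reports whether the local server answers
```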


@@ -1,13 +0,0 @@
[Unit]
Description=morss server (gunicorn)
After=network.target
[Service]
ExecStart=/usr/local/bin/morss-helper run
ExecReload=/usr/local/bin/morss-helper reload
KillMode=process
Restart=always
User=http
[Install]
WantedBy=multi-user.target
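For context, a unit like the one above would typically be installed roughly as follows (a sketch; the destination path and unit name are assumptions, not taken from this diff):

```shell
install -m 644 morss.service /etc/systemd/system/morss.service
systemctl daemon-reload
systemctl enable --now morss.service
```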


@@ -16,10 +16,5 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.

 # ran on `import morss`
-# pylint: disable=unused-import,unused-variable
-
-__version__ = ""
-
 from .morss import *
 from .wsgi import application


@@ -20,7 +20,9 @@
 import os
 import sys

-from . import cli, wsgi
+from . import wsgi
+from . import cli
+
 from .morss import MorssException


@@ -1,122 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import threading
import time
from collections import OrderedDict

CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)


class BaseCache:
    """ Subclasses must behave like a dict """

    def trim(self):
        pass

    def autotrim(self, delay=CACHE_LIFESPAN):
        # trim the cache every so often
        self.trim()

        t = threading.Timer(delay, self.autotrim)
        t.daemon = True
        t.start()

    def __contains__(self, url):
        try:
            self[url]

        except KeyError:
            return False

        else:
            return True


class CappedDict(OrderedDict, BaseCache):
    def trim(self):
        if CACHE_SIZE >= 0:
            for i in range( max( len(self) - CACHE_SIZE , 0 )):
                self.popitem(False)

    def __setitem__(self, key, data):
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        if key in self:
            del self[key]

        OrderedDict.__setitem__(self, key, data)


try:
    import redis # isort:skip
except ImportError:
    pass


class RedisCacheHandler(BaseCache):
    def __init__(self, host='localhost', port=6379, db=0, password=None):
        self.r = redis.Redis(host=host, port=port, db=db, password=password)

    def __getitem__(self, key):
        return self.r.get(key)

    def __setitem__(self, key, data):
        self.r.set(key, data)


try:
    import diskcache # isort:skip
except ImportError:
    pass


class DiskCacheHandler(BaseCache):
    def __init__(self, directory=None, **kwargs):
        self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)

    def __del__(self):
        self.cache.close()

    def trim(self):
        self.cache.cull()

    def __getitem__(self, key):
        return self.cache[key]

    def __setitem__(self, key, data):
        self.cache.set(key, data)


if 'CACHE' in os.environ:
    if os.environ['CACHE'] == 'redis':
        default_cache = RedisCacheHandler(
            host = os.getenv('REDIS_HOST', 'localhost'),
            port = int(os.getenv('REDIS_PORT', 6379)),
            db = int(os.getenv('REDIS_DB', 0)),
            password = os.getenv('REDIS_PWD', None)
        )

    elif os.environ['CACHE'] == 'diskcache':
        default_cache = DiskCacheHandler(
            directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
            size_limit = CACHE_SIZE # in Bytes
        )

else:
    default_cache = CappedDict()


@@ -15,11 +15,12 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.

-import argparse
-import os.path
 import sys
+import os.path
+import argparse

-from .morss import FeedFetch, FeedFormat, FeedGather, Options
+from .morss import FeedFetch, FeedGather, FeedFormat
+from .morss import Options


 def cli_app():
@@ -31,9 +32,6 @@ def cli_app():
     parser.add_argument('url', help='feed url')

-    parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
-    parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
-
     group = parser.add_argument_group('output')
     group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
     group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
@@ -44,7 +42,7 @@ def cli_app():
     group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
     group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
     group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
-    group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
+    group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss ohterwise shows the items by appearing order)')
     group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
     group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
@@ -54,7 +52,6 @@ def cli_app():
     group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
     group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
     group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
-    group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')

     group = parser.add_argument_group('misc')
     group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')


@@ -16,37 +16,31 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.

 import os
-import pickle
-import random
-import re
 import sys
-import time
+
 import zlib
-from cgi import parse_header
-from collections import OrderedDict
 from io import BytesIO, StringIO
-
+import re
 import chardet
-
-from .caching import default_cache
+from cgi import parse_header
+import lxml.html
+import time
+import threading
+import random
+from collections import OrderedDict

 try:
     # python 2
+    from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
     from urllib import quote
-
-    from httplib import HTTPMessage
-    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
-                         Request, addinfourl, build_opener, parse_http_list,
-                         parse_keqv_list)
-    from urlparse import urlsplit
+    from urlparse import urlparse, urlunparse
+    import mimetools

 except ImportError:
     # python 3
-    from email import message_from_string
-    from http.client import HTTPMessage
-    from urllib.parse import quote, urlsplit
-    from urllib.request import (BaseHandler, HTTPCookieProcessor,
-                                HTTPRedirectHandler, Request, addinfourl,
-                                build_opener, parse_http_list, parse_keqv_list)
+    from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
+    from urllib.parse import quote
+    from urllib.parse import urlparse, urlunparse
+    import email

 try:
     # python 2
@@ -56,12 +50,14 @@ except NameError:
     basestring = unicode = str


+CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
+CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
+
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
     'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
-    'json': ['application/json'],
-    }
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


 DEFAULT_UAS = [
@@ -86,17 +82,14 @@ def get(*args, **kwargs):
     return adv_get(*args, **kwargs)['data']


-def adv_get(url, post=None, timeout=None, *args, **kwargs):
+def adv_get(url, timeout=None, *args, **kwargs):
     url = sanitize_url(url)

-    if post is not None:
-        post = post.encode('utf-8')
-
     if timeout is None:
-        con = custom_opener(*args, **kwargs).open(url, data=post)
+        con = custom_handler(*args, **kwargs).open(url)

     else:
-        con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)

     data = con.read()
@@ -104,7 +97,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
     encoding= detect_encoding(data, con)

     return {
-        'data': data,
+        'data':data,
         'url': con.geturl(),
         'con': con,
         'contenttype': contenttype,
@@ -112,7 +105,9 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
     }


-def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
+def custom_handler(follow=None, delay=None, encoding=None):
+    handlers = []
+
     # as per urllib2 source code, these Handelers are added first
     # *unless* one of the custom handlers inherits from one of them
     #
@@ -120,33 +115,21 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     #   HTTPDefaultErrorHandler, HTTPRedirectHandler,
     #   FTPHandler, FileHandler, HTTPErrorProcessor]
     # & HTTPSHandler
-    #
-    # when processing a request:
-    #  (1) all the *_request are run
-    #  (2) the *_open are run until sth is returned (other than None)
-    #  (3) all the *_response are run
-    #
-    # During (3), if an http error occurs (i.e. not a 2XX response code), the
-    # http_error_* are run until sth is returned (other than None). If they all
-    # return nothing, a python error is raised

-    handlers = [
-        #DebugHandler(),
-        SizeLimitHandler(500*1024), # 500KiB
-        HTTPCookieProcessor(),
-        GZIPHandler(),
-        HTTPAllRedirectHandler(),
-        HTTPEquivHandler(),
-        HTTPRefreshHandler(),
-        UAHandler(random.choice(DEFAULT_UAS)),
-        BrowserlyHeaderHandler(),
-        EncodingFixHandler(),
-    ]
+    #handlers.append(DebugHandler())
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
+    handlers.append(HTTPCookieProcessor())
+    handlers.append(GZIPHandler())
+    handlers.append(HTTPEquivHandler())
+    handlers.append(HTTPRefreshHandler())
+    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
+    handlers.append(BrowserlyHeaderHandler())
+    handlers.append(EncodingFixHandler(encoding))

     if follow:
         handlers.append(AlternateHandler(MIMETYPE[follow]))

-    handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
+    handlers.append(CacheHandler(force_min=delay))

     return build_opener(*handlers)
@@ -163,20 +146,10 @@ def is_ascii(string):
     return True


-def soft_quote(string):
-    " url-quote only when not a valid ascii string "
-    if is_ascii(string):
-        return string
-
-    else:
-        return quote(string.encode('utf-8'))
-
-
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
-        url = url.decode('utf-8')
+        url = url.decode()

     # make sure there's a protocol (http://)
     if url.split(':', 1)[0] not in PROTOCOL:
@@ -189,64 +162,18 @@ def sanitize_url(url):
     url = url.replace(' ', '%20')

     # escape non-ascii unicode characters
-    parts = urlsplit(url)
+    # https://stackoverflow.com/a/4391299
+    parts = list(urlparse(url))

-    parts = parts._replace(
-        netloc=parts.netloc.replace(
-            parts.hostname,
-            parts.hostname.encode('idna').decode('ascii')
-        ),
-        path=soft_quote(parts.path),
-        query=soft_quote(parts.query),
-        fragment=soft_quote(parts.fragment),
-    )
+    for i in range(len(parts)):
+        if not is_ascii(parts[i]):
+            if i == 1:
+                parts[i] = parts[i].encode('idna').decode('ascii')

-    return parts.geturl()
+            else:
+                parts[i] = quote(parts[i].encode('utf-8'))

-
-class RespDataHandler(BaseHandler):
-    " Make it easier to use the reponse body "
-
-    def data_reponse(self, req, resp, data):
-        pass
-
-    def http_response(self, req, resp):
-        # read data
-        data = resp.read()
-
-        # process data and use returned content (if any)
-        data = self.data_response(req, resp, data) or data
-
-        # reformat the stuff
-        fp = BytesIO(data)
-        old_resp = resp
-        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-        resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
-
-
-class RespStrHandler(RespDataHandler):
-    " Make it easier to use the _decoded_ reponse body "
-
-    def str_reponse(self, req, resp, data_str):
-        pass
-
-    def data_response(self, req, resp, data):
-        #decode
-        enc = detect_encoding(data, resp)
-        data_str = data.decode(enc, 'replace')
-
-        #process
-        data_str = self.str_response(req, resp, data_str)
-
-        # return
-        data = data_str.encode(enc) if data_str is not None else data
-
-        #return
-        return data
+    return urlunparse(parts)


 class DebugHandler(BaseHandler):
@@ -269,7 +196,7 @@ class SizeLimitHandler(BaseHandler):
     handler_order = 450

-    def __init__(self, limit=5*1024**2):
+    def __init__(self, limit=5*1024^2):
         self.limit = limit

     def http_response(self, req, resp):
@@ -290,17 +217,29 @@ def UnGzip(data):
     return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)


-class GZIPHandler(RespDataHandler):
+class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
         return req

-    def data_response(self, req, resp, data):
+    def http_response(self, req, resp):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
+                data = resp.read()
+
+                data = UnGzip(data)
+
                 resp.headers['Content-Encoding'] = 'identity'
-                return UnGzip(data)
+
+                fp = BytesIO(data)
+                old_resp = resp
+                resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+                resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
     https_request = http_request
def detect_encoding(data, resp=None): def detect_encoding(data, resp=None):
@@ -337,9 +276,28 @@ def detect_raw_encoding(data, resp=None):
return 'utf-8' return 'utf-8'
class EncodingFixHandler(RespStrHandler): class EncodingFixHandler(BaseHandler):
def str_response(self, req, resp, data_str): def __init__(self, encoding=None):
return data_str self.encoding = encoding
def http_response(self, req, resp):
maintype = resp.info().get('Content-Type', '').split('/')[0]
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
enc = self.encoding or detect_encoding(data, resp)
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class UAHandler(BaseHandler): class UAHandler(BaseHandler):
@@ -365,58 +323,71 @@ class BrowserlyHeaderHandler(BaseHandler):
     https_request = http_request


-def iter_html_tag(html_str, tag_name):
-    " To avoid parsing whole pages when looking for a simple tag "
-    re_tag = r'<%s\s+[^>]+>' % tag_name
-    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
-
-    for tag_match in re.finditer(re_tag, html_str):
-        attr_match = re.findall(re_attr, tag_match.group(0))
-
-        if attr_match is not None:
-            yield dict(attr_match)
-
-
-class AlternateHandler(RespStrHandler):
+class AlternateHandler(BaseHandler):
     " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

     def __init__(self, follow=None):
         self.follow = follow or []

-    def str_response(self, req, resp, data_str):
+    def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
         if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
+            data = resp.read()

-            for link in iter_html_tag(data_str[:10000], 'link'):
-                if (link.get('rel') == 'alternate'
-                        and link.get('type') in self.follow
-                        and 'href' in link):
-                    resp.code = 302
-                    resp.msg = 'Moved Temporarily'
-                    resp.headers['location'] = link.get('href')
-                    break
+            try:
+                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+                for link in links:
+                    if link.get('type', '') in self.follow:
+                        resp.code = 302
+                        resp.msg = 'Moved Temporarily'
+                        resp.headers['location'] = link.get('href')
+                        break
+
+            except (ValueError, SyntaxError):
+                # catch parsing errors
+                pass
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response


-class HTTPEquivHandler(RespStrHandler):
+class HTTPEquivHandler(BaseHandler):
     " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

     handler_order = 600

-    def str_response(self, req, resp, data_str):
+    def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
         if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+            data = resp.read()

-            for meta in iter_html_tag(data_str[:10000], 'meta'):
-                if 'http-equiv' in meta and 'content' in meta:
-                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
+            try:
+                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')

+                for header in headers:
+                    resp.headers[header.get('http-equiv').lower()] = header.get('content')

-class HTTPAllRedirectHandler(HTTPRedirectHandler):
-    def http_error_308(self, req, fp, code, msg, headers):
-        return self.http_error_301(req, fp, 301, msg, headers)
+            except (ValueError, SyntaxError):
+                # catch parsing errors
+                pass
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response


 class HTTPRefreshHandler(BaseHandler):
@@ -425,7 +396,7 @@ class HTTPRefreshHandler(BaseHandler):
     def http_response(self, req, resp):
         if 200 <= resp.code < 300:
             if resp.headers.get('refresh'):
-                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
+                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'

                 match = re.search(regex, resp.headers.get('refresh'))
                 if match:
@@ -441,124 +412,56 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response https_response = http_response
def parse_headers(text=u'\n\n'):
if sys.version_info[0] >= 3:
# python 3
return message_from_string(text, _class=HTTPMessage)
else:
# python 2
return HTTPMessage(StringIO(text))
def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg
return resp
class CacheHandler(BaseHandler): class CacheHandler(BaseHandler):
" Cache based on etags/last-modified " " Cache based on etags/last-modified "
privacy = 'private' # Websites can indicate whether the page should be cached private_cache = False # Websites can indicate whether the page should be
# by CDNs (e.g. shouldn't be the case for # cached by CDNs (e.g. shouldn't be the case for
# private/confidential/user-specific pages. With this # private/confidential/user-specific pages.
# setting, decide whether you want the cache to behave # With this setting, decide whether (False) you want
# like a CDN (i.e. don't cache private pages, 'public'), # the cache to behave like a CDN (i.e. don't cache
# or to behave like a end-user private pages # private pages), or (True) to behave like a end-cache
# ('private'). If unsure, 'public' is the safest bet, # private pages. If unsure, False is the safest bet.
# but many websites abuse this feature...
# NB. This overrides all the other min/max/policy settings.
handler_order = 499 handler_order = 499
def __init__(self, cache=None, force_min=None, force_max=None, policy=None): def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache self.cache = cache or default_cache
self.force_min = force_min self.force_min = force_min
self.force_max = force_max # Servers indicate how long they think their content is "valid".
self.policy = policy # can be cached/refresh/offline/None (default) # With this parameter (force_min, expressed in seconds), we can
# override the validity period (i.e. bypassing http headers)
# Servers indicate how long they think their content is "valid". With # Special values:
# this parameter (force_min/max, expressed in seconds), we can override # -1: valid forever, i.e. use the cache no matter what (and fetch
# the validity period (i.e. bypassing http headers) # the page online if not present in cache)
# Special choices, via "policy": # 0: valid zero second, i.e. force refresh
# cached: use the cache no matter what (and fetch the page online if # -2: same as -1, i.e. use the cache no matter what, but do NOT
# not present in cache) # fetch the page online if not present in cache, throw an
# refresh: valid zero second, i.e. force refresh
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead # error instead
# None: just follow protocols
# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
def load(self, url): def load(self, url):
try: try:
data = pickle.loads(self.cache[url]) out = list(self.cache[url])
except KeyError: except KeyError:
data = None out = [None, None, unicode(), bytes(), 0]
if sys.version_info[0] >= 3:
out[2] = email.message_from_string(out[2] or unicode()) # headers
else: else:
data['headers'] = parse_headers(data['headers'] or unicode()) out[2] = mimetools.Message(StringIO(out[2] or unicode()))
return data return out
def save(self, key, data): def save(self, url, code, msg, headers, data, timestamp):
data['headers'] = unicode(data['headers']) self.cache[url] = (code, msg, unicode(headers), data, timestamp)
self.cache[key] = pickle.dumps(data, 0)
def cached_response(self, req, fallback=None):
req.from_morss_cache = True
data = self.load(req.get_full_url())
if data is not None:
# return the cache as a response
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
resp.msg = data['msg']
return resp
else:
return fallback
def save_response(self, req, resp):
if req.from_morss_cache:
# do not re-save (would reset the timing)
return resp
data = resp.read()
self.save(req.get_full_url(), {
'code': resp.code,
'msg': resp.msg,
'headers': resp.headers,
'data': data,
'timestamp': time.time()
})
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
def http_request(self, req): def http_request(self, req):
req.from_morss_cache = False # to track whether it comes from cache (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
data = self.load(req.get_full_url()) if 'etag' in headers:
req.add_unredirected_header('If-None-Match', headers['etag'])
if data is not None: if 'last-modified' in headers:
if 'etag' in data['headers']: req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
if 'last-modified' in data['headers']:
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
return req return req
@@ -567,111 +470,275 @@ class CacheHandler(BaseHandler):
# If 'None' is returned, try your chance with the next-available handler # If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response' # If a 'resp' is returned, stop there, and proceed with 'http_response'
# Here, we try to see whether we want to use data from cache (i.e. (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# return 'resp'), or whether we want to refresh the content (return
# 'None')
data = self.load(req.get_full_url())
if data is not None:
# some info needed to process everything # some info needed to process everything
cache_control = parse_http_list(data['headers'].get('cache-control', ())) cache_control = parse_http_list(headers.get('cache-control', ()))
cache_control += parse_http_list(data['headers'].get('pragma', ())) cache_control += parse_http_list(headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x] cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x]) cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - data['timestamp'] cache_age = time.time() - timestamp
# list in a simple way what to do in special cases # list in a simple way what to do when
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
# we're just in the middle of a dirty trick, use cache
pass
if data is not None and 'private' in cc_list and self.privacy == 'public': elif self.force_min == -2:
# private data but public cache, do not use cache if code is not None:
# privacy concern, so handled first and foremost # already in cache, perfect, use cache
# (and doesn't need to be addressed anymore afterwards) pass
else:
# raise an error, via urllib handlers
headers['Morss'] = 'from_cache'
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
return resp
elif code is None:
# cache empty, refresh
return None return None
elif self.policy == 'offline': elif self.force_min == -1:
# use cache, or return an error # force use cache
return self.cached_response( pass
req,
error_response(409, 'Conflict', req.get_full_url())
)
elif self.policy == 'cached': elif self.force_min == 0:
# use cache, or fetch online
return self.cached_response(req, None)
elif self.policy == 'refresh':
# force refresh # force refresh
return None return None
elif data is None: elif code == 301 and cache_age < 7*24*3600:
# we have already settled all the cases that don't need the cache.
# all the following ones need the cached item
return None
elif self.force_max is not None and cache_age > self.force_max:
# older than we want, refresh
return None
elif self.force_min is not None and cache_age < self.force_min:
# recent enough, use cache
return self.cached_response(req)
elif data['code'] == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want # "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0 # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh) # if you want to bypass this (needed for a proper refresh)
return self.cached_response(req) pass
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list): elif self.force_min is None and ('no-cache' in cc_list
# kindly follow web servers indications, refresh if the same or 'no-store' in cc_list
# settings are used all along, this section shouldn't be of any use, or ('private' in cc_list and not self.private_cache)):
# since the page woudln't be cached in the first place the check is # kindly follow web servers indications, refresh
# only performed "just in case" # if the same settings are used all along, this section shouldn't be
# NB. NOT respected if force_min is set # of any use, since the page woudln't be cached in the first place
# the check is only performed "just in case"
return None return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age: elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust him, if not, use overrides), use cache # server says it's still fine (and we trust him, if not, use force_min=0), use cache
return self.cached_response(req) pass
elif self.force_min is not None and self.force_min > cache_age:
# still recent enough for us, use cache
pass
else: else:
# according to the www, we have to refresh when nothing is said # according to the www, we have to refresh when nothing is said
return None return None
# return the cache as a response. This code is reached with 'pass' above
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
return resp
def http_response(self, req, resp): def http_response(self, req, resp):
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will) # code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
if resp.code == 304 and resp.url in self.cache: if resp.code == 304:
# we are hopefully the first after the HTTP handler, so no need return resp
# to re-run all the *_response
# here: cached page, returning from cache
return self.cached_response(req)
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers): if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
cache_control = parse_http_list(resp.headers.get('cache-control', ())) cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ())) cache_control += parse_http_list(resp.headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x] cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'): if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications (do not save & return) # kindly follow web servers indications
return resp return resp
else: if resp.headers.get('Morss') == 'from_cache':
# save # it comes from cache, so no need to save it again
return self.save_response(req, resp) return resp
else: # save to disk
return self.save_response(req, resp) data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
# the below is only needed because of 'resp.read()' above, as we can't
# seek(0) on arbitrary file-like objects (e.g. sockets)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
def http_error_304(self, req, fp, code, msg, headers):
cache = list(self.load(req.get_full_url()))
if cache[0]:
cache[-1] = time.time()
self.save(req.get_full_url(), *cache)
new = Request(req.get_full_url(),
headers=req.headers,
unverifiable=True)
new.add_unredirected_header('Morss', 'from_304')
# create a "fake" new request to just re-run through the various
# handlers
return self.parent.open(new, timeout=req.timeout)
return None # when returning 'None', the next-available handler is used
# the 'HTTPRedirectHandler' has no 'handler_order', i.e.
# uses the default of 500, therefore executed after this
https_request = http_request https_request = http_request
https_open = http_open https_open = http_open
https_response = http_response https_response = http_response
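Read together, the policy / force_min / force_max / Cache-Control branches above form one small decision table. A minimal standalone sketch of that logic with illustrative names (cache_decision is not part of the morss API; 'fetch' stands for the handler returning None so urllib goes online, 'cache' for serving the stored response):

def cache_decision(policy, cache_entry, cache_age, cc_list, cc_values,
                   force_min=None, force_max=None):
    # cache_entry: dict with at least 'code', or None when nothing is stored
    # cc_list/cc_values: bare and valued Cache-Control directives, already parsed
    if cache_entry is None:
        return 'fetch'                                     # cache empty, refresh
    if policy in ('offline', 'cached'):
        return 'cache'                                     # never go online
    if policy == 'refresh':
        return 'fetch'                                     # always go online
    if force_max is not None and cache_age > force_max:
        return 'fetch'                                     # older than we want
    if force_min is not None and cache_age < force_min:
        return 'cache'                                     # recent enough for us
    if cache_entry.get('code') == 301 and cache_age < 7 * 24 * 3600:
        return 'cache'                                     # 301s are kept a week
    if force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
        return 'fetch'                                     # respect the server
    if 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
        return 'cache'                                     # server says still fresh
    return 'fetch'                                         # nothing said, refresh

# e.g. cache_decision('cached', {'code': 200}, 120, [], {}) == 'cache'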
class BaseCache:
""" Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url):
try:
self[url]
except KeyError:
return False
else:
return True
import sqlite3
class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
self.trim()
def __del__(self):
self.con.close()
def trim(self):
with self.con:
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
value = list(value)
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
def __init__(self, user, password, database, host='localhost'):
self.user = user
self.password = password
self.database = database
self.host = host
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
self.trim()
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def trim(self):
with self.cursor() as cursor:
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
with self.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
(url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
def __setitem__(self, key, value):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, value)
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST', 'localhost')
)
elif os.environ['CACHE'] == 'sqlite':
if 'SQLITE_PATH' in os.environ:
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
else:
path = ':memory:'
default_cache = SQLiteCache(path)
else:
default_cache = CappedDict()
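Whatever backend gets selected, the contract is the same: a dict-like object keyed by URL holding (code, msg, headers, data, timestamp) tuples, plus trim()/autotrim(). A rough usage sketch with the classes defined above in scope (the URL and header string are made up, and the exact header representation depends on the handler that stores it):

import time

cache = CappedDict()        # or SQLiteCache('/tmp/morss-cache.db'), or MySQLCacheHandler(...)

url = 'http://example.com/feed'
cache[url] = (200, 'OK', 'content-type: text/xml', b'<rss/>', int(time.time()))

code, msg, headers, data, timestamp = cache[url]
assert url in cache         # BaseCache.__contains__ turns the KeyError into False

cache.autotrim()            # re-runs trim() every CACHE_LIFESPAN seconds in a daemon thread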
if 'IGNORE_SSL' in os.environ: if 'IGNORE_SSL' in os.environ:
import ssl import ssl
ssl._create_default_https_context = ssl._create_unverified_context ssl._create_default_https_context = ssl._create_unverified_context


@@ -73,7 +73,7 @@ item_updated = atom03:updated
mode = json mode = json
mimetype = application/json mimetype = application/json
timeformat = %Y-%m-%dT%H:%M:%S%z timeformat = %Y-%m-%dT%H:%M:%SZ
base = {} base = {}
title = title title = title
@@ -90,6 +90,9 @@ item_updated = updated
[html] [html]
mode = html mode = html
path =
http://localhost/
title = //div[@id='header']/h1 title = //div[@id='header']/h1
desc = //div[@id='header']/p desc = //div[@id='header']/p
items = //div[@id='content']/div items = //div[@id='content']/div


@@ -15,31 +15,35 @@
# You should have received a copy of the GNU Affero General Public License along # You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>. # with this program. If not, see <https://www.gnu.org/licenses/>.
import csv import sys
import json import os.path
import re
from copy import deepcopy
from datetime import datetime from datetime import datetime
import re
import json
import csv
from fnmatch import fnmatch from fnmatch import fnmatch
import dateutil.parser
import lxml.html
from dateutil import tz
from lxml import etree from lxml import etree
from dateutil import tz
import dateutil.parser
from copy import deepcopy
import lxml.html
from .readabilite import parse as html_parse from .readabilite import parse as html_parse
from .util import *
json.encoder.c_make_encoder = None json.encoder.c_make_encoder = None
try: try:
# python 2 # python 2
from ConfigParser import RawConfigParser
from StringIO import StringIO from StringIO import StringIO
from ConfigParser import RawConfigParser
except ImportError: except ImportError:
# python 3 # python 3
from configparser import RawConfigParser
from io import StringIO from io import StringIO
from configparser import RawConfigParser
try: try:
# python 2 # python 2
@@ -51,7 +55,7 @@ except NameError:
def parse_rules(filename=None): def parse_rules(filename=None):
if not filename: if not filename:
filename = pkg_path('feedify.ini') filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
config = RawConfigParser() config = RawConfigParser()
config.read(filename) config.read(filename)
@@ -65,36 +69,39 @@ def parse_rules(filename=None):
# for each rule # for each rule
if rules[section][arg].startswith('file:'): if rules[section][arg].startswith('file:'):
path = data_path('www', rules[section][arg][5:]) paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
for path in paths:
try:
file_raw = open(path).read() file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw) file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean rules[section][arg] = file_clean
except IOError:
pass
elif '\n' in rules[section][arg]: elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:] rules[section][arg] = rules[section][arg].split('\n')[1:]
return rules return rules
def parse(data, url=None, encoding=None, ruleset=None): def parse(data, url=None, encoding=None):
" Determine which ruleset to use " " Determine which ruleset to use "
if ruleset is not None: rulesets = parse_rules()
rulesets = [ruleset]
else:
rulesets = parse_rules().values()
parsers = [FeedXML, FeedHTML, FeedJSON] parsers = [FeedXML, FeedHTML, FeedJSON]
# 1) Look for a ruleset based on path # 1) Look for a ruleset based on path
if url is not None: if url is not None:
for ruleset in rulesets: for ruleset in rulesets.values():
if 'path' in ruleset: if 'path' in ruleset:
for path in ruleset['path']: for path in ruleset['path']:
if fnmatch(url, path): if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified? parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset, encoding=encoding) return parser(data, ruleset, encoding=encoding)
# 2) Try each and every parser # 2) Try each and every parser
@@ -104,6 +111,9 @@ def parse(data, url=None, encoding=None, ruleset=None):
# 3b) See if .items matches anything # 3b) See if .items matches anything
for parser in parsers: for parser in parsers:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# 'path' as they should have been caught beforehand
try: try:
feed = parser(data, encoding=encoding) feed = parser(data, encoding=encoding)
@@ -114,17 +124,13 @@ def parse(data, url=None, encoding=None, ruleset=None):
else: else:
# parsing worked, now we try the rulesets # parsing worked, now we try the rulesets
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
# 'path' as they should have been caught beforehand
# try anyway if no 'mode' specified
for ruleset in ruleset_candidates: for ruleset in ruleset_candidates:
feed.rules = ruleset feed.rules = ruleset
try: try:
feed.items[0] feed.items[0]
except (AttributeError, IndexError, TypeError): except (AttributeError, IndexError):
# parsing and or item picking did not work out # parsing and or item picking did not work out
pass pass
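In the left-hand version, parse() also accepts a ready-made ruleset instead of the feedify.ini ones, which is what FeedFetch uses for the :items option further down. A hedged sketch of such a call (the XPath rules and the HTML snippet are illustrative, not taken from the project):

from morss import feeds

ruleset = {
    'mode': 'html',
    'title': '//head/title',
    'items': '//article',
    'item_title': './/h2',
    'item_link': './/a/@href',
}

html = (b'<html><head><title>Demo</title></head><body>'
        b'<article><h2>First</h2><a href="/one">read</a></article>'
        b'</body></html>')

feed = feeds.parse(html, url='http://example.com/', encoding='utf-8', ruleset=ruleset)
# feed.title, feed[0].title and feed[0].link should now be picked by the rules above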
@@ -187,12 +193,11 @@ class ParserBase(object):
return self.convert(FeedHTML).tostring(**k) return self.convert(FeedHTML).tostring(**k)
def convert(self, TargetParser): def convert(self, TargetParser):
target = TargetParser() if type(self) == TargetParser:
if type(self) == TargetParser and self.rules == target.rules:
# check both type *AND* rules (e.g. when going from freeform xml to rss)
return self return self
target = TargetParser()
for attr in target.dic: for attr in target.dic:
if attr == 'items': if attr == 'items':
for item in self.items: for item in self.items:
@@ -361,13 +366,7 @@ class ParserXML(ParserBase):
def rule_search_all(self, rule): def rule_search_all(self, rule):
try: try:
match = self.root.xpath(rule, namespaces=self.NSMAP) return self.root.xpath(rule, namespaces=self.NSMAP)
if isinstance(match, str):
# some xpath rules return a single string instead of an array (e.g. concatenate() )
return [match,]
else:
return match
except etree.XPathEvalError: except etree.XPathEvalError:
return [] return []
@@ -430,7 +429,7 @@ class ParserXML(ParserBase):
match = self.rule_search(rrule) match = self.rule_search(rrule)
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \ html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')] and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if key is not None: if key is not None:
@@ -441,7 +440,7 @@ class ParserXML(ParserBase):
self._clean_node(match) self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div')) match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
if self.rules.get('mode') == 'html': if self.rules['mode'] == 'html':
match.find('div').drop_tag() # not supported by lxml.etree match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom else: # i.e. if atom
@@ -457,7 +456,7 @@ class ParserXML(ParserBase):
def rule_str(self, rule): def rule_str(self, rule):
match = self.rule_search(rule) match = self.rule_search(rule)
html_rich = ('atom' in rule or self.mode == 'html') \ html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')] and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if isinstance(match, etree._Element): if isinstance(match, etree._Element):
@@ -490,14 +489,7 @@ class ParserHTML(ParserXML):
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]' repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
rule = re.sub(pattern, repl, rule) rule = re.sub(pattern, repl, rule)
match = self.root.xpath(rule) return self.root.xpath(rule)
if isinstance(match, str):
# for some xpath rules, see XML parser
return [match,]
else:
return match
except etree.XPathEvalError: except etree.XPathEvalError:
return [] return []
@@ -516,31 +508,24 @@ class ParserHTML(ParserXML):
def parse_time(value): def parse_time(value):
# parsing per se
if value is None or value == 0: if value is None or value == 0:
time = None return None
elif isinstance(value, basestring): elif isinstance(value, basestring):
if re.match(r'^[0-9]+$', value): if re.match(r'^[0-9]+$', value):
time = datetime.fromtimestamp(int(value)) return datetime.fromtimestamp(int(value), tz.tzutc())
else: else:
time = dateutil.parser.parse(value) return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
elif isinstance(value, int): elif isinstance(value, int):
time = datetime.fromtimestamp(value) return datetime.fromtimestamp(value, tz.tzutc())
elif isinstance(value, datetime): elif isinstance(value, datetime):
time = value return value
else: else:
time = None return None
# add default time zone if none set
if time is not None and time.tzinfo is None:
time = time.replace(tzinfo=tz.tzutc())
return time
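A quick illustration of the kinds of values parse_time above accepts (assuming it is in scope, e.g. imported from morss.feeds in the left-hand layout); naive results get UTC attached, while strings carrying their own offset keep it:

from datetime import datetime

parse_time(None)                                  # None
parse_time('1640995200')                          # epoch given as a string
parse_time(1640995200)                            # epoch given as an int
parse_time('Mon, 01 Jan 2022 00:00:01 +0100')     # parsed by dateutil, keeps its +01:00
parse_time(datetime(2022, 1, 1))                  # naive datetime, gets tz.tzutc() attached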
class ParserJSON(ParserBase): class ParserJSON(ParserBase):
@@ -699,7 +684,7 @@ class Feed(object):
try: try:
setattr(item, attr, new[attr]) setattr(item, attr, new[attr])
except (KeyError, IndexError, TypeError): except (IndexError, TypeError):
pass pass
return item return item
@@ -815,8 +800,6 @@ class FeedJSON(Feed, ParserJSON):
if __name__ == '__main__': if __name__ == '__main__':
import sys
from . import crawler from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss') req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')


@@ -16,26 +16,30 @@
# with this program. If not, see <https://www.gnu.org/licenses/>. # with this program. If not, see <https://www.gnu.org/licenses/>.
import os import os
import re
import sys
import time import time
from datetime import datetime from datetime import datetime
from dateutil import tz
from fnmatch import fnmatch from fnmatch import fnmatch
import re
import lxml.etree import lxml.etree
import lxml.html import lxml.html
from dateutil import tz
from . import caching, crawler, feeds, readabilite from . import feeds
from . import crawler
from . import readabilite
try: try:
# python 2 # python 2
from httplib import HTTPException from httplib import HTTPException
from urlparse import parse_qs, urljoin, urlparse from urlparse import urlparse, urljoin, parse_qs
except ImportError: except ImportError:
# python 3 # python 3
from http.client import HTTPException from http.client import HTTPException
from urllib.parse import parse_qs, urljoin, urlparse from urllib.parse import urlparse, urljoin, parse_qs
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
@@ -60,7 +64,7 @@ def log(txt):
else: else:
# when using internal server or cli # when using internal server or cli
print(repr(txt), file=sys.stderr) print(repr(txt))
def len_html(txt): def len_html(txt):
@@ -87,12 +91,12 @@ class Options:
else: else:
self.options = options or {} self.options = options or {}
def __getattr__(self, key, default=None): def __getattr__(self, key):
if key in self.options: if key in self.options:
return self.options[key] return self.options[key]
else: else:
return default return False
def __setitem__(self, key, value): def __setitem__(self, key, value):
self.options[key] = value self.options[key] = value
@@ -100,8 +104,6 @@ class Options:
def __contains__(self, key): def __contains__(self, key):
return key in self.options return key in self.options
get = __getitem__ = __getattr__
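The reworked Options wrapper above answers attribute, item and .get() access alike, and returns None (or a supplied default) for unknown keys where the right-hand version returned False. A small illustration, assuming it is importable as morss.morss.Options:

from morss.morss import Options

options = Options({'items': '//article', 'order': 'newest'})

options.items                            # '//article'  (attribute access)
options['order']                         # 'newest'     (item access)
options.get('title', '//head/title')     # falls back to the default, as FeedFetch does
options.missing                          # None here, False in the right-hand version
'order' in options                       # True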
def ItemFix(item, options, feedurl='/'): def ItemFix(item, options, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """ """ Improves feed items (absolute links, resolve feedburner links, etc) """
@@ -195,20 +197,21 @@ def ItemFill(item, options, feedurl='/', fast=False):
log(item.link) log(item.link)
# download # download
delay = -1
if fast or options.cache: if fast or options.fast:
# force cache, don't fetch # force cache, don't fetch
policy = 'offline' delay = -2
elif options.force: elif options.force:
# force refresh # force refresh
policy = 'refresh' delay = 0
else: else:
policy = None delay = 24*60*60 # 24h
try: try:
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT) req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e: except (IOError, HTTPException) as e:
log('http error') log('http error')
@@ -218,11 +221,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
log('non-text page') log('non-text page')
return True return True
if not req['data']: out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
log('empty page')
return True
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
if out is not None: if out is not None:
item.content = out item.content = out
@@ -266,43 +265,33 @@ def FeedFetch(url, options):
# fetch feed # fetch feed
delay = DELAY delay = DELAY
if options.cache: if options.force:
policy = 'offline' delay = 0
elif options.force:
policy = 'refresh'
else:
policy = None
try: try:
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT) req = crawler.adv_get(url=url, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException): except (IOError, HTTPException):
raise MorssException('Error downloading feed') raise MorssException('Error downloading feed')
if options.items: if options.items:
# using custom rules # using custom rules
ruleset = {} rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
ruleset['items'] = options.items rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
if options.mode: rss.rules['items'] = options.items
ruleset['mode'] = options.mode
ruleset['title'] = options.get('title', '//head/title') rss.rules['item_title'] = options.item_title if options.item_title else '.'
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content') rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
ruleset['item_title'] = options.get('item_title', '.')
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
if options.item_content: if options.item_content:
ruleset['item_content'] = options.item_content rss.rules['item_content'] = options.item_content
if options.item_time: if options.item_time:
ruleset['item_time'] = options.item_time rss.rules['item_time'] = options.item_time
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
rss = rss.convert(feeds.FeedXML) rss = rss.convert(feeds.FeedXML)
else: else:
@@ -332,23 +321,16 @@ def FeedGather(rss, url, options):
if options.cache: if options.cache:
max_time = 0 max_time = 0
# sort if options.newest:
# :newest take the newest items
now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
else:
# default behavior, take the first items (in appearing order)
sorted_items = list(rss.items) sorted_items = list(rss.items)
if options.order == 'last':
# `first` does nothing from a practical standpoint, so only `last` needs
# to be addressed
sorted_items = reversed(sorted_items)
elif options.order in ['newest', 'oldest']:
now = datetime.now(tz.tzutc())
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
if options.order == 'newest':
sorted_items = reversed(sorted_items)
for i, item in enumerate(sorted_items): for i, item in enumerate(sorted_items):
# hard cap
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0: if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped') log('dropped')
item.remove() item.remove()
@@ -361,7 +343,6 @@ def FeedGather(rss, url, options):
item = ItemFix(item, options, url) item = ItemFix(item, options, url)
# soft cap
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0: if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy: if not options.proxy:
if ItemFill(item, options, url, True) is False: if ItemFill(item, options, url, True) is False:
@@ -428,7 +409,7 @@ def process(url, cache=None, options=None):
options = Options(options) options = Options(options)
if cache: if cache:
caching.default_cache = caching.DiskCacheHandler(cache) crawler.default_cache = crawler.SQLiteCache(cache)
url, rss = FeedFetch(url, options) url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options) rss = FeedGather(rss, url, options)


@@ -15,22 +15,22 @@
# You should have received a copy of the GNU Affero General Public License along # You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>. # with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import bs4.builder._lxml
import lxml.etree import lxml.etree
import lxml.html import lxml.html
import lxml.html.soupparser from bs4 import BeautifulSoup
import re
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
def default_parser(self, encoding):
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
def parse(data, encoding=None): def parse(data, encoding=None):
kwargs = {'from_encoding': encoding} if encoding else {} if encoding:
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs) data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
def count_words(string): def count_words(string):
@@ -43,8 +43,6 @@ def count_words(string):
if string is None: if string is None:
return 0 return 0
string = string.strip()
i = 0 i = 0
count = 0 count = 0
@@ -154,20 +152,15 @@ def score_all(node):
for child in node: for child in node:
score = score_node(child) score = score_node(child)
set_score(child, score, 'morss_own_score') child.attrib['morss_own_score'] = str(float(score))
if score > 0 or len(list(child.iterancestors())) <= 2: if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score) spread_score(child, score)
score_all(child) score_all(child)
def set_score(node, value, label='morss_score'): def set_score(node, value):
try: node.attrib['morss_score'] = str(float(value))
node.attrib[label] = str(float(value))
except KeyError:
# catch issues with e.g. html comments
pass
def get_score(node): def get_score(node):
@@ -207,12 +200,6 @@ def clean_root(root, keep_threshold=None):
def clean_node(node, keep_threshold=None): def clean_node(node, keep_threshold=None):
parent = node.getparent() parent = node.getparent()
# remove comments
if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
parent.remove(node)
return
if parent is None: if parent is None:
# this is <html/> (or a removed element waiting for GC) # this is <html/> (or a removed element waiting for GC)
return return
@@ -223,7 +210,7 @@ def clean_node(node, keep_threshold=None):
return return
# high score, so keep # high score, so keep
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold: if keep_threshold is not None and get_score(node) >= keep_threshold:
return return
gdparent = parent.getparent() gdparent = parent.getparent()
@@ -244,6 +231,11 @@ def clean_node(node, keep_threshold=None):
parent.remove(node) parent.remove(node)
return return
# remove comments
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
parent.remove(node)
return
# remove if too many kids & too high link density # remove if too many kids & too high link density
wc = count_words(node.text_content()) wc = count_words(node.text_content())
if wc != 0 and len(list(node.iter())) > 3: if wc != 0 and len(list(node.iter())) > 3:
@@ -301,26 +293,28 @@ def clean_node(node, keep_threshold=None):
gdparent.insert(gdparent.index(parent)+1, new_node) gdparent.insert(gdparent.index(parent)+1, new_node)
def lowest_common_ancestor(node_a, node_b, max_depth=None): def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
ancestors_a = list(node_a.iterancestors()) ancestorsA = list(nodeA.iterancestors())
ancestors_b = list(node_b.iterancestors()) ancestorsB = list(nodeB.iterancestors())
if max_depth is not None: if max_depth is not None:
ancestors_a = ancestors_a[:max_depth] ancestorsA = ancestorsA[:max_depth]
ancestors_b = ancestors_b[:max_depth] ancestorsB = ancestorsB[:max_depth]
ancestors_a.insert(0, node_a) ancestorsA.insert(0, nodeA)
ancestors_b.insert(0, node_b) ancestorsB.insert(0, nodeB)
for ancestor_a in ancestors_a: for ancestorA in ancestorsA:
if ancestor_a in ancestors_b: if ancestorA in ancestorsB:
return ancestor_a return ancestorA
return node_a # should always find one though, at least <html/>, but needed for max_depth return nodeA # should always find one though, at least <html/>, but needed for max_depth
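A short usage sketch of lowest_common_ancestor on a parsed fragment (assuming the function above is in scope, e.g. from morss.readabilite); max_depth simply truncates how far up each ancestor chain is searched:

import lxml.html

doc = lxml.html.fromstring(
    '<div id="top"><section><p id="a">one</p></section>'
    '<aside><p id="b">two</p></aside></div>')

node_a = doc.get_element_by_id('a')
node_b = doc.get_element_by_id('b')

lowest_common_ancestor(node_a, node_b).get('id')          # 'top'
lowest_common_ancestor(node_a, node_b, max_depth=0)       # node_a itself, nothing searched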
def get_best_node(html, threshold=5): def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
# score all nodes " Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in)
score_all(html) score_all(html)
# rank all nodes (largest to smallest) # rank all nodes (largest to smallest)
@@ -337,33 +331,9 @@ def get_best_node(html, threshold=5):
else: else:
best = ranked_nodes[0] best = ranked_nodes[0]
return best
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in)
if xpath is not None:
xpath_match = html.xpath(xpath)
if len(xpath_match):
best = xpath_match[0]
else:
best = get_best_node(html, threshold)
else:
best = get_best_node(html, threshold)
if best is None:
# if threshold not met
return None
# clean up # clean up
if not debug: if not debug:
keep_threshold = get_score(best) * 3/4 keep_threshold = get_score(ranked_nodes[0]) * 3/4
clean_root(best, keep_threshold) clean_root(best, keep_threshold)
# check for spammy content (links only) # check for spammy content (links only)
@@ -382,7 +352,6 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
from . import crawler from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it') req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
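In the left-hand version the node selection is split out into get_best_node, and get_article grows an xpath hint that wins over scoring when it matches (and silently falls back to scoring when it does not). A hedged usage sketch, assuming the left-hand package layout (morss.readabilite) and a made-up page:

from morss.readabilite import get_article

html = (b'<html><body><div class="nav"><a href="/x">menu</a></div>'
        b'<article id="post"><p>'
        + b'Plenty of real sentence content for the scorer to pick up. ' * 30
        + b'</p></article></body></html>')

# let the scorer pick the best node
article = get_article(html, url='http://example.com/', encoding_in='utf-8')

# or pin the node with an XPath hint (left-hand version only)
article = get_article(html, encoding_in='utf-8', xpath='//article[@id="post"]')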


@@ -1,57 +0,0 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import os.path
import sys
def pkg_path(*path_elements):
return os.path.join(os.path.dirname(__file__), *path_elements)
data_path_base = None
def data_path(*path_elements):
global data_path_base
path = os.path.join(*path_elements)
if data_path_base is not None:
return os.path.join(data_path_base, path)
bases = [
os.path.join(sys.prefix, 'share/morss'), # when installed as root
pkg_path('../../../share/morss'),
pkg_path('../../../../share/morss'),
pkg_path('../share/morss'), # for `pip install --target=dir morss`
pkg_path('..'), # when running from source tree
]
if 'DATA_PATH' in os.environ:
bases.append(os.environ['DATA_PATH'])
for base in bases:
full_path = os.path.join(base, path)
if os.path.isfile(full_path):
data_path_base = os.path.abspath(base)
return data_path(path)
else:
raise IOError()
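data_path above walks a list of likely install locations (plus an optional DATA_PATH environment variable, appended as a last candidate) and memoises the first base that actually contains the requested file. A usage sketch with hypothetical paths, assuming it is importable as morss.util.data_path:

import os
from morss.util import data_path

os.environ['DATA_PATH'] = '/opt/morss'      # hypothetical extra base, tried after the built-in ones

try:
    index = data_path('www', 'index.html')  # e.g. '/opt/morss/www/index.html' if that file exists
except IOError:
    index = None                            # none of the candidate bases contained the file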


@@ -15,17 +15,17 @@
# You should have received a copy of the GNU Affero General Public License along # You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>. # with this program. If not, see <https://www.gnu.org/licenses/>.
import cgitb import sys
import mimetypes
import os.path import os.path
import re import re
import sys
import wsgiref.handlers
import wsgiref.simple_server
import wsgiref.util
import lxml.etree import lxml.etree
import cgitb
import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
import mimetypes
try: try:
# python 2 # python 2
from urllib import unquote from urllib import unquote
@@ -33,12 +33,13 @@ except ImportError:
# python 3 # python 3
from urllib.parse import unquote from urllib.parse import unquote
from . import caching, crawler, readabilite from . import crawler
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather, from . import readabilite
MorssException, Options, log) from .morss import FeedFetch, FeedGather, FeedFormat
from .util import data_path from .morss import Options, log, TIMEOUT, DELAY, MorssException
PORT = int(os.getenv('PORT', 8000))
PORT = int(os.getenv('PORT', 8080))
def parse_options(options): def parse_options(options):
@@ -49,7 +50,7 @@ def parse_options(options):
split = option.split('=', 1) split = option.split('=', 1)
if len(split) > 1: if len(split) > 1:
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache) out[split[0]] = split[1]
else: else:
out[split[0]] = True out[split[0]] = True
@@ -57,18 +58,14 @@ def parse_options(options):
return out return out
def request_uri(environ): def get_path(environ):
if 'REQUEST_URI' in environ: if 'REQUEST_URI' in environ:
# when running on Apache/uwsgi # when running on Apache
url = environ['REQUEST_URI'] url = unquote(environ['REQUEST_URI'][1:])
elif 'RAW_URI' in environ:
# gunicorn
url = environ['RAW_URI']
else: else:
# when using other servers # when using internal server
url = environ['PATH_INFO'] url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']: if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING'] url += '?' + environ['QUERY_STRING']
@@ -79,13 +76,19 @@ def request_uri(environ):
def cgi_parse_environ(environ): def cgi_parse_environ(environ):
# get options # get options
url = request_uri(environ)[1:] url = get_path(environ)
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url) url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'): if url.startswith(':'):
parts = url.split('/', 1) split = url.split('/', 1)
raw_options = parts[0].split(':')[1:]
url = parts[1] if len(parts) > 1 else '' raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else: else:
raw_options = [] raw_options = []
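Putting request_uri and the two parsing helpers together: options ride in the path, prefixed and separated by ':', ahead of the target URL, and (in the left-hand version) '|' inside a value is turned back into '/'. A self-contained sketch of that split, with an illustrative helper name:

try:
    from urllib.parse import unquote     # python 3
except ImportError:
    from urllib import unquote           # python 2

def split_morss_url(url):
    " ':format=json:cors/https://example.com/feed' -> (options dict, target url) "
    if not url.startswith(':'):
        return {}, url

    parts = url.split('/', 1)
    raw_options = parts[0].split(':')[1:]
    target = parts[1] if len(parts) > 1 else ''

    out = {}
    for option in raw_options:
        split = option.split('=', 1)
        if len(split) > 1:
            out[split[0]] = unquote(split[1]).replace('|', '/')
        else:
            out[split[0]] = True

    return out, target

# split_morss_url(':format=json:cors/https://example.com/feed')
# -> ({'format': 'json', 'cors': True}, 'https://example.com/feed')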
@@ -161,20 +164,25 @@ def middleware(func):
def cgi_file_handler(environ, start_response, app): def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) " " Simple HTTP server to serve static files (.html, .css, etc.) "
url = request_uri(environ)[1:] url = get_path(environ)
if url == '': if url == '':
url = 'index.html' url = 'index.html'
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url): if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths) # if it is a legitimate url (no funny relative paths)
paths = [
os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)
]
for path in paths:
try: try:
path = data_path('www', url)
f = open(path, 'rb') f = open(path, 'rb')
except IOError: except IOError:
# problem with file (cannot open or not found) # problem with file (cannot open or not found)
pass continue
else: else:
# file successfully open # file successfully open
@@ -192,11 +200,10 @@ def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ) url, options = cgi_parse_environ(environ)
# get page # get page
if options['get'] in ('page', 'article'):
req = crawler.adv_get(url=url, timeout=TIMEOUT) req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in crawler.MIMETYPE['html']: if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options['get'] == 'page': if options.get == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding']) html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url']) html.make_links_absolute(req['url'])
@@ -208,20 +215,17 @@ def cgi_get(environ, start_response):
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
else: # i.e. options['get'] == 'article' elif options.get == 'article':
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
output = req['data']
else:
raise MorssException('unsupported mimetype')
else: else:
raise MorssException('no :get option passed') raise MorssException('no :get option passed')
else:
output = req['data']
# return html page # return html page
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items())) start_response(headers['status'], list(headers.items()))
return [output] return [output]
@@ -251,9 +255,9 @@ def cgi_error_handler(environ, start_response, app):
raise raise
except Exception as e: except Exception as e:
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)} headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info()) start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e)) log('ERROR: %s' % repr(e), force=True)
return [cgitb.html(sys.exc_info())] return [cgitb.html(sys.exc_info())]
@@ -279,20 +283,13 @@ def cgi_handle_request():
wsgiref.handlers.CGIHandler().run(app) wsgiref.handlers.CGIHandler().run(app)
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
def get_environ(self):
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
env['REQUEST_URI'] = self.path
return env
def cgi_start_server(): def cgi_start_server():
caching.default_cache.autotrim() crawler.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT) print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri) httpd = wsgiref.simple_server.make_server('', PORT, application)
httpd.serve_forever() httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''): if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
caching.default_cache.autotrim() crawler.default_cache.autotrim()


@@ -1,60 +1,24 @@
from datetime import datetime
from glob import glob
from setuptools import setup from setuptools import setup
from glob import glob
def get_version():
with open('morss/__init__.py', 'r+') as file:
lines = file.readlines()
# look for hard coded version number
for i in range(len(lines)):
if lines[i].startswith('__version__'):
version = lines[i].split('"')[1]
break
# create (& save) one if none found
if version == '':
version = datetime.now().strftime('%Y%m%d.%H%M')
lines[i] = '__version__ = "' + version + '"\n'
file.seek(0)
file.writelines(lines)
# return version number
return version
package_name = 'morss' package_name = 'morss'
setup( setup(
name = package_name, name = package_name,
version = get_version(),
description = 'Get full-text RSS feeds', description = 'Get full-text RSS feeds',
long_description = open('README.md').read(), author = 'pictuga, Samuel Marks',
long_description_content_type = 'text/markdown', author_email = 'contact at pictuga dot com',
author = 'pictuga',
author_email = 'contact@pictuga.com',
url = 'http://morss.it/', url = 'http://morss.it/',
project_urls = { download_url = 'https://git.pictuga.com/pictuga/morss',
'Source': 'https://git.pictuga.com/pictuga/morss',
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
},
license = 'AGPL v3', license = 'AGPL v3',
packages = [package_name], packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'], install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
extras_require = {
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
},
python_requires = '>=2.7',
package_data = {package_name: ['feedify.ini']}, package_data = {package_name: ['feedify.ini']},
data_files = [ data_files = [
('share/' + package_name, ['README.md', 'LICENSE']), ('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')), ('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
], ],
entry_points = { entry_points = {
'console_scripts': [package_name + '=' + package_name + '.__main__:main'], 'console_scripts': [package_name + '=' + package_name + '.__main__:main']
}, })
scripts = ['morss-helper'],
)


@@ -1,60 +0,0 @@
import os
import os.path
import threading
import pytest
try:
# python2
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from SimpleHTTPServer import SimpleHTTPRequestHandler
except:
# python3
from http.server import (BaseHTTPRequestHandler, HTTPServer,
SimpleHTTPRequestHandler)
class HTTPReplayHandler(SimpleHTTPRequestHandler):
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
directory = os.path.join(os.path.dirname(__file__), './samples/')
__init__ = BaseHTTPRequestHandler.__init__
def do_GET(self):
path = self.translate_path(self.path)
if os.path.isdir(path):
f = self.list_directory(path)
else:
f = open(path, 'rb')
try:
self.copyfile(f, self.wfile)
finally:
f.close()
class MuteHTTPServer(HTTPServer):
def handle_error(self, request, client_address):
# mute errors
pass
def make_server(port=8888):
print('Serving http://localhost:%s/' % port)
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
@pytest.fixture
def replay_server():
httpd = make_server()
thread = threading.Thread(target=httpd.serve_forever)
thread.start()
yield
httpd.shutdown()
thread.join()
if __name__ == '__main__':
httpd = make_server()
httpd.serve_forever()


@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain
success


@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: /200-ok.txt


@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: ./200-ok.txt


@@ -1,3 +0,0 @@
HTTP/1.1 301 Moved Permanently
location: http://localhost:8888/200-ok.txt


@@ -1,4 +0,0 @@
HTTP/1.1 308 Permanent Redirect
location: /200-ok.txt
/200-ok.txt


@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
<body>meta redirect</body>
</html>


@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=gb2312
成功


@@ -1,10 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html
<!DOCTYPE html>
<html>
<head><meta charset="gb2312"/></head>
<body>
成功
</body></html>


@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=iso-8859-1
succès


@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain
succès


@@ -1,4 +0,0 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=UTF-8
succès


@@ -1,16 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<summary>!ITEM_DESC!</summary>
<content type="html">!ITEM_CONTENT!</content>
<link href="!ITEM_LINK!"/>
<updated>2022-01-01T00:00:01+01:00</updated>
<published>2022-01-01T00:00:02+01:00</published>
</entry>
</feed>


@@ -1,15 +0,0 @@
HTTP/1.1 200 OK
content-type: application/xml
<?xml version='1.0' encoding='utf-8' ?>
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
<title>!TITLE!</title>
<subtitle>!DESC!</subtitle>
<entry>
<title>!ITEM_TITLE!</title>
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
<summary>!ITEM_DESC!</summary>
<content>!ITEM_CONTENT!</content>
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
</entry>
</feed>


@@ -1,22 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/html; charset=utf-8
<html>
<head></head>
<body>
<div id="header">
<h1>!TITLE!</h1>
<p>!DESC!</p>
</div>
<div id="content">
<div class="item">
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
<div class="desc">!ITEM_DESC!</div>
<div class="content">!ITEM_CONTENT!</div>
</div>
</div>
</body>
</html>


@@ -1,16 +0,0 @@
HTTP/1.1 200 OK
Content-Type: application/json; charset=utf-8
{
"title": "!TITLE!",
"desc": "!DESC!",
"items": [
{
"title": "!ITEM_TITLE!",
"time": "2022-01-01T00:00:01+0100",
"url": "!ITEM_LINK!",
"desc": "!ITEM_DESC!",
"content": "!ITEM_CONTENT!"
}
]
}


@@ -1,17 +0,0 @@
HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8
<?xml version='1.0' encoding='utf-8'?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
<channel>
<title>!TITLE!</title>
<description>!DESC!</description>
<item>
<title>!ITEM_TITLE!</title>
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
<link>!ITEM_LINK!</link>
<description>!ITEM_DESC!</description>
<content:encoded>!ITEM_CONTENT!</content:encoded>
</item>
</channel>
</rss>

Binary file not shown.


@@ -1,3 +0,0 @@
HTTP/1.1 200 OK
refresh: 0;url=/200-ok.txt


@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
<body>meta redirect</body>
</html>


@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
<body>meta redirect</body>
</html>


@@ -1,8 +0,0 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

File diff suppressed because it is too large


@@ -1,62 +0,0 @@
import pytest
from morss.crawler import *
def test_get(replay_server):
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
def test_adv_get(replay_server):
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
@pytest.mark.parametrize('before,after', [
(b'http://localhost:8888/', 'http://localhost:8888/'),
('localhost:8888/', 'http://localhost:8888/'),
('http:/localhost:8888/', 'http://localhost:8888/'),
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
assert sanitize_url(before) == after
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
def test_size_limit_handler(replay_server, opener):
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
def test_gzip_handler(replay_server, opener):
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
@pytest.mark.parametrize('url', [
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
out = adv_get('http://localhost:8888/%s' % url)
out = out['data'].decode(out['encoding'])
assert 'succes' in out or 'succès' in out or '成功' in out
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
def test_alternate_handler(replay_server, opener):
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
def test_http_equiv_handler(replay_server, opener):
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
def test_http_all_redirect_handler(replay_server, opener):
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
def test_http_refresh_handler(replay_server, opener):
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'


@@ -1,108 +0,0 @@
import pytest
from morss.crawler import adv_get
from morss.feeds import *
def get_feed(url):
url = 'http://localhost:8888/%s' % url
out = adv_get(url)
feed = parse(out['data'], url=url, encoding=out['encoding'])
return feed
def check_feed(feed):
# NB. time and updated not covered
assert feed.title == '!TITLE!'
assert feed.desc == '!DESC!'
assert feed[0] == feed.items[0]
assert feed[0].title == '!ITEM_TITLE!'
assert feed[0].link == '!ITEM_LINK!'
assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
assert '!ITEM_CONTENT!' in feed[0].content
def check_output(feed):
output = feed.tostring()
assert '!TITLE!' in output
assert '!DESC!' in output
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output
def check_change(feed):
feed.title = '!TITLE2!'
feed.desc = '!DESC2!'
feed[0].title = '!ITEM_TITLE2!'
feed[0].link = '!ITEM_LINK2!'
feed[0].desc = '!ITEM_DESC2!'
feed[0].content = '!ITEM_CONTENT2!'
assert feed.title == '!TITLE2!'
assert feed.desc == '!DESC2!'
assert feed[0].title == '!ITEM_TITLE2!'
assert feed[0].link == '!ITEM_LINK2!'
assert '!ITEM_DESC2!' in feed[0].desc
assert '!ITEM_CONTENT2!' in feed[0].content
def check_add(feed):
feed.append({
'title': '!ITEM_TITLE3!',
'link': '!ITEM_LINK3!',
'desc': '!ITEM_DESC3!',
'content': '!ITEM_CONTENT3!',
})
assert feed[1].title == '!ITEM_TITLE3!'
assert feed[1].link == '!ITEM_LINK3!'
assert '!ITEM_DESC3!' in feed[1].desc
assert '!ITEM_CONTENT3!' in feed[1].content
each_format = pytest.mark.parametrize('url', [
'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
])
each_check = pytest.mark.parametrize('check', [
check_feed, check_output, check_change, check_add,
])
@each_format
@each_check
def test_parse(replay_server, url, check):
feed = get_feed(url)
check(feed)
@each_format
@each_check
def test_convert_rss(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedXML)
check(feed)
@each_format
@each_check
def test_convert_json(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedJSON)
check(feed)
@each_format
@each_check
def test_convert_html(replay_server, url, check):
feed = get_feed(url)
feed = feed.convert(FeedHTML)
if len(feed) > 1:
# remove the 'blank' default html item
del feed[0]
check(feed)
@each_format
def test_convert_csv(replay_server, url):
# only csv output, not csv feed, so the check is different
feed = get_feed(url)
output = feed.tocsv()
assert '!ITEM_TITLE!' in output
assert '!ITEM_LINK!' in output
assert '!ITEM_DESC!' in output
assert '!ITEM_CONTENT!' in output

www/.htaccess Normal file

@@ -0,0 +1,15 @@
Options -Indexes
ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
# Uncomment below line to turn debug on for all requests
#SetEnv DEBUG 1
# Uncomment below line to turn debug on for requests with :debug in the url
#SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>

www/cgi/.htaccess Normal file

@@ -0,0 +1,9 @@
order allow,deny
deny from all
<Files main.py>
allow from all
AddHandler cgi-script .py
Options +ExecCGI
</Files>


@@ -16,7 +16,6 @@
<title>RSS feed by morss</title> <title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0;" /> <meta name="viewport" content="width=device-width; initial-scale=1.0;" />
<meta name="robots" content="noindex" /> <meta name="robots" content="noindex" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css"> <style type="text/css">
body * { body * {
@@ -192,9 +191,9 @@
feed as feed as
<select> <select>
<option value="">RSS</option> <option value="">RSS</option>
<option value=":format=json:cors">JSON</option> <option value=":json:cors">JSON</option>
<option value=":format=html">HTML</option> <option value=":html">HTML</option>
<option value=":format=csv">CSV</option> <option value=":csv">CSV</option>
</select> </select>
using the using the
<select> <select>
@@ -204,9 +203,7 @@
link of the link of the
<select> <select>
<option value="">first</option> <option value="">first</option>
<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option> <option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
<option value=":order=last">last</option>
<option value=":order=oldest">oldest</option>
</select> </select>
items and items and
<select> <select>
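For reference, the dropdown values above are just the option string described earlier, prepended to the feed URL; for example the JSON + CORS + newest combination produces a request of the form (feed URL illustrative):

https://morss.it/:format=json:cors:order=newest/https://example.com/feed.xml

with the right-hand (older) syntax spelling the same request as :json:cors:newest.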