Compare commits: 920988ac74...master (203 commits)
78  .github/workflows/default.yml (vendored, new file)
@@ -0,0 +1,78 @@
|
||||
name: default
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
test-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Prepare image
|
||||
run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip3 install .[full] .[dev]
|
||||
- run: isort --check-only --diff .
|
||||
- run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
|
||||
- run: pytest --cov=morss tests
|
||||
|
||||
python-publish:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Prepare image
|
||||
run: apt-get -y update && apt-get -y install python3-pip python3-build
|
||||
|
||||
- name: Build package
|
||||
run: python3 -m build
|
||||
|
||||
- name: Publish package
|
||||
uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.pypi_api_token }}
|
||||
|
||||
docker-publish-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: catthehacker/ubuntu:act-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: https://github.com/docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: https://github.com/docker/setup-buildx-action@v2
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: https://github.com/docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.docker_user }}
|
||||
password: ${{ secrets.docker_pwd }}
|
||||
|
||||
- name: Build and push
|
||||
uses: https://github.com/docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64,linux/arm/v7
|
||||
push: true
|
||||
tags: ${{ secrets.docker_repo }}
|
||||
|
||||
- name: Deploy on server
|
||||
uses: https://github.com/appleboy/ssh-action@v0.1.10
|
||||
with:
|
||||
host: ${{ secrets.ssh_host }}
|
||||
username: ${{ secrets.ssh_user }}
|
||||
key: ${{ secrets.ssh_key }}
|
||||
script: morss-update
|
50  .pylintrc (new file)
@@ -0,0 +1,50 @@
|
||||
[MASTER]
|
||||
ignore=CVS
|
||||
suggestion-mode=yes
|
||||
extension-pkg-allow-list=lxml.etree
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=missing-function-docstring,
|
||||
missing-class-docstring,
|
||||
missing-module-docstring,
|
||||
wrong-spelling-in-comment,
|
||||
|
||||
[REPORTS]
|
||||
reports=yes
|
||||
score=yes
|
||||
|
||||
[SPELLING]
|
||||
spelling-dict=en_GB
|
||||
spelling-ignore-words=morss
|
||||
|
||||
[STRING]
|
||||
check-quote-consistency=yes
|
||||
check-str-concat-over-line-jumps=yes
|
||||
|
||||
[VARIABLES]
|
||||
allow-global-unused-variables=no
|
||||
init-import=no
|
||||
|
||||
[FORMAT]
|
||||
expected-line-ending-format=LF
|
||||
indent-string=' '
|
||||
max-line-length=120
|
||||
max-module-lines=1000
|
||||
|
||||
[BASIC]
|
||||
argument-naming-style=snake_case
|
||||
attr-naming-style=snake_case
|
||||
class-attribute-naming-style=snake_case
|
||||
class-const-naming-style=UPPER_CASE
|
||||
class-naming-style=PascalCase
|
||||
const-naming-style=UPPER_CASE
|
||||
function-naming-style=snake_case
|
||||
inlinevar-naming-style=snake_case
|
||||
method-naming-style=snake_case
|
||||
module-naming-style=snake_case
|
||||
variable-naming-style=snake_case
|
||||
|
||||
include-naming-hint=yes
|
||||
|
||||
bad-names=foo, bar
|
||||
good-names=i, j, k
|
18  Dockerfile
@@ -1,8 +1,16 @@
|
||||
FROM alpine:latest
|
||||
|
||||
RUN apk add --no-cache python3 py3-lxml py3-gunicorn py3-pip py3-wheel git
|
||||
FROM alpine:edge
|
||||
|
||||
ADD . /app
|
||||
RUN pip3 install /app
|
||||
|
||||
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss
|
||||
RUN set -ex; \
|
||||
apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
|
||||
apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
|
||||
pip3 install --no-cache-dir /app[full]; \
|
||||
apk del .build-deps
|
||||
|
||||
USER 1000:1000
|
||||
|
||||
ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
|
||||
CMD ["run"]
|
||||
|
||||
HEALTHCHECK CMD /bin/sh /app/morss-helper check
|
||||
|
272  README.md
@@ -1,11 +1,14 @@
|
||||
# Morss - Get full-text RSS feeds
|
||||
|
||||
_GNU AGPLv3 code_
|
||||
_Provided logo is CC BY-NC-SA 4.0_
|
||||
[Homepage](https://morss.it/) •
|
||||
[Upstream source code](https://git.pictuga.com/pictuga/morss) •
|
||||
[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
|
||||
|
||||
Upstream source code: https://git.pictuga.com/pictuga/morss
|
||||
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
|
||||
Homepage: https://morss.it/
|
||||
[](https://ci.pictuga.com/pictuga/morss)
|
||||
[](https://github.com/pictuga/morss/stargazers)
|
||||
[](https://github.com/pictuga/morss/network/members)
|
||||
[](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
|
||||
[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
|
||||
|
||||
This tool's goal is to get full-text RSS feeds out of striped RSS feeds,
|
||||
commonly available on internet. Indeed most newspapers only make a small
|
||||
@@ -38,7 +41,7 @@ Some features of morss:
|
||||
- Follow 301/meta redirects
|
||||
- Recover xml feeds with corrupt encoding
|
||||
- Supports gzip-compressed http content
|
||||
- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
|
||||
- HTTP caching with different backends (in-memory/redis/diskcache)
|
||||
- Works as server/cli tool
|
||||
- Deobfuscate various tracking links
|
||||
|
||||
@@ -46,38 +49,79 @@ Some features of morss:
|
||||
|
||||
### Python package
|
||||
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://pypi.org/project/morss/)
|
||||
[](https://pypistats.org/packages/morss)
|
||||
|
||||
Simple install (without optional dependencies)
|
||||
|
||||
From pip
|
||||
|
||||
```shell
|
||||
pip install morss
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git
|
||||
```
|
||||
|
||||
Full installation (including optional dependencies)
|
||||
|
||||
From pip
|
||||
|
||||
```shell
|
||||
pip install morss[full]
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
|
||||
```
|
||||
|
||||
The full install includes all the cache backends. Otherwise, only in-memory
|
||||
cache is available. The full install also includes gunicorn (for more efficient
|
||||
HTTP handling).
|
||||
|
||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||
C code needs to be compiled). If possible on your distribution, try installing
|
||||
it with the system package manager.
|
||||
|
||||
Dependencies:
|
||||
|
||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
||||
- [lxml](http://lxml.de/) for xml parsing
|
||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
|
||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
||||
- [chardet](https://pypi.python.org/pypi/chardet)
|
||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
||||
- pymysql
|
||||
|
||||
You may also need:
|
||||
- Apache, with python-cgi support, to run on a server
|
||||
- a fast internet connection
|
||||
|
||||
### Docker
|
||||
|
||||
Build & run
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://hub.docker.com/r/pictuga/morss)
|
||||
[](https://hub.docker.com/r/pictuga/morss/tags)
|
||||
|
||||
From docker hub
|
||||
|
||||
With cli
|
||||
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git
|
||||
docker run -p 8080:8080 morss
|
||||
docker pull pictuga/morss
|
||||
```
|
||||
|
||||
With docker-compose:
|
||||
With docker-compose **(recommended)**
|
||||
|
||||
```yml
|
||||
services:
|
||||
app:
|
||||
image: pictuga/morss
|
||||
ports:
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Build from source
|
||||
|
||||
With cli
|
||||
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
|
||||
```
|
||||
|
||||
With docker-compose
|
||||
|
||||
```yml
|
||||
services:
|
||||
@@ -85,21 +129,54 @@ services:
|
||||
build: https://git.pictuga.com/pictuga/morss.git
|
||||
image: morss
|
||||
ports:
|
||||
- '8080:8080'
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Then execute
|
||||
|
||||
```shell
|
||||
docker-compose build
|
||||
docker-compose up
|
||||
docker-compose build --no-cache --pull
|
||||
```
|
||||
|
||||
To update:
|
||||
### Cloud providers
|
||||
|
||||
- To get the latest code from the git repository, add `--no-cache` to the build
|
||||
commands
|
||||
- To update the base image (`alpine:latest`), add `--pull` to the build commands
|
||||
One-click deployment:
|
||||
|
||||
[](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
|
||||
[](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
|
||||
|
||||
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
|
||||
|
||||
``` yml
|
||||
#cloud-config
|
||||
|
||||
packages:
|
||||
- python3-pip
|
||||
- python3-wheel
|
||||
- python3-lxml
|
||||
- python3-setproctitle
|
||||
- ca-certificates
|
||||
|
||||
write_files:
|
||||
- path: /etc/environment
|
||||
append: true
|
||||
content: |
|
||||
DEBUG=1
|
||||
CACHE=diskcache
|
||||
CACHE_SIZE=1073741824 # 1GiB
|
||||
- path: /var/lib/cloud/scripts/per-boot/morss.sh
|
||||
permissions: 744
|
||||
content: |
|
||||
#!/bin/sh
|
||||
/usr/local/bin/morss-helper daemon
|
||||
|
||||
runcmd:
|
||||
- source /etc/environment
|
||||
- update-ca-certificates
|
||||
- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
|
||||
- netfilter-persistent save
|
||||
- pip install morss[full]
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
@@ -120,14 +197,29 @@ For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
The `main.py` part is only needed if your server doesn't support the Apache
|
||||
redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny
|
||||
RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other
|
||||
clients.
|
||||
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
|
||||
other clients.
|
||||
|
||||
|
||||
#### Via Docker
|
||||
#### Using Docker
|
||||
|
||||
See above (in Install)
|
||||
From docker hub
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 pictuga/morss
|
||||
```
|
||||
|
||||
From source
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 morss
|
||||
```
|
||||
|
||||
With docker-compose **(recommended)**
|
||||
|
||||
```shell
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
#### Using Gunicorn
|
||||
|
||||
@@ -140,13 +232,13 @@ gunicorn --preload morss
|
||||
Running this command should do:
|
||||
|
||||
```shell
|
||||
uwsgi --http :8080 --plugin python --wsgi-file main.py
|
||||
uwsgi --http :8000 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
|
||||
The latter should start when you run morss without any argument, on port 8080.
|
||||
The latter should start when you run morss without any argument, on port 8000.
|
||||
I'd highly recommend you to use gunicorn or something similar for better
|
||||
performance.
|
||||
|
||||
@@ -184,8 +276,30 @@ For this, you need to make sure your host allows python script execution. This
|
||||
method uses HTTP calls to fetch the RSS feeds, which will be handled through
|
||||
`mod_cgi` for example on Apache severs.
|
||||
|
||||
Please pay attention to `main.py` permissions for it to be executable. Also
|
||||
ensure that the provided `/www/.htaccess` works well with your server.
|
||||
Please pay attention to `main.py` permissions for it to be executable. See below
|
||||
some tips for the `.htaccess` file.
|
||||
|
||||
```htaccess
|
||||
Options -Indexes
|
||||
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
|
||||
# Turn debug on for all requests
|
||||
SetEnv DEBUG 1
|
||||
|
||||
# Turn debug on for requests with :debug in the url
|
||||
SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
||||
```
|
||||
|
||||
### As a CLI application
|
||||
|
||||
@@ -199,6 +313,12 @@ For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
If using Docker:
|
||||
|
||||
```shell
|
||||
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
|
||||
```
|
||||
|
||||
### As a newsreader hook
|
||||
|
||||
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
|
||||
@@ -210,7 +330,7 @@ To use this script, you have to enable "(Unix) command" in liferea feed
|
||||
settings, and use the command:
|
||||
|
||||
```
|
||||
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
|
||||
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
|
||||
@@ -233,7 +353,7 @@ Using cache and passing arguments:
|
||||
```python
|
||||
>>> import morss
|
||||
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
||||
>>> cache = '/tmp/morss-cache.db' # sqlite cache location
|
||||
>>> cache = '/tmp/morss-cache' # diskcache cache location
|
||||
>>> options = {'csv':True}
|
||||
>>> xml_string = morss.process(url, cache, options)
|
||||
>>> xml_string[:50]
|
||||
@@ -247,11 +367,10 @@ under the hood.
|
||||
Doing it step-by-step:
|
||||
|
||||
```python
|
||||
import morss, morss.crawler
|
||||
import morss
|
||||
|
||||
url = 'http://newspaper.example/feed.xml'
|
||||
options = morss.Options(csv=True) # arguments
|
||||
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
|
||||
|
||||
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
||||
@@ -270,11 +389,13 @@ arguments to morss is explained in Run above.
|
||||
The list of arguments can be obtained by running `morss --help`
|
||||
|
||||
```
|
||||
usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
|
||||
[--resolve] [--items XPATH] [--item_link XPATH]
|
||||
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
|
||||
[--nolink] [--noref] [--silent]
|
||||
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy]
|
||||
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
|
||||
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
|
||||
[--item_content XPATH] [--item_time XPATH]
|
||||
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
|
||||
url
|
||||
|
||||
Get full-text RSS feeds
|
||||
@@ -282,8 +403,10 @@ Get full-text RSS feeds
|
||||
positional arguments:
|
||||
url feed url
|
||||
|
||||
optional arguments:
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--post STRING POST request
|
||||
--xpath XPATH xpath rule to manually detect the article
|
||||
|
||||
output:
|
||||
--format {rss,json,html,csv}
|
||||
@@ -299,8 +422,9 @@ action:
|
||||
articles' content), so as to save time
|
||||
--force force refetch the rss feed and articles
|
||||
--proxy doesn't fill the articles
|
||||
--newest return the feed items in chronological order (morss
|
||||
ohterwise shows the items by appearing order)
|
||||
--order {first,last,newest,oldest}
|
||||
order in which to process items (which are however NOT
|
||||
sorted in the output)
|
||||
--firstlink pull the first article mentioned in the description
|
||||
instead of the default link
|
||||
--resolve replace tracking links with direct links to articles
|
||||
@@ -315,6 +439,8 @@ custom feeds:
|
||||
--item_content XPATH entry's content
|
||||
--item_time XPATH entry's date & time (accepts a wide range of time
|
||||
formats)
|
||||
--mode {xml,html,json}
|
||||
parser to use for the custom feeds
|
||||
|
||||
misc:
|
||||
--nolink drop links, but keeps links' inner text
|
||||
@@ -336,31 +462,39 @@ servers)
|
||||
|
||||
To pass environment variables:
|
||||
|
||||
- Docker-cli: `docker run -p 8080:8080 morss --env KEY=value`
|
||||
- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
|
||||
- docker-compose: add an `environment:` section in the .yml file
|
||||
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
|
||||
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
|
||||
- cloud-init: in the `/etc/environment` file
|
||||
|
||||
Generic:
|
||||
|
||||
- `DEBUG=1`: to have some feedback from the script execution. Useful for
|
||||
debugging.
|
||||
- `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
|
||||
- `DELAY` sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `DATA_PATH`: to set custom file location for the `www` folder
|
||||
|
||||
When parsing long feeds, with a lot of items (100+), morss might take a lot of
|
||||
time to parse it, or might even run into a memory overflow on some shared
|
||||
hosting plans (limits around 10Mb), in which case you might want to adjust the
|
||||
below settings via environment variables.
|
||||
|
||||
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more
|
||||
time might be spent taking older articles from cache. `-1` for unlimited.
|
||||
Also, if the request takes too long to process, the http request might be
|
||||
discarded. See relevant config for
|
||||
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
|
||||
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
|
||||
|
||||
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
|
||||
articles, more time might be spent taking older articles from cache. `-1` for
|
||||
unlimited.
|
||||
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
|
||||
More articles will be taken from cache following the nexts settings.
|
||||
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether
|
||||
or not it's already cached). Articles beyond that limit will be dropped from the
|
||||
feed. `-1` for unlimited.
|
||||
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
|
||||
(whether or not it's already cached). Articles beyond that limit will be dropped
|
||||
from the feed. `-1` for unlimited.
|
||||
- `LIM_ITEM` sets the maximum number of article checked, limiting both the
|
||||
number of articles fetched and taken from cache. Articles beyond that limit will
|
||||
be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
@@ -368,19 +502,21 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
morss uses caching to make loading faster. There are 3 possible cache backends:
|
||||
|
||||
- `(nothing/default)`: a simple python in-memory dict-like object.
|
||||
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
|
||||
will be cleared every time the program is run). Path can be defined with
|
||||
`SQLITE_PATH`.
|
||||
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
||||
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
||||
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||
- `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
|
||||
`DISKCACHE_DIR`.
|
||||
|
||||
To limit the size of the cache:
|
||||
|
||||
- `CACHE_SIZE` sets the target number of items in the cache (further items will
|
||||
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
|
||||
entries.
|
||||
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
|
||||
the number of items set in `CACHE_SIZE`). Defaults to 1min.
|
||||
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
|
||||
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
|
||||
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
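
The cache settings above are plain environment variables read by morss at import time (see the new `morss/caching.py` later in this diff). A minimal sketch of selecting the `diskcache` backend from Python; the directory and size are arbitrary example values:

```python
import os

# must be set before importing morss: morss/caching.py reads CACHE, CACHE_SIZE
# and DISKCACHE_DIR at import time
os.environ['CACHE'] = 'diskcache'
os.environ['DISKCACHE_DIR'] = '/tmp/morss-diskcache'   # any writable directory
os.environ['CACHE_SIZE'] = str(1024 * 1024 * 1024)     # 1 GiB, in bytes for diskcache

import morss

xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
```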
|
||||
|
||||
Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
|
||||
environment variable.
|
||||
|
||||
### Content matching
|
||||
|
||||
|
21  app.json (new file)
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"stack": "container",
|
||||
"env": {
|
||||
"DEBUG": {
|
||||
"value": 1,
|
||||
"required": false
|
||||
},
|
||||
"GUNICORN_CMD_ARGS": {
|
||||
"value": "",
|
||||
"required": false
|
||||
},
|
||||
"CACHE": {
|
||||
"value": "diskcache",
|
||||
"required": false
|
||||
},
|
||||
"CACHE_SIZE": {
|
||||
"value": 1073741824,
|
||||
"required": false
|
||||
}
|
||||
}
|
||||
}
|
3  heroku.yml (new file)
@@ -0,0 +1,3 @@
|
||||
build:
|
||||
docker:
|
||||
web: Dockerfile
|
47  morss-helper (new executable file)
@@ -0,0 +1,47 @@
|
||||
#! /bin/sh
|
||||
set -ex
|
||||
|
||||
if ! command -v python && command -v python3 ; then
|
||||
alias python='python3'
|
||||
fi
|
||||
|
||||
run() {
|
||||
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
|
||||
}
|
||||
|
||||
daemon() {
|
||||
gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
|
||||
}
|
||||
|
||||
reload() {
|
||||
pid=$(pidof 'gunicorn: master [morss]' || true)
|
||||
# NB. requires python-setproctitle
|
||||
# `|| true` due to `set -e`
|
||||
|
||||
if [ -z "$pid" ]; then
|
||||
# if gunicorn is not currently running
|
||||
daemon
|
||||
|
||||
else
|
||||
kill -s USR2 $pid
|
||||
kill -s WINCH $pid
|
||||
sleep 1 # give gunicorn some time to reload
|
||||
kill -s TERM $pid
|
||||
|
||||
fi
|
||||
}
|
||||
|
||||
check() {
|
||||
python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
|
||||
}
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
run
|
||||
|
||||
elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
|
||||
$@
|
||||
|
||||
else
|
||||
python -m morss $@
|
||||
|
||||
fi
|
13  morss.service (new file)
@@ -0,0 +1,13 @@
|
||||
[Unit]
|
||||
Description=morss server (gunicorn)
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
ExecStart=/usr/local/bin/morss-helper run
|
||||
ExecReload=/usr/local/bin/morss-helper reload
|
||||
KillMode=process
|
||||
Restart=always
|
||||
User=http
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
morss/__init__.py
@@ -16,5 +16,10 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
# ran on `import morss`
|
||||
|
||||
# pylint: disable=unused-import,unused-variable
|
||||
|
||||
__version__ = ""
|
||||
|
||||
from .morss import *
|
||||
from .wsgi import application
|
||||
|
morss/__main__.py
@@ -20,9 +20,7 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from . import wsgi
|
||||
from . import cli
|
||||
|
||||
from . import cli, wsgi
|
||||
from .morss import MorssException
|
||||
|
||||
|
||||
|
122  morss/caching.py (new file)
@@ -0,0 +1,122 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, data)
|
||||
|
||||
|
||||
try:
|
||||
import redis # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class RedisCacheHandler(BaseCache):
|
||||
def __init__(self, host='localhost', port=6379, db=0, password=None):
|
||||
self.r = redis.Redis(host=host, port=port, db=db, password=password)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.r.get(key)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.r.set(key, data)
|
||||
|
||||
|
||||
try:
|
||||
import diskcache # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class DiskCacheHandler(BaseCache):
|
||||
def __init__(self, directory=None, **kwargs):
|
||||
self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)
|
||||
|
||||
def __del__(self):
|
||||
self.cache.close()
|
||||
|
||||
def trim(self):
|
||||
self.cache.cull()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.cache[key]
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.cache.set(key, data)
|
||||
|
||||
|
||||
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'redis':
|
||||
default_cache = RedisCacheHandler(
|
||||
host = os.getenv('REDIS_HOST', 'localhost'),
|
||||
port = int(os.getenv('REDIS_PORT', 6379)),
|
||||
db = int(os.getenv('REDIS_DB', 0)),
|
||||
password = os.getenv('REDIS_PWD', None)
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'diskcache':
|
||||
default_cache = DiskCacheHandler(
|
||||
directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
|
||||
size_limit = CACHE_SIZE # in Bytes
|
||||
)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
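
All the classes above share the dict-like contract defined by `BaseCache`, and `CacheHandler` in `morss/crawler.py` accepts any of them via its `cache` argument. A short illustrative sketch (not part of the diff); `DiskCacheHandler` needs the optional `diskcache` package:

```python
from urllib.request import build_opener

from morss.caching import CappedDict, DiskCacheHandler
from morss.crawler import CacheHandler

# dict-like contract: __getitem__/__setitem__ plus trim()
cache = CappedDict()
cache['http://example.com/feed.xml'] = b'opaque blob'  # CacheHandler stores pickled responses
cache.trim()  # evicts the oldest entries once CACHE_SIZE is exceeded

# a specific backend can also be handed straight to the crawler's cache handler
opener = build_opener(CacheHandler(cache=DiskCacheHandler('/tmp/morss-diskcache'), policy='cached'))
```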
13  morss/cli.py
@@ -15,12 +15,11 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
import argparse
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
||||
from .morss import Options
|
||||
from .morss import FeedFetch, FeedFormat, FeedGather, Options
|
||||
|
||||
|
||||
def cli_app():
|
||||
@@ -32,6 +31,9 @@ def cli_app():
|
||||
|
||||
parser.add_argument('url', help='feed url')
|
||||
|
||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||
|
||||
group = parser.add_argument_group('output')
|
||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
|
||||
@@ -42,7 +44,7 @@ def cli_app():
|
||||
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
|
||||
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
|
||||
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
|
||||
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss ohterwise shows the items by appearing order)')
|
||||
group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
|
||||
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
|
||||
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
|
||||
|
||||
@@ -52,6 +54,7 @@ def cli_app():
|
||||
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
||||
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
||||
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
||||
group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
|
||||
|
||||
group = parser.add_argument_group('misc')
|
||||
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
||||
|
717  morss/crawler.py
@@ -16,31 +16,37 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import zlib
|
||||
from io import BytesIO, StringIO
|
||||
import re
|
||||
import chardet
|
||||
from cgi import parse_header
|
||||
import lxml.html
|
||||
import time
|
||||
import threading
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import zlib
|
||||
from cgi import parse_header
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import chardet
|
||||
|
||||
from .caching import default_cache
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib import quote
|
||||
from urlparse import urlparse, urlunparse
|
||||
import mimetools
|
||||
|
||||
from httplib import HTTPMessage
|
||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||
Request, addinfourl, build_opener, parse_http_list,
|
||||
parse_keqv_list)
|
||||
from urlparse import urlsplit
|
||||
except ImportError:
|
||||
# python 3
|
||||
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib.parse import quote
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import email
|
||||
from email import message_from_string
|
||||
from http.client import HTTPMessage
|
||||
from urllib.parse import quote, urlsplit
|
||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||
HTTPRedirectHandler, Request, addinfourl,
|
||||
build_opener, parse_http_list, parse_keqv_list)
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -50,14 +56,12 @@ except NameError:
|
||||
basestring = unicode = str
|
||||
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
MIMETYPE = {
|
||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
|
||||
'json': ['application/json'],
|
||||
}
|
||||
|
||||
|
||||
DEFAULT_UAS = [
|
||||
@@ -82,14 +86,17 @@ def get(*args, **kwargs):
|
||||
return adv_get(*args, **kwargs)['data']
|
||||
|
||||
|
||||
def adv_get(url, timeout=None, *args, **kwargs):
|
||||
def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
url = sanitize_url(url)
|
||||
|
||||
if post is not None:
|
||||
post = post.encode('utf-8')
|
||||
|
||||
if timeout is None:
|
||||
con = custom_handler(*args, **kwargs).open(url)
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post)
|
||||
|
||||
else:
|
||||
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
|
||||
|
||||
data = con.read()
|
||||
|
||||
@@ -97,7 +104,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||
encoding= detect_encoding(data, con)
|
||||
|
||||
return {
|
||||
'data':data,
|
||||
'data': data,
|
||||
'url': con.geturl(),
|
||||
'con': con,
|
||||
'contenttype': contenttype,
|
||||
@@ -105,9 +112,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
|
||||
}
|
||||
|
||||
|
||||
def custom_handler(follow=None, delay=None, encoding=None):
|
||||
handlers = []
|
||||
|
||||
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||
# as per urllib2 source code, these Handelers are added first
|
||||
# *unless* one of the custom handlers inherits from one of them
|
||||
#
|
||||
@@ -115,21 +120,33 @@ def custom_handler(follow=None, delay=None, encoding=None):
|
||||
# HTTPDefaultErrorHandler, HTTPRedirectHandler,
|
||||
# FTPHandler, FileHandler, HTTPErrorProcessor]
|
||||
# & HTTPSHandler
|
||||
#
|
||||
# when processing a request:
|
||||
# (1) all the *_request are run
|
||||
# (2) the *_open are run until sth is returned (other than None)
|
||||
# (3) all the *_response are run
|
||||
#
|
||||
# During (3), if an http error occurs (i.e. not a 2XX response code), the
|
||||
# http_error_* are run until sth is returned (other than None). If they all
|
||||
# return nothing, a python error is raised
|
||||
|
||||
#handlers.append(DebugHandler())
|
||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||
handlers.append(HTTPCookieProcessor())
|
||||
handlers.append(GZIPHandler())
|
||||
handlers.append(HTTPEquivHandler())
|
||||
handlers.append(HTTPRefreshHandler())
|
||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||
handlers.append(BrowserlyHeaderHandler())
|
||||
handlers.append(EncodingFixHandler(encoding))
|
||||
handlers = [
|
||||
#DebugHandler(),
|
||||
SizeLimitHandler(500*1024), # 500KiB
|
||||
HTTPCookieProcessor(),
|
||||
GZIPHandler(),
|
||||
HTTPAllRedirectHandler(),
|
||||
HTTPEquivHandler(),
|
||||
HTTPRefreshHandler(),
|
||||
UAHandler(random.choice(DEFAULT_UAS)),
|
||||
BrowserlyHeaderHandler(),
|
||||
EncodingFixHandler(),
|
||||
]
|
||||
|
||||
if follow:
|
||||
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
||||
|
||||
handlers.append(CacheHandler(force_min=delay))
|
||||
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
|
||||
|
||||
return build_opener(*handlers)
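
`adv_get()` above builds this opener and forwards its keyword arguments to it, so the new `post`, `policy`, `force_min` and `force_max` parameters can be passed directly. A hedged usage sketch based on the signatures in this hunk; the POST endpoint is an arbitrary example:

```python
from morss.crawler import adv_get

# adv_get() returns the dict assembled in the hunk above
page = adv_get('https://morss.it/', timeout=10)
print(page['url'], page['contenttype'], len(page['data']))

# the new `post` parameter is utf-8 encoded and sent as the request body
reply = adv_get('https://httpbin.org/post', post='a=1&b=2', timeout=10)
```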
|
||||
|
||||
@@ -146,10 +163,20 @@ def is_ascii(string):
|
||||
return True
|
||||
|
||||
|
||||
def soft_quote(string):
|
||||
" url-quote only when not a valid ascii string "
|
||||
|
||||
if is_ascii(string):
|
||||
return string
|
||||
|
||||
else:
|
||||
return quote(string.encode('utf-8'))
|
||||
|
||||
|
||||
def sanitize_url(url):
|
||||
# make sure the url is unicode, i.e. not bytes
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode()
|
||||
url = url.decode('utf-8')
|
||||
|
||||
# make sure there's a protocol (http://)
|
||||
if url.split(':', 1)[0] not in PROTOCOL:
|
||||
@@ -162,18 +189,64 @@ def sanitize_url(url):
|
||||
url = url.replace(' ', '%20')
|
||||
|
||||
# escape non-ascii unicode characters
|
||||
# https://stackoverflow.com/a/4391299
|
||||
parts = list(urlparse(url))
|
||||
parts = urlsplit(url)
|
||||
|
||||
for i in range(len(parts)):
|
||||
if not is_ascii(parts[i]):
|
||||
if i == 1:
|
||||
parts[i] = parts[i].encode('idna').decode('ascii')
|
||||
parts = parts._replace(
|
||||
netloc=parts.netloc.replace(
|
||||
parts.hostname,
|
||||
parts.hostname.encode('idna').decode('ascii')
|
||||
),
|
||||
path=soft_quote(parts.path),
|
||||
query=soft_quote(parts.query),
|
||||
fragment=soft_quote(parts.fragment),
|
||||
)
|
||||
|
||||
else:
|
||||
parts[i] = quote(parts[i].encode('utf-8'))
|
||||
return parts.geturl()
|
||||
|
||||
return urlunparse(parts)
|
||||
|
||||
class RespDataHandler(BaseHandler):
|
||||
" Make it easier to use the reponse body "
|
||||
|
||||
def data_reponse(self, req, resp, data):
|
||||
pass
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# read data
|
||||
data = resp.read()
|
||||
|
||||
# process data and use returned content (if any)
|
||||
data = self.data_response(req, resp, data) or data
|
||||
|
||||
# reformat the stuff
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class RespStrHandler(RespDataHandler):
|
||||
" Make it easier to use the _decoded_ reponse body "
|
||||
|
||||
def str_reponse(self, req, resp, data_str):
|
||||
pass
|
||||
|
||||
def data_response(self, req, resp, data):
|
||||
#decode
|
||||
enc = detect_encoding(data, resp)
|
||||
data_str = data.decode(enc, 'replace')
|
||||
|
||||
#process
|
||||
data_str = self.str_response(req, resp, data_str)
|
||||
|
||||
# return
|
||||
data = data_str.encode(enc) if data_str is not None else data
|
||||
|
||||
#return
|
||||
return data
|
||||
|
||||
|
||||
class DebugHandler(BaseHandler):
|
||||
@@ -196,7 +269,7 @@ class SizeLimitHandler(BaseHandler):
|
||||
|
||||
handler_order = 450
|
||||
|
||||
def __init__(self, limit=5*1024^2):
|
||||
def __init__(self, limit=5*1024**2):
|
||||
self.limit = limit
|
||||
|
||||
def http_response(self, req, resp):
|
||||
@@ -217,29 +290,17 @@ def UnGzip(data):
|
||||
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
||||
|
||||
|
||||
class GZIPHandler(BaseHandler):
|
||||
class GZIPHandler(RespDataHandler):
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||
return req
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def data_response(self, req, resp, data):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||
data = resp.read()
|
||||
|
||||
data = UnGzip(data)
|
||||
|
||||
resp.headers['Content-Encoding'] = 'identity'
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
https_request = http_request
|
||||
return UnGzip(data)
|
||||
|
||||
|
||||
def detect_encoding(data, resp=None):
|
||||
@@ -276,28 +337,9 @@ def detect_raw_encoding(data, resp=None):
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
class EncodingFixHandler(BaseHandler):
|
||||
def __init__(self, encoding=None):
|
||||
self.encoding = encoding
|
||||
|
||||
def http_response(self, req, resp):
|
||||
maintype = resp.info().get('Content-Type', '').split('/')[0]
|
||||
if 200 <= resp.code < 300 and maintype == 'text':
|
||||
data = resp.read()
|
||||
|
||||
enc = self.encoding or detect_encoding(data, resp)
|
||||
|
||||
data = data.decode(enc, 'replace')
|
||||
data = data.encode(enc)
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class EncodingFixHandler(RespStrHandler):
|
||||
def str_response(self, req, resp, data_str):
|
||||
return data_str
|
||||
|
||||
|
||||
class UAHandler(BaseHandler):
|
||||
@@ -323,71 +365,58 @@ class BrowserlyHeaderHandler(BaseHandler):
|
||||
https_request = http_request
|
||||
|
||||
|
||||
class AlternateHandler(BaseHandler):
|
||||
def iter_html_tag(html_str, tag_name):
|
||||
" To avoid parsing whole pages when looking for a simple tag "
|
||||
|
||||
re_tag = r'<%s\s+[^>]+>' % tag_name
|
||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||
|
||||
for tag_match in re.finditer(re_tag, html_str):
|
||||
attr_match = re.findall(re_attr, tag_match.group(0))
|
||||
|
||||
if attr_match is not None:
|
||||
yield dict(attr_match)
|
||||
|
||||
|
||||
class AlternateHandler(RespStrHandler):
|
||||
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
|
||||
|
||||
def __init__(self, follow=None):
|
||||
self.follow = follow or []
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
|
||||
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
|
||||
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
|
||||
|
||||
data = resp.read()
|
||||
|
||||
try:
|
||||
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
|
||||
|
||||
for link in links:
|
||||
if link.get('type', '') in self.follow:
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
break
|
||||
|
||||
except (ValueError, SyntaxError):
|
||||
# catch parsing errors
|
||||
pass
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
for link in iter_html_tag(data_str[:10000], 'link'):
|
||||
if (link.get('rel') == 'alternate'
|
||||
and link.get('type') in self.follow
|
||||
and 'href' in link):
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
break
|
||||
|
||||
|
||||
class HTTPEquivHandler(BaseHandler):
|
||||
class HTTPEquivHandler(RespStrHandler):
|
||||
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
|
||||
|
||||
handler_order = 600
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
|
||||
data = resp.read()
|
||||
|
||||
try:
|
||||
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
|
||||
for meta in iter_html_tag(data_str[:10000], 'meta'):
|
||||
if 'http-equiv' in meta and 'content' in meta:
|
||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||
|
||||
for header in headers:
|
||||
resp.headers[header.get('http-equiv').lower()] = header.get('content')
|
||||
|
||||
except (ValueError, SyntaxError):
|
||||
# catch parsing errors
|
||||
pass
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class HTTPAllRedirectHandler(HTTPRedirectHandler):
|
||||
def http_error_308(self, req, fp, code, msg, headers):
|
||||
return self.http_error_301(req, fp, 301, msg, headers)
|
||||
|
||||
|
||||
class HTTPRefreshHandler(BaseHandler):
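
The new `iter_html_tag()` helper above replaces the lxml-based lookups previously used in `AlternateHandler` and `HTTPEquivHandler`. A quick sketch of what it yields, assuming it is imported from `morss.crawler` as defined in this diff:

```python
from morss.crawler import iter_html_tag

html = '<head><link rel="alternate" type="application/rss+xml" href="/feed.xml"></head>'

for attrs in iter_html_tag(html, 'link'):
    # each matching tag comes back as a plain dict of its attributes
    print(attrs)  # {'rel': 'alternate', 'type': 'application/rss+xml', 'href': '/feed.xml'}
```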
|
||||
@@ -396,7 +425,7 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
def http_response(self, req, resp):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('refresh'):
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
|
||||
match = re.search(regex, resp.headers.get('refresh'))
|
||||
|
||||
if match:
|
||||
@@ -412,56 +441,124 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
https_response = http_response
|
||||
|
||||
|
||||
def parse_headers(text=u'\n\n'):
|
||||
if sys.version_info[0] >= 3:
|
||||
# python 3
|
||||
return message_from_string(text, _class=HTTPMessage)
|
||||
|
||||
else:
|
||||
# python 2
|
||||
return HTTPMessage(StringIO(text))
|
||||
|
||||
|
||||
def error_response(code, msg, url=''):
|
||||
# return an error as a response
|
||||
resp = addinfourl(BytesIO(), parse_headers(), url, code)
|
||||
resp.msg = msg
|
||||
return resp
|
||||
|
||||
|
||||
class CacheHandler(BaseHandler):
|
||||
" Cache based on etags/last-modified "
|
||||
|
||||
private_cache = False # Websites can indicate whether the page should be
|
||||
# cached by CDNs (e.g. shouldn't be the case for
|
||||
# private/confidential/user-specific pages.
|
||||
# With this setting, decide whether (False) you want
|
||||
# the cache to behave like a CDN (i.e. don't cache
|
||||
# private pages), or (True) to behave like a end-cache
|
||||
# private pages. If unsure, False is the safest bet.
|
||||
privacy = 'private' # Websites can indicate whether the page should be cached
|
||||
# by CDNs (e.g. shouldn't be the case for
|
||||
# private/confidential/user-specific pages. With this
|
||||
# setting, decide whether you want the cache to behave
|
||||
# like a CDN (i.e. don't cache private pages, 'public'),
|
||||
# or to behave like a end-user private pages
|
||||
# ('private'). If unsure, 'public' is the safest bet,
|
||||
# but many websites abuse this feature...
|
||||
|
||||
# NB. This overrides all the other min/max/policy settings.
|
||||
handler_order = 499
|
||||
|
||||
def __init__(self, cache=None, force_min=None):
|
||||
def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
|
||||
self.cache = cache or default_cache
|
||||
self.force_min = force_min
|
||||
# Servers indicate how long they think their content is "valid".
|
||||
# With this parameter (force_min, expressed in seconds), we can
|
||||
# override the validity period (i.e. bypassing http headers)
|
||||
# Special values:
|
||||
# -1: valid forever, i.e. use the cache no matter what (and fetch
|
||||
# the page online if not present in cache)
|
||||
# 0: valid zero second, i.e. force refresh
|
||||
# -2: same as -1, i.e. use the cache no matter what, but do NOT
|
||||
# fetch the page online if not present in cache, throw an
|
||||
# error instead
|
||||
self.force_max = force_max
|
||||
self.policy = policy # can be cached/refresh/offline/None (default)
|
||||
|
||||
# Servers indicate how long they think their content is "valid". With
|
||||
# this parameter (force_min/max, expressed in seconds), we can override
|
||||
# the validity period (i.e. bypassing http headers)
|
||||
# Special choices, via "policy":
|
||||
# cached: use the cache no matter what (and fetch the page online if
|
||||
# not present in cache)
|
||||
# refresh: valid zero second, i.e. force refresh
|
||||
# offline: same as cached, i.e. use the cache no matter what, but do
|
||||
# NOT fetch the page online if not present in cache, throw an
|
||||
# error instead
|
||||
# None: just follow protocols
|
||||
|
||||
# sanity checks
|
||||
assert self.force_max is None or self.force_max >= 0
|
||||
assert self.force_min is None or self.force_min >= 0
|
||||
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
|
||||
|
||||
def load(self, url):
|
||||
try:
|
||||
out = list(self.cache[url])
|
||||
data = pickle.loads(self.cache[url])
|
||||
|
||||
except KeyError:
|
||||
out = [None, None, unicode(), bytes(), 0]
|
||||
data = None
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
out[2] = email.message_from_string(out[2] or unicode()) # headers
|
||||
else:
|
||||
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
|
||||
data['headers'] = parse_headers(data['headers'] or unicode())
|
||||
|
||||
return out
|
||||
return data
|
||||
|
||||
def save(self, url, code, msg, headers, data, timestamp):
|
||||
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
|
||||
def save(self, key, data):
|
||||
data['headers'] = unicode(data['headers'])
|
||||
self.cache[key] = pickle.dumps(data, 0)
|
||||
|
||||
def cached_response(self, req, fallback=None):
|
||||
req.from_morss_cache = True
|
||||
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if data is not None:
|
||||
# return the cache as a response
|
||||
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
|
||||
resp.msg = data['msg']
|
||||
return resp
|
||||
|
||||
else:
|
||||
return fallback
|
||||
|
||||
def save_response(self, req, resp):
|
||||
if req.from_morss_cache:
|
||||
# do not re-save (would reset the timing)
|
||||
return resp
|
||||
|
||||
data = resp.read()
|
||||
|
||||
self.save(req.get_full_url(), {
|
||||
'code': resp.code,
|
||||
'msg': resp.msg,
|
||||
'headers': resp.headers,
|
||||
'data': data,
|
||||
'timestamp': time.time()
|
||||
})
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
def http_request(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
req.from_morss_cache = False # to track whether it comes from cache
|
||||
|
||||
if 'etag' in headers:
|
||||
req.add_unredirected_header('If-None-Match', headers['etag'])
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if 'last-modified' in headers:
|
||||
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
|
||||
if data is not None:
|
||||
if 'etag' in data['headers']:
|
||||
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
|
||||
|
||||
if 'last-modified' in data['headers']:
|
||||
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
|
||||
|
||||
return req
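
The `policy` values documented in the comments above (`cached`, `refresh`, `offline`, or `None`) are the ones `custom_opener()` and `adv_get()` accept. An illustrative sketch, not taken from the repository:

```python
from morss.crawler import adv_get

url = 'http://feeds.bbci.co.uk/news/rss.xml'

adv_get(url, policy='refresh')  # ignore any cached copy and refetch
adv_get(url, policy='cached')   # use the cache if the url is there, fetch otherwise
adv_get(url, policy='offline')  # cache only: an HTTP 409 error surfaces when the
                                # url was never cached (see error_response above)
adv_get(url)                    # policy=None: just follow the http caching headers
```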
|
||||
|
||||
@@ -470,275 +567,111 @@ class CacheHandler(BaseHandler):
|
||||
# If 'None' is returned, try your chance with the next-available handler
|
||||
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
||||
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
# Here, we try to see whether we want to use data from cache (i.e.
|
||||
# return 'resp'), or whether we want to refresh the content (return
|
||||
# 'None')
|
||||
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(headers.get('pragma', ()))
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
if data is not None:
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
|
||||
cache_control += parse_http_list(data['headers'].get('pragma', ()))
|
||||
|
||||
cache_age = time.time() - timestamp
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
# list in a simple way what to do when
|
||||
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
|
||||
# we're just in the middle of a dirty trick, use cache
|
||||
pass
|
||||
cache_age = time.time() - data['timestamp']
|
||||
|
||||
elif self.force_min == -2:
|
||||
if code is not None:
|
||||
# already in cache, perfect, use cache
|
||||
pass
|
||||
# list in a simple way what to do in special cases
|
||||
|
||||
else:
|
||||
# raise an error, via urllib handlers
|
||||
headers['Morss'] = 'from_cache'
|
||||
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
|
||||
resp.msg = 'Conflict'
|
||||
return resp
|
||||
|
||||
elif code is None:
|
||||
# cache empty, refresh
|
||||
if data is not None and 'private' in cc_list and self.privacy == 'public':
|
||||
# private data but public cache, do not use cache
|
||||
# privacy concern, so handled first and foremost
|
||||
# (and doesn't need to be addressed anymore afterwards)
|
||||
return None
|
||||
|
||||
elif self.force_min == -1:
|
||||
# force use cache
|
||||
pass
|
||||
elif self.policy == 'offline':
|
||||
# use cache, or return an error
|
||||
return self.cached_response(
|
||||
req,
|
||||
error_response(409, 'Conflict', req.get_full_url())
|
||||
)
|
||||
|
||||
elif self.force_min == 0:
|
||||
elif self.policy == 'cached':
|
||||
# use cache, or fetch online
|
||||
return self.cached_response(req, None)
|
||||
|
||||
elif self.policy == 'refresh':
|
||||
# force refresh
|
||||
return None
|
||||
|
||||
elif code == 301 and cache_age < 7*24*3600:
|
||||
elif data is None:
|
||||
# we have already settled all the cases that don't need the cache.
|
||||
# all the following ones need the cached item
|
||||
return None
|
||||
|
||||
elif self.force_max is not None and cache_age > self.force_max:
|
||||
# older than we want, refresh
|
||||
return None
|
||||
|
||||
elif self.force_min is not None and cache_age < self.force_min:
|
||||
# recent enough, use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif data['code'] == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want
|
||||
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
||||
# if you want to bypass this (needed for a proper refresh)
|
||||
pass
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min is None and ('no-cache' in cc_list
|
||||
or 'no-store' in cc_list
|
||||
or ('private' in cc_list and not self.private_cache)):
|
||||
# kindly follow web servers indications, refresh
|
||||
# if the same settings are used all along, this section shouldn't be
|
||||
# of any use, since the page wouldn't be cached in the first place
|
||||
# the check is only performed "just in case"
|
||||
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
|
||||
# kindly follow web servers indications, refresh if the same
|
||||
# settings are used all along, this section shouldn't be of any use,
|
||||
# since the page wouldn't be cached in the first place the check is
|
||||
# only performed "just in case"
|
||||
# NB. NOT respected if force_min is set
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust him, if not, use force_min=0), use cache
|
||||
pass
|
||||
|
||||
elif self.force_min is not None and self.force_min > cache_age:
|
||||
# still recent enough for us, use cache
|
||||
pass
|
||||
# server says it's still fine (and we trust him, if not, use overrides), use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
else:
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
|
||||
# return the cache as a response. This code is reached with 'pass' above
|
||||
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
|
||||
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
|
||||
resp.msg = msg
|
||||
|
||||
return resp
|
||||
|
||||
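For illustration, a minimal sketch of how the policy branches above are exercised; CacheHandler(policy=...) is an assumption inferred from the self.policy checks, the actual constructor is outside this hunk.

from urllib.request import build_opener

opener = build_opener(CacheHandler(policy='cached'))  # assumed keyword, see note above
resp = opener.open('http://example.com/feed')         # served from cache when present, fetched otherwise
# policy='offline' would answer 409 Conflict instead of fetching when the URL is not cached,
# and policy='refresh' always goes back to the network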
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
|
||||
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
|
||||
|
||||
if resp.code == 304:
|
||||
return resp
|
||||
if resp.code == 304 and resp.url in self.cache:
|
||||
# we are hopefully the first after the HTTP handler, so no need
|
||||
# to re-run all the *_response
|
||||
# here: cached page, returning from cache
|
||||
return self.cached_response(req)
|
||||
|
||||
if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
|
||||
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
|
||||
# kindly follow web servers indications
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
|
||||
# kindly follow web servers indications (do not save & return)
|
||||
return resp
|
||||
|
||||
if resp.headers.get('Morss') == 'from_cache':
|
||||
# it comes from cache, so no need to save it again
|
||||
return resp
|
||||
else:
|
||||
# save
|
||||
return self.save_response(req, resp)
|
||||
|
||||
# save to disk
|
||||
data = resp.read()
|
||||
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
|
||||
|
||||
# the below is only needed because of 'resp.read()' above, as we can't
|
||||
# seek(0) on arbitrary file-like objects (e.g. sockets)
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
def http_error_304(self, req, fp, code, msg, headers):
|
||||
cache = list(self.load(req.get_full_url()))
|
||||
|
||||
if cache[0]:
|
||||
cache[-1] = time.time()
|
||||
self.save(req.get_full_url(), *cache)
|
||||
|
||||
new = Request(req.get_full_url(),
|
||||
headers=req.headers,
|
||||
unverifiable=True)
|
||||
|
||||
new.add_unredirected_header('Morss', 'from_304')
|
||||
# create a "fake" new request to just re-run through the various
|
||||
# handlers
|
||||
|
||||
return self.parent.open(new, timeout=req.timeout)
|
||||
|
||||
return None # when returning 'None', the next-available handler is used
|
||||
# the 'HTTPRedirectHandler' has no 'handler_order', i.e.
|
||||
# uses the default of 500, therefore executed after this
|
||||
else:
|
||||
return self.save_response(req, resp)
|
||||
|
||||
https_request = http_request
|
||||
https_open = http_open
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
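For illustration, the smallest cache that honours the "behave like a dict" contract above: __getitem__ must raise KeyError on a miss, which plain dict already does, so __contains__ and trim()/autotrim() from BaseCache work unchanged (a sketch, not taken from the diff).

class DictCache(BaseCache, dict):
    pass

cache = DictCache()
cache['http://example.com/'] = b'...'
'http://example.com/' in cache  # True, via BaseCache.__contains__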
import sqlite3
|
||||
|
||||
|
||||
class SQLiteCache(BaseCache):
|
||||
def __init__(self, filename=':memory:'):
|
||||
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
|
||||
self.con.execute('pragma journal_mode=WAL')
|
||||
|
||||
self.trim()
|
||||
|
||||
def __del__(self):
|
||||
self.con.close()
|
||||
|
||||
def trim(self):
|
||||
with self.con:
|
||||
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, url):
|
||||
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
|
||||
value = list(value)
|
||||
value[3] = sqlite3.Binary(value[3]) # data
|
||||
value = tuple(value)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
|
||||
|
||||
|
||||
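A usage sketch for the class above, keeping the (code, msg, headers, data, timestamp) tuple layout shown in __setitem__; the file path is hypothetical.

cache = SQLiteCache('/tmp/morss-cache.db')
cache['http://example.com/'] = (200, 'OK', 'content-type: text/plain', b'hello', int(time.time()))
code, msg, headers, data, timestamp = cache['http://example.com/']  # raises KeyError for unknown URLs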
import pymysql.cursors
|
||||
|
||||
|
||||
class MySQLCacheHandler(BaseCache):
|
||||
def __init__(self, user, password, database, host='localhost'):
|
||||
self.user = user
|
||||
self.password = password
|
||||
self.database = database
|
||||
self.host = host
|
||||
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
|
||||
|
||||
self.trim()
|
||||
|
||||
def cursor(self):
|
||||
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
||||
|
||||
def trim(self):
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, url):
|
||||
cursor = self.cursor()
|
||||
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
|
||||
(url,) + value + value)
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, value)
|
||||
|
||||
|
||||
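A behaviour sketch, not from the diff: CappedDict keeps at most CACHE_SIZE entries, and trim() evicts the oldest insertions first (OrderedDict order).

d = CappedDict()
for i in range(CACHE_SIZE + 10):
    d[str(i)] = i

d.trim()
assert len(d) == CACHE_SIZE  # the 10 oldest keys were evicted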
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'mysql':
|
||||
default_cache = MySQLCacheHandler(
|
||||
user = os.getenv('MYSQL_USER'),
|
||||
password = os.getenv('MYSQL_PWD'),
|
||||
database = os.getenv('MYSQL_DB'),
|
||||
host = os.getenv('MYSQL_HOST', 'localhost')
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'sqlite':
|
||||
if 'SQLITE_PATH' in os.environ:
|
||||
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
|
||||
|
||||
else:
|
||||
path = ':memory:'
|
||||
|
||||
default_cache = SQLiteCache(path)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
||||
|
||||
|
||||
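An assumed deployment sketch: the backend is chosen once at import time, so the environment has to be set before the module above is imported; the directory is hypothetical.

import os

os.environ['CACHE'] = 'sqlite'
os.environ['SQLITE_PATH'] = '/var/cache/morss'  # hypothetical; falls back to ':memory:' when unset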
if 'IGNORE_SSL' in os.environ:
|
||||
import ssl
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
@@ -73,7 +73,7 @@ item_updated = atom03:updated
|
||||
mode = json
|
||||
|
||||
mimetype = application/json
|
||||
timeformat = %Y-%m-%dT%H:%M:%SZ
|
||||
timeformat = %Y-%m-%dT%H:%M:%S%z
|
||||
base = {}
|
||||
|
||||
title = title
|
||||
@@ -90,9 +90,6 @@ item_updated = updated
|
||||
[html]
|
||||
mode = html
|
||||
|
||||
path =
|
||||
http://localhost/
|
||||
|
||||
title = //div[@id='header']/h1
|
||||
desc = //div[@id='header']/p
|
||||
items = //div[@id='content']/div
|
||||
|
morss/feeds.py (117 lines changed)
@@ -15,35 +15,31 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import re
|
||||
import json
|
||||
import csv
|
||||
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatch
|
||||
|
||||
from lxml import etree
|
||||
from dateutil import tz
|
||||
import dateutil.parser
|
||||
from copy import deepcopy
|
||||
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
from lxml import etree
|
||||
|
||||
from .readabilite import parse as html_parse
|
||||
from .util import *
|
||||
|
||||
json.encoder.c_make_encoder = None
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from StringIO import StringIO
|
||||
from ConfigParser import RawConfigParser
|
||||
from StringIO import StringIO
|
||||
except ImportError:
|
||||
# python 3
|
||||
from io import StringIO
|
||||
from configparser import RawConfigParser
|
||||
from io import StringIO
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -55,7 +51,7 @@ except NameError:
|
||||
|
||||
def parse_rules(filename=None):
|
||||
if not filename:
|
||||
filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
|
||||
filename = pkg_path('feedify.ini')
|
||||
|
||||
config = RawConfigParser()
|
||||
config.read(filename)
|
||||
@@ -69,18 +65,10 @@ def parse_rules(filename=None):
|
||||
# for each rule
|
||||
|
||||
if rules[section][arg].startswith('file:'):
|
||||
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
except IOError:
|
||||
pass
|
||||
path = data_path('www', rules[section][arg][5:])
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
elif '\n' in rules[section][arg]:
|
||||
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
||||
@@ -88,20 +76,25 @@ def parse_rules(filename=None):
|
||||
return rules
|
||||
|
||||
|
||||
def parse(data, url=None, encoding=None):
|
||||
def parse(data, url=None, encoding=None, ruleset=None):
|
||||
" Determine which ruleset to use "
|
||||
|
||||
rulesets = parse_rules()
|
||||
if ruleset is not None:
|
||||
rulesets = [ruleset]
|
||||
|
||||
else:
|
||||
rulesets = parse_rules().values()
|
||||
|
||||
parsers = [FeedXML, FeedHTML, FeedJSON]
|
||||
|
||||
# 1) Look for a ruleset based on path
|
||||
|
||||
if url is not None:
|
||||
for ruleset in rulesets.values():
|
||||
for ruleset in rulesets:
|
||||
if 'path' in ruleset:
|
||||
for path in ruleset['path']:
|
||||
if fnmatch(url, path):
|
||||
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
|
||||
return parser(data, ruleset, encoding=encoding)
|
||||
|
||||
# 2) Try each and every parser
|
||||
@@ -111,9 +104,6 @@ def parse(data, url=None, encoding=None):
|
||||
# 3b) See if .items matches anything
|
||||
|
||||
for parser in parsers:
|
||||
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
|
||||
try:
|
||||
feed = parser(data, encoding=encoding)
|
||||
|
||||
@@ -124,13 +114,17 @@ def parse(data, url=None, encoding=None):
|
||||
else:
|
||||
# parsing worked, now we try the rulesets
|
||||
|
||||
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
# try anyway if no 'mode' specified
|
||||
|
||||
for ruleset in ruleset_candidates:
|
||||
feed.rules = ruleset
|
||||
|
||||
try:
|
||||
feed.items[0]
|
||||
|
||||
except (AttributeError, IndexError):
|
||||
except (AttributeError, IndexError, TypeError):
|
||||
# parsing and or item picking did not work out
|
||||
pass
|
||||
|
||||
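A sketch of the new ruleset= keyword: bypass feedify.ini and scrape an arbitrary HTML page with hand-written XPath rules, mirroring what FeedFetch builds for the :items option; html_bytes is assumed to have been fetched elsewhere.

ruleset = {
    'mode': 'html',
    'title': '//head/title',
    'items': "//div[@id='content']/div",
    'item_title': '.',
    'item_link': '(.|.//a|ancestor::a)/@href',
}
feed = parse(html_bytes, encoding='utf-8', ruleset=ruleset)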
@@ -193,11 +187,12 @@ class ParserBase(object):
|
||||
return self.convert(FeedHTML).tostring(**k)
|
||||
|
||||
def convert(self, TargetParser):
|
||||
if type(self) == TargetParser:
|
||||
return self
|
||||
|
||||
target = TargetParser()
|
||||
|
||||
if type(self) == TargetParser and self.rules == target.rules:
|
||||
# check both type *AND* rules (e.g. when going from freeform xml to rss)
|
||||
return self
|
||||
|
||||
for attr in target.dic:
|
||||
if attr == 'items':
|
||||
for item in self.items:
|
||||
@@ -366,7 +361,13 @@ class ParserXML(ParserBase):
|
||||
|
||||
def rule_search_all(self, rule):
|
||||
try:
|
||||
return self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
match = self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
if isinstance(match, str):
|
||||
# some xpath rules return a single string instead of an array (e.g. concatenate() )
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -429,7 +430,7 @@ class ParserXML(ParserBase):
|
||||
|
||||
match = self.rule_search(rrule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if key is not None:
|
||||
@@ -440,7 +441,7 @@ class ParserXML(ParserBase):
|
||||
self._clean_node(match)
|
||||
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
||||
|
||||
if self.rules['mode'] == 'html':
|
||||
if self.rules.get('mode') == 'html':
|
||||
match.find('div').drop_tag() # not supported by lxml.etree
|
||||
|
||||
else: # i.e. if atom
|
||||
@@ -456,7 +457,7 @@ class ParserXML(ParserBase):
|
||||
def rule_str(self, rule):
|
||||
match = self.rule_search(rule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
html_rich = ('atom' in rule or self.mode == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if isinstance(match, etree._Element):
|
||||
@@ -489,7 +490,14 @@ class ParserHTML(ParserXML):
|
||||
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
|
||||
rule = re.sub(pattern, repl, rule)
|
||||
|
||||
return self.root.xpath(rule)
|
||||
match = self.root.xpath(rule)
|
||||
|
||||
if isinstance(match, str):
|
||||
# for some xpath rules, see XML parser
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -508,24 +516,31 @@ class ParserHTML(ParserXML):
|
||||
|
||||
|
||||
def parse_time(value):
|
||||
# parsing per se
|
||||
if value is None or value == 0:
|
||||
return None
|
||||
time = None
|
||||
|
||||
elif isinstance(value, basestring):
|
||||
if re.match(r'^[0-9]+$', value):
|
||||
return datetime.fromtimestamp(int(value), tz.tzutc())
|
||||
time = datetime.fromtimestamp(int(value))
|
||||
|
||||
else:
|
||||
return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
|
||||
time = dateutil.parser.parse(value)
|
||||
|
||||
elif isinstance(value, int):
|
||||
return datetime.fromtimestamp(value, tz.tzutc())
|
||||
time = datetime.fromtimestamp(value)
|
||||
|
||||
elif isinstance(value, datetime):
|
||||
return value
|
||||
time = value
|
||||
|
||||
else:
|
||||
return None
|
||||
time = None
|
||||
|
||||
# add default time zone if none set
|
||||
if time is not None and time.tzinfo is None:
|
||||
time = time.replace(tzinfo=tz.tzutc())
|
||||
|
||||
return time
|
||||
|
||||
|
||||
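A behaviour sketch of the rewritten parse_time(): every successful branch now falls through to the same final step, which only attaches tzutc() to naive results.

parse_time('2022-01-01T00:00:01+01:00')  # keeps its +01:00 offset
parse_time('2022-01-01T00:00:01')        # naive input, gets tzutc() attached
parse_time(None)                         # stays None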
class ParserJSON(ParserBase):
|
||||
@@ -684,7 +699,7 @@ class Feed(object):
|
||||
try:
|
||||
setattr(item, attr, new[attr])
|
||||
|
||||
except (IndexError, TypeError):
|
||||
except (KeyError, IndexError, TypeError):
|
||||
pass
|
||||
|
||||
return item
|
||||
@@ -800,6 +815,8 @@ class FeedJSON(Feed, ParserJSON):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
|
||||
|
@@ -16,30 +16,26 @@
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from dateutil import tz
|
||||
|
||||
from fnmatch import fnmatch
|
||||
import re
|
||||
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
|
||||
from . import feeds
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
|
||||
from . import caching, crawler, feeds, readabilite
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from httplib import HTTPException
|
||||
from urlparse import urlparse, urljoin, parse_qs
|
||||
from urlparse import parse_qs, urljoin, urlparse
|
||||
except ImportError:
|
||||
# python 3
|
||||
from http.client import HTTPException
|
||||
from urllib.parse import urlparse, urljoin, parse_qs
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
|
||||
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
||||
@@ -64,7 +60,7 @@ def log(txt):
|
||||
|
||||
else:
|
||||
# when using internal server or cli
|
||||
print(repr(txt))
|
||||
print(repr(txt), file=sys.stderr)
|
||||
|
||||
|
||||
def len_html(txt):
|
||||
@@ -91,12 +87,12 @@ class Options:
|
||||
else:
|
||||
self.options = options or {}
|
||||
|
||||
def __getattr__(self, key):
|
||||
def __getattr__(self, key, default=None):
|
||||
if key in self.options:
|
||||
return self.options[key]
|
||||
|
||||
else:
|
||||
return False
|
||||
return default
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.options[key] = value
|
||||
@@ -104,6 +100,8 @@ class Options:
|
||||
def __contains__(self, key):
|
||||
return key in self.options
|
||||
|
||||
get = __getitem__ = __getattr__
|
||||
|
||||
|
||||
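A sketch of the reworked accessors: unknown keys now come back as None instead of False, and .get() and [] are aliases of attribute access.

opts = Options({'order': 'newest'})
opts.order                         # 'newest'
opts.missing                       # None (previously False)
opts.get('title', '//head/title')  # the default is used when the key is absent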
def ItemFix(item, options, feedurl='/'):
|
||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||
@@ -197,21 +195,20 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
log(item.link)
|
||||
|
||||
# download
|
||||
delay = -1
|
||||
|
||||
if fast or options.fast:
|
||||
if fast or options.cache:
|
||||
# force cache, don't fetch
|
||||
delay = -2
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
# force refresh
|
||||
delay = 0
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
delay = 24*60*60 # 24h
|
||||
policy = None
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
|
||||
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException) as e:
|
||||
log('http error')
|
||||
@@ -221,7 +218,11 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
log('non-text page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
if not req['data']:
|
||||
log('empty page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
@@ -265,33 +266,43 @@ def FeedFetch(url, options):
|
||||
# fetch feed
|
||||
delay = DELAY
|
||||
|
||||
if options.force:
|
||||
delay = 0
|
||||
if options.cache:
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
policy = None
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=url, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
|
||||
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException):
|
||||
raise MorssException('Error downloading feed')
|
||||
|
||||
if options.items:
|
||||
# using custom rules
|
||||
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
|
||||
ruleset = {}
|
||||
|
||||
rss.rules['title'] = options.title if options.title else '//head/title'
|
||||
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
|
||||
ruleset['items'] = options.items
|
||||
|
||||
rss.rules['items'] = options.items
|
||||
if options.mode:
|
||||
ruleset['mode'] = options.mode
|
||||
|
||||
rss.rules['item_title'] = options.item_title if options.item_title else '.'
|
||||
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
|
||||
ruleset['title'] = options.get('title', '//head/title')
|
||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
||||
|
||||
ruleset['item_title'] = options.get('item_title', '.')
|
||||
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
|
||||
|
||||
if options.item_content:
|
||||
rss.rules['item_content'] = options.item_content
|
||||
ruleset['item_content'] = options.item_content
|
||||
|
||||
if options.item_time:
|
||||
rss.rules['item_time'] = options.item_time
|
||||
ruleset['item_time'] = options.item_time
|
||||
|
||||
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
|
||||
rss = rss.convert(feeds.FeedXML)
|
||||
|
||||
else:
|
||||
@@ -321,16 +332,23 @@ def FeedGather(rss, url, options):
|
||||
if options.cache:
|
||||
max_time = 0
|
||||
|
||||
if options.newest:
|
||||
# :newest take the newest items
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
|
||||
# sort
|
||||
sorted_items = list(rss.items)
|
||||
|
||||
else:
|
||||
# default behavior, take the first items (in appearing order)
|
||||
sorted_items = list(rss.items)
|
||||
if options.order == 'last':
|
||||
# `first` does nothing from a practical standpoint, so only `last` needs
|
||||
# to be addressed
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
elif options.order in ['newest', 'oldest']:
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
|
||||
|
||||
if options.order == 'newest':
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
for i, item in enumerate(sorted_items):
|
||||
# hard cap
|
||||
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
|
||||
log('dropped')
|
||||
item.remove()
|
||||
@@ -343,6 +361,7 @@ def FeedGather(rss, url, options):
|
||||
|
||||
item = ItemFix(item, options, url)
|
||||
|
||||
# soft cap
|
||||
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
||||
if not options.proxy:
|
||||
if ItemFill(item, options, url, True) is False:
|
||||
@@ -409,7 +428,7 @@ def process(url, cache=None, options=None):
|
||||
options = Options(options)
|
||||
|
||||
if cache:
|
||||
crawler.default_cache = crawler.SQLiteCache(cache)
|
||||
caching.default_cache = caching.DiskCacheHandler(cache)
|
||||
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
|
@@ -15,22 +15,22 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
import bs4.builder._lxml
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import lxml.html.soupparser
|
||||
|
||||
|
||||
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
|
||||
def default_parser(self, encoding):
|
||||
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
|
||||
|
||||
|
||||
def parse(data, encoding=None):
|
||||
if encoding:
|
||||
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
|
||||
|
||||
else:
|
||||
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
||||
|
||||
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
|
||||
|
||||
return lxml.html.fromstring(data, parser=parser)
|
||||
kwargs = {'from_encoding': encoding} if encoding else {}
|
||||
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs)
|
||||
|
||||
|
||||
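A sketch of the rewritten parse(): BeautifulSoup's encoding detection now feeds lxml directly through soupparser instead of going through a prettify('utf-8') round-trip.

root = parse(b'<html><body><p>succ\xe8s</p></body></html>', encoding='iso-8859-1')
root.findtext('.//p')  # 'succès'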
def count_words(string):
|
||||
@@ -43,6 +43,8 @@ def count_words(string):
|
||||
if string is None:
|
||||
return 0
|
||||
|
||||
string = string.strip()
|
||||
|
||||
i = 0
|
||||
count = 0
|
||||
|
||||
@@ -152,15 +154,20 @@ def score_all(node):
|
||||
|
||||
for child in node:
|
||||
score = score_node(child)
|
||||
child.attrib['morss_own_score'] = str(float(score))
|
||||
set_score(child, score, 'morss_own_score')
|
||||
|
||||
if score > 0 or len(list(child.iterancestors())) <= 2:
|
||||
spread_score(child, score)
|
||||
score_all(child)
|
||||
|
||||
|
||||
def set_score(node, value):
|
||||
node.attrib['morss_score'] = str(float(value))
|
||||
def set_score(node, value, label='morss_score'):
|
||||
try:
|
||||
node.attrib[label] = str(float(value))
|
||||
|
||||
except KeyError:
|
||||
# catch issues with e.g. html comments
|
||||
pass
|
||||
|
||||
|
||||
def get_score(node):
|
||||
@@ -200,6 +207,12 @@ def clean_root(root, keep_threshold=None):
|
||||
def clean_node(node, keep_threshold=None):
|
||||
parent = node.getparent()
|
||||
|
||||
# remove comments
|
||||
if (isinstance(node, lxml.html.HtmlComment)
|
||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
if parent is None:
|
||||
# this is <html/> (or a removed element waiting for GC)
|
||||
return
|
||||
@@ -210,7 +223,7 @@ def clean_node(node, keep_threshold=None):
|
||||
return
|
||||
|
||||
# high score, so keep
|
||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
||||
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||
return
|
||||
|
||||
gdparent = parent.getparent()
|
||||
@@ -231,11 +244,6 @@ def clean_node(node, keep_threshold=None):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove comments
|
||||
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove if too many kids & too high link density
|
||||
wc = count_words(node.text_content())
|
||||
if wc != 0 and len(list(node.iter())) > 3:
|
||||
@@ -293,28 +301,26 @@ def clean_node(node, keep_threshold=None):
|
||||
gdparent.insert(gdparent.index(parent)+1, new_node)
|
||||
|
||||
|
||||
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
||||
ancestorsA = list(nodeA.iterancestors())
|
||||
ancestorsB = list(nodeB.iterancestors())
|
||||
def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
||||
ancestors_a = list(node_a.iterancestors())
|
||||
ancestors_b = list(node_b.iterancestors())
|
||||
|
||||
if max_depth is not None:
|
||||
ancestorsA = ancestorsA[:max_depth]
|
||||
ancestorsB = ancestorsB[:max_depth]
|
||||
ancestors_a = ancestors_a[:max_depth]
|
||||
ancestors_b = ancestors_b[:max_depth]
|
||||
|
||||
ancestorsA.insert(0, nodeA)
|
||||
ancestorsB.insert(0, nodeB)
|
||||
ancestors_a.insert(0, node_a)
|
||||
ancestors_b.insert(0, node_b)
|
||||
|
||||
for ancestorA in ancestorsA:
|
||||
if ancestorA in ancestorsB:
|
||||
return ancestorA
|
||||
for ancestor_a in ancestors_a:
|
||||
if ancestor_a in ancestors_b:
|
||||
return ancestor_a
|
||||
|
||||
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
|
||||
return node_a # should always find one tho, at least <html/>, but needed for max_depth
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
def get_best_node(html, threshold=5):
|
||||
# score all nodes
|
||||
score_all(html)
|
||||
|
||||
# rank all nodes (largest to smallest)
|
||||
@@ -331,9 +337,33 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||
else:
|
||||
best = ranked_nodes[0]
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
|
||||
if xpath is not None:
|
||||
xpath_match = html.xpath(xpath)
|
||||
|
||||
if len(xpath_match):
|
||||
best = xpath_match[0]
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
if best is None:
|
||||
# if threshold not met
|
||||
return None
|
||||
|
||||
# clean up
|
||||
if not debug:
|
||||
keep_threshold = get_score(ranked_nodes[0]) * 3/4
|
||||
keep_threshold = get_score(best) * 3/4
|
||||
clean_root(best, keep_threshold)
|
||||
|
||||
# check for spammy content (links only)
|
||||
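A sketch of the new xpath= escape hatch: when the caller already knows where the article lives, node scoring is skipped and the first match is used directly; raw_html is assumed to have been fetched elsewhere.

article = get_article(raw_html, url='http://example.com/post',
                      encoding_in='utf-8', encoding_out='unicode',
                      xpath='//div[@id="content"]')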
@@ -352,6 +382,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
|
morss/util.py (new file, 57 lines)
@@ -0,0 +1,57 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
|
||||
def pkg_path(*path_elements):
|
||||
return os.path.join(os.path.dirname(__file__), *path_elements)
|
||||
|
||||
|
||||
data_path_base = None
|
||||
|
||||
|
||||
def data_path(*path_elements):
|
||||
global data_path_base
|
||||
|
||||
path = os.path.join(*path_elements)
|
||||
|
||||
if data_path_base is not None:
|
||||
return os.path.join(data_path_base, path)
|
||||
|
||||
bases = [
|
||||
os.path.join(sys.prefix, 'share/morss'), # when installed as root
|
||||
pkg_path('../../../share/morss'),
|
||||
pkg_path('../../../../share/morss'),
|
||||
pkg_path('../share/morss'), # for `pip install --target=dir morss`
|
||||
pkg_path('..'), # when running from source tree
|
||||
]
|
||||
|
||||
if 'DATA_PATH' in os.environ:
|
||||
bases.append(os.environ['DATA_PATH'])
|
||||
|
||||
for base in bases:
|
||||
full_path = os.path.join(base, path)
|
||||
|
||||
if os.path.isfile(full_path):
|
||||
data_path_base = os.path.abspath(base)
|
||||
return data_path(path)
|
||||
|
||||
else:
|
||||
raise IOError()
|
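A usage sketch for the two helpers above: pkg_path() resolves inside the installed package, while data_path() walks the share/morss candidates (or DATA_PATH) and remembers the first base that actually contains the requested file.

feedify_ini = pkg_path('feedify.ini')        # .../morss/feedify.ini
index_html = data_path('www', 'index.html')  # raises IOError if no candidate base has it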
morss/wsgi.py (141 lines changed)
@@ -15,16 +15,16 @@
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import cgitb
|
||||
import mimetypes
|
||||
import os.path
|
||||
import re
|
||||
import lxml.etree
|
||||
|
||||
import cgitb
|
||||
import wsgiref.util
|
||||
import wsgiref.simple_server
|
||||
import sys
|
||||
import wsgiref.handlers
|
||||
import mimetypes
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.util
|
||||
|
||||
import lxml.etree
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -33,13 +33,12 @@ except ImportError:
|
||||
# python 3
|
||||
from urllib.parse import unquote
|
||||
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
from .morss import FeedFetch, FeedGather, FeedFormat
|
||||
from .morss import Options, log, TIMEOUT, DELAY, MorssException
|
||||
from . import caching, crawler, readabilite
|
||||
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
|
||||
MorssException, Options, log)
|
||||
from .util import data_path
|
||||
|
||||
|
||||
PORT = int(os.getenv('PORT', 8080))
|
||||
PORT = int(os.getenv('PORT', 8000))
|
||||
|
||||
|
||||
def parse_options(options):
|
||||
@@ -50,7 +49,7 @@ def parse_options(options):
|
||||
split = option.split('=', 1)
|
||||
|
||||
if len(split) > 1:
|
||||
out[split[0]] = split[1]
|
||||
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
|
||||
|
||||
else:
|
||||
out[split[0]] = True
|
||||
@@ -58,14 +57,18 @@ def parse_options(options):
|
||||
return out
|
||||
|
||||
|
||||
def get_path(environ):
|
||||
def request_uri(environ):
|
||||
if 'REQUEST_URI' in environ:
|
||||
# when running on Apache
|
||||
url = unquote(environ['REQUEST_URI'][1:])
|
||||
# when running on Apache/uwsgi
|
||||
url = environ['REQUEST_URI']
|
||||
|
||||
elif 'RAW_URI' in environ:
|
||||
# gunicorn
|
||||
url = environ['RAW_URI']
|
||||
|
||||
else:
|
||||
# when using internal server
|
||||
url = environ['PATH_INFO'][1:]
|
||||
# when using other servers
|
||||
url = environ['PATH_INFO']
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
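A sketch of what request_uri() returns for the three server families handled above, using minimal fake environs rather than full WSGI dictionaries.

request_uri({'REQUEST_URI': '/:order=newest/http://x', 'QUERY_STRING': ''})  # Apache / uwsgi
request_uri({'RAW_URI': '/:get=page/http://x', 'QUERY_STRING': ''})          # gunicorn
request_uri({'PATH_INFO': '/http://x', 'QUERY_STRING': 'a=b'})               # wsgiref and others -> '/http://x?a=b'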
@@ -76,19 +79,13 @@ def get_path(environ):
|
||||
def cgi_parse_environ(environ):
|
||||
# get options
|
||||
|
||||
url = get_path(environ)
|
||||
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
|
||||
url = request_uri(environ)[1:]
|
||||
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
split = url.split('/', 1)
|
||||
|
||||
raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
|
||||
|
||||
if len(split) > 1:
|
||||
url = split[1]
|
||||
|
||||
else:
|
||||
url = ''
|
||||
parts = url.split('/', 1)
|
||||
raw_options = parts[0].split(':')[1:]
|
||||
url = parts[1] if len(parts) > 1 else ''
|
||||
|
||||
else:
|
||||
raw_options = []
|
||||
@@ -164,33 +161,28 @@ def middleware(func):
|
||||
def cgi_file_handler(environ, start_response, app):
|
||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||
|
||||
url = get_path(environ)
|
||||
url = request_uri(environ)[1:]
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
|
||||
# if it is a legitimate url (no funny relative paths)
|
||||
paths = [
|
||||
os.path.join(sys.prefix, 'share/morss/www', url),
|
||||
os.path.join(os.path.dirname(__file__), '../www', url)
|
||||
]
|
||||
try:
|
||||
path = data_path('www', url)
|
||||
f = open(path, 'rb')
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
f = open(path, 'rb')
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
pass
|
||||
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
continue
|
||||
|
||||
else:
|
||||
# file successfully open
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
else:
|
||||
# file successfully open
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
|
||||
# regex didn't validate or no file found
|
||||
return app(environ, start_response)
|
||||
@@ -200,32 +192,36 @@ def cgi_get(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
# get page
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
if options['get'] in ('page', 'article'):
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
|
||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||
if options.get == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
if req['contenttype'] in crawler.MIMETYPE['html']:
|
||||
if options['get'] == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
|
||||
elif options.get == 'article':
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
else: # i.e. options['get'] == 'article'
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
|
||||
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
|
||||
output = req['data']
|
||||
|
||||
else:
|
||||
raise MorssException('no :get option passed')
|
||||
raise MorssException('unsupported mimetype')
|
||||
|
||||
else:
|
||||
output = req['data']
|
||||
raise MorssException('no :get option passed')
|
||||
|
||||
# return html page
|
||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [output]
|
||||
|
||||
@@ -255,9 +251,9 @@ def cgi_error_handler(environ, start_response, app):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
headers = {'status': '500 Oops', 'content-type': 'text/html'}
|
||||
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR: %s' % repr(e), force=True)
|
||||
log('ERROR: %s' % repr(e))
|
||||
return [cgitb.html(sys.exc_info())]
|
||||
|
||||
|
||||
@@ -283,13 +279,20 @@ def cgi_handle_request():
|
||||
wsgiref.handlers.CGIHandler().run(app)
|
||||
|
||||
|
||||
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
|
||||
def get_environ(self):
|
||||
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
|
||||
env['REQUEST_URI'] = self.path
|
||||
return env
|
||||
|
||||
|
||||
def cgi_start_server():
|
||||
crawler.default_cache.autotrim()
|
||||
caching.default_cache.autotrim()
|
||||
|
||||
print('Serving http://localhost:%s/' % PORT)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
||||
httpd.serve_forever()
|
||||
|
||||
|
||||
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
||||
crawler.default_cache.autotrim()
|
||||
caching.default_cache.autotrim()
|
||||
|
setup.py (52 lines changed)
@@ -1,24 +1,60 @@
|
||||
from setuptools import setup
|
||||
from datetime import datetime
|
||||
from glob import glob
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def get_version():
|
||||
with open('morss/__init__.py', 'r+') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# look for hard coded version number
|
||||
for i in range(len(lines)):
|
||||
if lines[i].startswith('__version__'):
|
||||
version = lines[i].split('"')[1]
|
||||
break
|
||||
|
||||
# create (& save) one if none found
|
||||
if version == '':
|
||||
version = datetime.now().strftime('%Y%m%d.%H%M')
|
||||
lines[i] = '__version__ = "' + version + '"\n'
|
||||
|
||||
file.seek(0)
|
||||
file.writelines(lines)
|
||||
|
||||
# return version number
|
||||
return version
|
||||
|
||||
package_name = 'morss'
|
||||
|
||||
setup(
|
||||
name = package_name,
|
||||
version = get_version(),
|
||||
description = 'Get full-text RSS feeds',
|
||||
author = 'pictuga, Samuel Marks',
|
||||
author_email = 'contact at pictuga dot com',
|
||||
long_description = open('README.md').read(),
|
||||
long_description_content_type = 'text/markdown',
|
||||
author = 'pictuga',
|
||||
author_email = 'contact@pictuga.com',
|
||||
url = 'http://morss.it/',
|
||||
download_url = 'https://git.pictuga.com/pictuga/morss',
|
||||
project_urls = {
|
||||
'Source': 'https://git.pictuga.com/pictuga/morss',
|
||||
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
|
||||
},
|
||||
license = 'AGPL v3',
|
||||
packages = [package_name],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||
extras_require = {
|
||||
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
|
||||
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
|
||||
},
|
||||
python_requires = '>=2.7',
|
||||
package_data = {package_name: ['feedify.ini']},
|
||||
data_files = [
|
||||
('share/' + package_name, ['README.md', 'LICENSE']),
|
||||
('share/' + package_name + '/www', glob('www/*.*')),
|
||||
('share/' + package_name + '/www/cgi', [])
|
||||
],
|
||||
entry_points = {
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
|
||||
})
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main'],
|
||||
},
|
||||
scripts = ['morss-helper'],
|
||||
)
|
||||
|
tests/conftest.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
import os
|
||||
import os.path
|
||||
import threading
|
||||
|
||||
import pytest
|
||||
|
||||
try:
|
||||
# python2
|
||||
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
|
||||
from SimpleHTTPServer import SimpleHTTPRequestHandler
|
||||
except:
|
||||
# python3
|
||||
from http.server import (BaseHTTPRequestHandler, HTTPServer,
|
||||
SimpleHTTPRequestHandler)
|
||||
|
||||
class HTTPReplayHandler(SimpleHTTPRequestHandler):
|
||||
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
|
||||
|
||||
directory = os.path.join(os.path.dirname(__file__), './samples/')
|
||||
|
||||
__init__ = BaseHTTPRequestHandler.__init__
|
||||
|
||||
def do_GET(self):
|
||||
path = self.translate_path(self.path)
|
||||
|
||||
if os.path.isdir(path):
|
||||
f = self.list_directory(path)
|
||||
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
try:
|
||||
self.copyfile(f, self.wfile)
|
||||
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
class MuteHTTPServer(HTTPServer):
|
||||
def handle_error(self, request, client_address):
|
||||
# mute errors
|
||||
pass
|
||||
|
||||
def make_server(port=8888):
|
||||
print('Serving http://localhost:%s/' % port)
|
||||
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
|
||||
|
||||
@pytest.fixture
|
||||
def replay_server():
|
||||
httpd = make_server()
|
||||
thread = threading.Thread(target=httpd.serve_forever)
|
||||
thread.start()
|
||||
|
||||
yield
|
||||
|
||||
httpd.shutdown()
|
||||
thread.join()
|
||||
|
||||
if __name__ == '__main__':
|
||||
httpd = make_server()
|
||||
httpd.serve_forever()
|
tests/samples/200-ok.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
success
|
tests/samples/301-redirect-abs.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: /200-ok.txt
|
||||
|
tests/samples/301-redirect-rel.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: ./200-ok.txt
|
||||
|
tests/samples/301-redirect-url.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: http://localhost:8888/200-ok.txt
|
||||
|
tests/samples/308-redirect.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 308 Permanent Redirect
|
||||
location: /200-ok.txt
|
||||
|
||||
/200-ok.txt
|
tests/samples/alternate-abs.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/enc-gb2312-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=gb2312
|
||||
|
||||
成功
|
tests/samples/enc-gb2312-meta.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta charset="gb2312"/></head>
|
||||
<body>
|
||||
成功
|
||||
</body></html>
|
tests/samples/enc-iso-8859-1-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=iso-8859-1
|
||||
|
||||
succès
|
tests/samples/enc-iso-8859-1-missing.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
succès
|
tests/samples/enc-utf-8-header.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=UTF-8
|
||||
|
||||
succès
|
tests/samples/feed-atom-utf-8.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content type="html">!ITEM_CONTENT!</content>
|
||||
<link href="!ITEM_LINK!"/>
|
||||
<updated>2022-01-01T00:00:01+01:00</updated>
|
||||
<published>2022-01-01T00:00:02+01:00</published>
|
||||
</entry>
|
||||
</feed>
|
tests/samples/feed-atom03-utf-8.txt (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: application/xml
|
||||
|
||||
<?xml version='1.0' encoding='utf-8' ?>
|
||||
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content>!ITEM_CONTENT!</content>
|
||||
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
|
||||
</entry>
|
||||
</feed>
|
tests/samples/feed-html-utf-8.txt (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/html; charset=utf-8
|
||||
|
||||
<html>
|
||||
<head></head>
|
||||
|
||||
<body>
|
||||
<div id="header">
|
||||
<h1>!TITLE!</h1>
|
||||
<p>!DESC!</p>
|
||||
</div>
|
||||
|
||||
<div id="content">
|
||||
<div class="item">
|
||||
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
|
||||
<div class="desc">!ITEM_DESC!</div>
|
||||
<div class="content">!ITEM_CONTENT!</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
tests/samples/feed-json-utf-8.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json; charset=utf-8
|
||||
|
||||
{
|
||||
"title": "!TITLE!",
|
||||
"desc": "!DESC!",
|
||||
"items": [
|
||||
{
|
||||
"title": "!ITEM_TITLE!",
|
||||
"time": "2022-01-01T00:00:01+0100",
|
||||
"url": "!ITEM_LINK!",
|
||||
"desc": "!ITEM_DESC!",
|
||||
"content": "!ITEM_CONTENT!"
|
||||
}
|
||||
]
|
||||
}
|
tests/samples/feed-rss-channel-utf-8.txt (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
<channel>
|
||||
<title>!TITLE!</title>
|
||||
<description>!DESC!</description>
|
||||
<item>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
|
||||
<link>!ITEM_LINK!</link>
|
||||
<description>!ITEM_DESC!</description>
|
||||
<content:encoded>!ITEM_CONTENT!</content:encoded>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
tests/samples/gzip.txt (new binary file, content not shown)
tests/samples/header-refresh.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
HTTP/1.1 200 OK
|
||||
refresh: 0;url=/200-ok.txt
|
||||
|
tests/samples/meta-redirect-abs.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/meta-redirect-rel.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/meta-redirect-url.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
tests/samples/size-1MiB.txt (new file, 9220 lines; diff suppressed, file too large)
tests/test_crawler.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+import pytest
+
+from morss.crawler import *
+
+
+def test_get(replay_server):
+    assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
+
+def test_adv_get(replay_server):
+    assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
+
+@pytest.mark.parametrize('before,after', [
+    (b'http://localhost:8888/', 'http://localhost:8888/'),
+    ('localhost:8888/', 'http://localhost:8888/'),
+    ('http:/localhost:8888/', 'http://localhost:8888/'),
+    ('http://localhost:8888/&/', 'http://localhost:8888/&/'),
+    ('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
+    ('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
+    ('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
+    ])
+def test_sanitize_url(before, after):
+    assert sanitize_url(before) == after
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
+def test_size_limit_handler(replay_server, opener):
+    assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
+def test_gzip_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
+@pytest.mark.parametrize('url', [
+    'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
+    'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
+    'enc-utf-8-header.txt',
+    ])
+def test_encoding_fix_handler(replay_server, opener, url):
+    out = adv_get('http://localhost:8888/%s' % url)
+    out = out['data'].decode(out['encoding'])
+    assert 'succes' in out or 'succès' in out or '成功' in out
+
+@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
+def test_alternate_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
+def test_http_equiv_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
+def test_http_all_redirect_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
+def test_http_refresh_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
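Note: the tests above also serve as a usage reference for morss.crawler. Below is a minimal sketch based only on the behaviour visible in this diff (sanitize_url() normalises a URL string, adv_get() returns a dict with 'data' bytes and the detected 'encoding'); the replay_server fixture itself is defined outside this excerpt and is not needed for plain use.

    # Sketch based solely on the calls exercised in tests/test_crawler.py above.
    from morss.crawler import adv_get, sanitize_url

    url = sanitize_url('localhost:8888/200-ok.txt')  # -> 'http://localhost:8888/200-ok.txt'
    out = adv_get(url)                               # returns a dict with 'data' and 'encoding', as asserted above
    print(out['data'].decode(out['encoding']))       # decode the body with the detected encoding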
108
tests/test_feeds.py
Normal file
@@ -0,0 +1,108 @@
+import pytest
+
+from morss.crawler import adv_get
+from morss.feeds import *
+
+
+def get_feed(url):
+    url = 'http://localhost:8888/%s' % url
+    out = adv_get(url)
+    feed = parse(out['data'], url=url, encoding=out['encoding'])
+    return feed
+
+def check_feed(feed):
+    # NB. time and updated not covered
+    assert feed.title == '!TITLE!'
+    assert feed.desc == '!DESC!'
+    assert feed[0] == feed.items[0]
+    assert feed[0].title == '!ITEM_TITLE!'
+    assert feed[0].link == '!ITEM_LINK!'
+    assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
+    assert '!ITEM_CONTENT!' in feed[0].content
+
+def check_output(feed):
+    output = feed.tostring()
+    assert '!TITLE!' in output
+    assert '!DESC!' in output
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
+
+def check_change(feed):
+    feed.title = '!TITLE2!'
+    feed.desc = '!DESC2!'
+    feed[0].title = '!ITEM_TITLE2!'
+    feed[0].link = '!ITEM_LINK2!'
+    feed[0].desc = '!ITEM_DESC2!'
+    feed[0].content = '!ITEM_CONTENT2!'
+
+    assert feed.title == '!TITLE2!'
+    assert feed.desc == '!DESC2!'
+    assert feed[0].title == '!ITEM_TITLE2!'
+    assert feed[0].link == '!ITEM_LINK2!'
+    assert '!ITEM_DESC2!' in feed[0].desc
+    assert '!ITEM_CONTENT2!' in feed[0].content
+
+def check_add(feed):
+    feed.append({
+        'title': '!ITEM_TITLE3!',
+        'link': '!ITEM_LINK3!',
+        'desc': '!ITEM_DESC3!',
+        'content': '!ITEM_CONTENT3!',
+        })
+
+    assert feed[1].title == '!ITEM_TITLE3!'
+    assert feed[1].link == '!ITEM_LINK3!'
+    assert '!ITEM_DESC3!' in feed[1].desc
+    assert '!ITEM_CONTENT3!' in feed[1].content
+
+each_format = pytest.mark.parametrize('url', [
+    'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
+    'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
+    ])
+
+each_check = pytest.mark.parametrize('check', [
+    check_feed, check_output, check_change, check_add,
+    ])
+
+@each_format
+@each_check
+def test_parse(replay_server, url, check):
+    feed = get_feed(url)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_rss(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedXML)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_json(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedJSON)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_html(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedHTML)
+    if len(feed) > 1:
+        # remove the 'blank' default html item
+        del feed[0]
+    check(feed)
+
+@each_format
+def test_convert_csv(replay_server, url):
+    # only csv output, not csv feed, check therefore different
+    feed = get_feed(url)
+    output = feed.tocsv()
+
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
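Note: similarly, the checks above outline the morss.feeds API. A minimal sketch, assuming only what the tests show (parse() builds a feed from bytes, feeds expose title/desc and indexable items, convert() re-types a feed, tostring() serialises it); the sample URL is the replay-server file added earlier in this diff.

    # Sketch based solely on tests/test_feeds.py above.
    from morss.crawler import adv_get
    from morss.feeds import parse, FeedJSON

    url = 'http://localhost:8888/feed-rss-channel-utf-8.txt'
    out = adv_get(url)
    feed = parse(out['data'], url=url, encoding=out['encoding'])

    print(feed.title, feed[0].link)           # fields asserted in check_feed()
    print(feed.convert(FeedJSON).tostring())  # re-serialise the same feed as JSON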
@@ -1,15 +0,0 @@
|
||||
Options -Indexes
|
||||
|
||||
ErrorDocument 403 "Access forbidden"
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
ErrorDocument 500 "A very nasty bug found his way onto this very server"
|
||||
|
||||
# Uncomment below line to turn debug on for all requests
|
||||
#SetEnv DEBUG 1
|
||||
|
||||
# Uncomment below line to turn debug on for requests with :debug in the url
|
||||
#SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
@@ -1,9 +0,0 @@
|
||||
order allow,deny
|
||||
|
||||
deny from all
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
@@ -16,6 +16,7 @@
 <title>RSS feed by morss</title>
 <meta name="viewport" content="width=device-width; initial-scale=1.0;" />
 <meta name="robots" content="noindex" />
+<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
 
 <style type="text/css">
 body * {
@@ -191,9 +192,9 @@
 feed as
 <select>
 <option value="">RSS</option>
-<option value=":json:cors">JSON</option>
-<option value=":html">HTML</option>
-<option value=":csv">CSV</option>
+<option value=":format=json:cors">JSON</option>
+<option value=":format=html">HTML</option>
+<option value=":format=csv">CSV</option>
 </select>
 using the
 <select>
@@ -203,7 +204,9 @@
 link of the
 <select>
 <option value="">first</option>
-<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+<option value=":order=last">last</option>
+<option value=":order=oldest">oldest</option>
 </select>
 items and
 <select>