Compare commits: ed06ae6398...master (100 commits)

Commits included:

c5b2df754e, 6529fdbdd8, f4da40fffb, d27fc93f75, dfb2b83c06, 4340b678d0,
ff9503b0d0, 8bdcd8f386, ea2ebedfcb, 438c32a312, 8b26797e93, e1ed33f320,
b65272daab, 4d64afe9cb, d3b623482d, 32645548c2, d6b90448f3, da81edc651,
4f2895f931, b2b04691d6, bfaf7b0fac, 32d9bc9d9d, b138f11771, a01258700d,
4d6d3c9239, e81f6b173f, fe5dbf1ce0, fdf9acd32b, d05706e056, e88a823ada,
750850c162, c8669002e4, c524e54d2d, ef14567d87, fb643f5ef1, dbdca910d8,
9eb19fac04, d424e394d1, 3f92787b38, afc31eb6e9, 87d2fe772d, 917aa0fbc5,
3e2b81286f, 15430a2b83, ecdb74812d, 2c7844942c, e12cb4567a, b74365b121,
2020543469, 676be4a4fe, 8870400a6e, 8e9cc541b0, 2a7a1b83ec, 106f59afa1,
ee514e2da3, e7578e859a, 3bcb8db974, 7751792942, 6e2e5ffa00, f6da0e1e9b,
2247ba13c5, d17b9a2f27, 5ab45e60af, 368e4683d6, 9fd8c7d6af, 89f5d07408,
495bd44893, ff12dbea39, 7885ab48df, 7cdcbd23e1, 25f283da1f, 727d14e539,
3392ae3973, 0111ea1749, def397de5e, d07aa566ed, 0ee16d4a7d, ac9859d955,
580565da77, b2600152ea, 27d8f06308, 79c4477cfc, c09aa8400a, 861c275f5b,
99a855c8fc, bef7899cdd, 7513a3e74d, 5bf93b83df, e7ecc018c5, 34b7468ba5,
5336d26204, c7082dcf6c, c785adb4c3, 73798d2fc1, 18daf378e8, aa2b747c5e,
d390ed9715, 0a5a8ceb7f, d2d9d7f22e, 29ae99c24d
.drone.yml (24 changed lines; file deleted)

@@ -1,24 +0,0 @@
-kind: pipeline
-name: default
-
-steps:
-- name: lint
-  image: alpine
-  commands:
-  - apk add --no-cache python3 py3-lxml py3-pip py3-wheel py3-enchant hunspell-en
-  - pip3 install --no-cache-dir .[full] .[dev]
-  - isort --check-only --diff .
-  - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
-
-- name: pypi
-  image: plugins/pypi
-  settings:
-    username:
-      from_secret: pypi_user
-    password:
-      from_secret: pypi_pwd
-  when:
-    branch:
-    - master
-    event:
-    - push
.github/workflows/default.yml (78 changed lines; new file, vendored)

@@ -0,0 +1,78 @@
+name: default
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  test-lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Prepare image
+        run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
+
+      - name: Install dependencies
+        run: pip3 install .[full] .[dev]
+      - run: isort --check-only --diff .
+      - run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
+      - run: pytest --cov=morss tests
+
+  python-publish:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Prepare image
+        run: apt-get -y update && apt-get -y install python3-pip python3-build
+
+      - name: Build package
+        run: python3 -m build
+
+      - name: Publish package
+        uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.pypi_api_token }}
+
+  docker-publish-deploy:
+    runs-on: ubuntu-latest
+    container:
+      image: catthehacker/ubuntu:act-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: https://github.com/docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: https://github.com/docker/setup-buildx-action@v2
+
+      - name: Login to Docker Hub
+        uses: https://github.com/docker/login-action@v2
+        with:
+          username: ${{ secrets.docker_user }}
+          password: ${{ secrets.docker_pwd }}
+
+      - name: Build and push
+        uses: https://github.com/docker/build-push-action@v4
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64,linux/arm/v7
+          push: true
+          tags: ${{ secrets.docker_repo }}
+
+      - name: Deploy on server
+        uses: https://github.com/appleboy/ssh-action@v0.1.10
+        with:
+          host: ${{ secrets.ssh_host }}
+          username: ${{ secrets.ssh_user }}
+          key: ${{ secrets.ssh_key }}
+          script: morss-update
Dockerfile (15 changed lines)

@@ -1,11 +1,16 @@
-FROM alpine:latest
+FROM alpine:edge
 
-RUN apk add --no-cache python3 py3-pip py3-wheel git py3-lxml
-
 ADD . /app
-RUN pip3 install --no-cache-dir /app[full]
+
+RUN set -ex; \
+    apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
+    apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
+    pip3 install --no-cache-dir /app[full]; \
+    apk del .build-deps
 
 USER 1000:1000
 
-ENTRYPOINT ["/bin/sh", "/app/docker-entry.sh"]
+ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
 CMD ["run"]
+
+HEALTHCHECK CMD /bin/sh /app/morss-helper check
README.md (158 changed lines)

@@ -1,13 +1,14 @@
 # Morss - Get full-text RSS feeds
 
+[Homepage](https://morss.it/) •
+[Upstream source code](https://git.pictuga.com/pictuga/morss) •
+[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
+
 [](https://ci.pictuga.com/pictuga/morss)
+[](https://github.com/pictuga/morss/stargazers)
+[](https://github.com/pictuga/morss/network/members)
+[](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
+[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
-_GNU AGPLv3 code_
-_Provided logo is CC BY-NC-SA 4.0_
-Upstream source code: <https://git.pictuga.com/pictuga/morss>
-Github mirror (for Issues & Pull requests): <https://github.com/pictuga/morss>
-Homepage: <https://morss.it/>
 
 This tool's goal is to get full-text RSS feeds out of striped RSS feeds,
 commonly available on internet. Indeed most newspapers only make a small
@@ -40,7 +41,7 @@ Some features of morss:
 - Follow 301/meta redirects
 - Recover xml feeds with corrupt encoding
 - Supports gzip-compressed http content
-- HTTP caching with different backends (in-memory/sqlite/mysql/redis/diskcache)
+- HTTP caching with different backends (in-memory/redis/diskcache)
 - Works as server/cli tool
 - Deobfuscate various tracking links
 
@@ -48,21 +49,41 @@ Some features of morss:
 
 ### Python package
 
+].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
+[](https://pypi.org/project/morss/)
+[](https://pypistats.org/packages/morss)
+
 Simple install (without optional dependencies)
 
+From pip
+
+```shell
+pip install morss
+```
+
+From git
+
 ```shell
 pip install git+https://git.pictuga.com/pictuga/morss.git
 ```
 
 Full installation (including optional dependencies)
 
+From pip
+
+```shell
+pip install morss[full]
+```
+
+From git
+
 ```shell
 pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
 ```
 
-The full install includes mysql, redis and diskcache (possible cache backends).
-Otherwise, only in-memory and sqlite3 caches are available. The full install
-also includes gunicorn and gevent (for more efficient HTTP handling).
+The full install includes all the cache backends. Otherwise, only in-memory
+cache is available. The full install also includes gunicorn (for more efficient
+HTTP handling).
 
 The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
 C code needs to be compiled). If possible on your distribution, try installing
@@ -70,13 +91,37 @@ it with the system package manager.
 
 ### Docker
 
-Build
+].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
+[](https://hub.docker.com/r/pictuga/morss)
+[](https://hub.docker.com/r/pictuga/morss/tags)
+
+From docker hub
+
+With cli
+
+```shell
+docker pull pictuga/morss
+```
+
+With docker-compose **(recommended)**
+
+```yml
+services:
+  app:
+    image: pictuga/morss
+    ports:
+      - '8000:8000'
+```
+
+Build from source
+
+With cli
+
 ```shell
 docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
 ```
 
-With docker-compose:
+With docker-compose
 
 ```yml
 services:
@@ -97,8 +142,8 @@ docker-compose build --no-cache --pull
 
 One-click deployment:
 
-* Heroku: <https://heroku.com/deploy?template=https://github.com/pictuga/morss>
-* Google Cloud: <https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git>
+[](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
+[](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
 
 Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
 
@@ -109,26 +154,28 @@ packages:
 - python3-pip
 - python3-wheel
 - python3-lxml
-- git
+- python3-setproctitle
 - ca-certificates
 
 write_files:
 - path: /etc/environment
+  append: true
   content: |
     DEBUG=1
     CACHE=diskcache
-    CACHE_SIZE=1073741824
+    CACHE_SIZE=1073741824 # 1GiB
 - path: /var/lib/cloud/scripts/per-boot/morss.sh
   permissions: 744
   content: |
     #!/bin/sh
-    gunicorn --bind 0.0.0.0:${PORT:-8000} ${GUNICORN} --preload --access-logfile - --daemon morss
+    /usr/local/bin/morss-helper daemon
 
 runcmd:
+- source /etc/environment
 - update-ca-certificates
-- iptables -I INPUT 6 -m state --state NEW -p tcp --dport {PORT:-8000} -j ACCEPT
+- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
 - netfilter-persistent save
-- pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
+- pip install morss[full]
 ```
 
 ## Run
@@ -156,13 +203,19 @@ other clients.
 
 #### Using Docker
 
-Run
+From docker hub
+
+```shell
+docker run -p 8000:8000 pictuga/morss
+```
+
+From source
 
 ```shell
 docker run -p 8000:8000 morss
 ```
 
-With docker-compose:
+With docker-compose **(recommended)**
 
 ```shell
 docker-compose up
@@ -223,8 +276,30 @@ For this, you need to make sure your host allows python script execution. This
 method uses HTTP calls to fetch the RSS feeds, which will be handled through
 `mod_cgi` for example on Apache severs.
 
-Please pay attention to `main.py` permissions for it to be executable. Also
-ensure that the provided `/www/.htaccess` works well with your server.
+Please pay attention to `main.py` permissions for it to be executable. See below
+some tips for the `.htaccess` file.
+
+```htaccess
+Options -Indexes
+
+ErrorDocument 404 /cgi/main.py
+
+# Turn debug on for all requests
+SetEnv DEBUG 1
+
+# Turn debug on for requests with :debug in the url
+SetEnvIf Request_URI :debug DEBUG=1
+
+<Files ~ "\.(py|pyc|db|log)$">
+	deny from all
+</Files>
+
+<Files main.py>
+	allow from all
+	AddHandler cgi-script .py
+	Options +ExecCGI
+</Files>
+```
 
 ### As a CLI application
 
@@ -278,7 +353,7 @@ Using cache and passing arguments:
 ```python
 >>> import morss
 >>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
->>> cache = '/tmp/morss-cache.db' # sqlite cache location
+>>> cache = '/tmp/morss-cache' # diskcache cache location
 >>> options = {'csv':True}
 >>> xml_string = morss.process(url, cache, options)
 >>> xml_string[:50]
@@ -292,11 +367,10 @@ under the hood.
 Doing it step-by-step:
 
 ```python
-import morss, morss.crawler
+import morss
 
 url = 'http://newspaper.example/feed.xml'
 options = morss.Options(csv=True) # arguments
-morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
 
 url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
 rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
@@ -317,10 +391,11 @@ The list of arguments can be obtained by running `morss --help`
 ```
 usage: morss [-h] [--post STRING] [--xpath XPATH]
              [--format {rss,json,html,csv}] [--search STRING] [--clip]
-             [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
-             [--resolve] [--items XPATH] [--item_link XPATH]
-             [--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
-             [--nolink] [--noref] [--silent]
+             [--indent] [--cache] [--force] [--proxy]
+             [--order {first,last,newest,oldest}] [--firstlink] [--resolve]
+             [--items XPATH] [--item_link XPATH] [--item_title XPATH]
+             [--item_content XPATH] [--item_time XPATH]
+             [--mode {xml,html,json}] [--nolink] [--noref] [--silent]
              url
 
 Get full-text RSS feeds
@@ -328,7 +403,7 @@ Get full-text RSS feeds
 positional arguments:
   url                   feed url
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   --post STRING         POST request
   --xpath XPATH         xpath rule to manually detect the article
@@ -347,8 +422,9 @@ action:
                         articles' content), so as to save time
   --force               force refetch the rss feed and articles
   --proxy               doesn't fill the articles
-  --newest              return the feed items in chronological order (morss
-                        ohterwise shows the items by appearing order)
+  --order {first,last,newest,oldest}
+                        order in which to process items (which are however NOT
+                        sorted in the output)
   --firstlink           pull the first article mentioned in the description
                         instead of the default link
   --resolve             replace tracking links with direct links to articles
@@ -363,6 +439,8 @@ custom feeds:
   --item_content XPATH  entry's content
   --item_time XPATH     entry's date & time (accepts a wide range of time
                         formats)
+  --mode {xml,html,json}
+                        parser to use for the custom feeds
 
 misc:
   --nolink              drop links, but keeps links' inner text
@@ -388,6 +466,7 @@ To pass environment variables:
 - docker-compose: add an `environment:` section in the .yml file
 - Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
 - Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
+- cloud-init: in the `/etc/environment` file
 
 Generic:
 
@@ -396,6 +475,7 @@ debugging.
 - `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
 - `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
 - `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
+- `DATA_PATH`: to set custom file location for the `www` folder
 
 When parsing long feeds, with a lot of items (100+), morss might take a lot of
 time to parse it, or might even run into a memory overflow on some shared
@@ -422,15 +502,10 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
 morss uses caching to make loading faster. There are 3 possible cache backends:
 
 - `(nothing/default)`: a simple python in-memory dict-like object.
-- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
-  will be cleared every time the program is run). Path can be defined with
-  `SQLITE_PATH`.
-- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
-  environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
 - `CACHE=redis`: Redis cache. Connection can be defined with the following
   environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
 - `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
-  `DISKCAHE_DIR`.
+  `DISKCACHE_DIR`.
 
 To limit the size of the cache:
 
@@ -440,6 +515,9 @@ entries. NB. When using `diskcache`, this is the cache max size in Bytes.
 - `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
   down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
 
+Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
+environment variable.
+
 ### Content matching
 
 The content of articles is grabbed with our own readability fork. This means
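The README hunks above document the new `--order` option and the switch of the `cache` argument of `morss.process()` from sqlite to diskcache. A minimal sketch combining the two, assuming the `[full]` install (so the `diskcache` package is present) and a reachable feed; the URL, directory and option values are illustrative only:

```python
import morss

# the cache directory is handed to caching.DiskCacheHandler(), per the change in process()
xml_string = morss.process(
    'http://feeds.bbci.co.uk/news/rss.xml',    # example feed, as in the README
    cache='/tmp/morss-cache',                  # diskcache directory (assumed writable)
    options={'order': 'newest', 'csv': True},  # 'order' mirrors the new --order CLI flag
)

print(xml_string[:50])
```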
app.json (2 changed lines)

@@ -5,7 +5,7 @@
     "value": 1,
     "required": false
   },
-  "GUNICORN": {
+  "GUNICORN_CMD_ARGS": {
     "value": "",
     "required": false
   },
(file header not captured; presumably docker-entry.sh, 12 changed lines; file deleted)

@@ -1,12 +0,0 @@
-#! /bin/sh
-
-if [ "$1" = "sh" ] || [ "$1" = "bash" ]; then
-    exec $@
-
-elif [ -z "$1" ] || [ "$@" = "run" ]; then
-    gunicorn --bind 0.0.0.0:${PORT:-8000} ${GUNICORN} --preload --access-logfile - morss
-
-else
-    morss $@
-
-fi
morss-helper (47 changed lines; new executable file)

@@ -0,0 +1,47 @@
+#! /bin/sh
+set -ex
+
+if ! command -v python && command -v python3 ; then
+    alias python='python3'
+fi
+
+run() {
+    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
+}
+
+daemon() {
+    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
+}
+
+reload() {
+    pid=$(pidof 'gunicorn: master [morss]' || true)
+    # NB. requires python-setproctitle
+    # `|| true` due to `set -e`
+
+    if [ -z "$pid" ]; then
+        # if gunicorn is not currently running
+        daemon
+
+    else
+        kill -s USR2 $pid
+        kill -s WINCH $pid
+        sleep 1 # give gunicorn some time to reload
+        kill -s TERM $pid
+
+    fi
+}
+
+check() {
+    python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
+}
+
+if [ -z "$1" ]; then
+    run
+
+elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
+    $@
+
+else
+    python -m morss $@
+
+fi
morss.service (13 changed lines; new file)

@@ -0,0 +1,13 @@
+[Unit]
+Description=morss server (gunicorn)
+After=network.target
+
+[Service]
+ExecStart=/usr/local/bin/morss-helper run
+ExecReload=/usr/local/bin/morss-helper reload
+KillMode=process
+Restart=always
+User=http
+
+[Install]
+WantedBy=multi-user.target
(file header not captured; presumably morss/__init__.py)

@@ -19,5 +19,7 @@
 
 # pylint: disable=unused-import,unused-variable
 
+__version__ = ""
+
 from .morss import *
 from .wsgi import application
(file header not captured; presumably morss/caching.py)

@@ -16,7 +16,6 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import os
-import pickle
 import threading
 import time
 from collections import OrderedDict
@@ -51,83 +50,6 @@ class BaseCache:
         return True
 
 
-try:
-    import sqlite3 # isort:skip
-except ImportError:
-    pass
-
-
-class SQLiteCache(BaseCache):
-    def __init__(self, path=':memory:'):
-        self.con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
-
-        with self.con:
-            self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
-            self.con.execute('pragma journal_mode=WAL')
-
-        self.trim()
-
-    def __del__(self):
-        self.con.close()
-
-    def trim(self):
-        with self.con:
-            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, key):
-        row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1]
-
-    def __setitem__(self, key, data):
-        with self.con:
-            self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))
-
-
-try:
-    import pymysql.cursors # isort:skip
-except ImportError:
-    pass
-
-
-class MySQLCacheHandler(BaseCache):
-    def __init__(self, user, password, database, host='localhost'):
-        self.user = user
-        self.password = password
-        self.database = database
-        self.host = host
-
-        with self.cursor() as cursor:
-            cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')
-
-        self.trim()
-
-    def cursor(self):
-        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
-
-    def trim(self):
-        with self.cursor() as cursor:
-            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, key):
-        cursor = self.cursor()
-        cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
-        row = cursor.fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1]
-
-    def __setitem__(self, key, data):
-        with self.cursor() as cursor:
-            cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
-                           (key, data, time.time(), data, time.time()))
-
-
 class CappedDict(OrderedDict, BaseCache):
     def trim(self):
         if CACHE_SIZE >= 0:
@@ -182,20 +104,7 @@ class DiskCacheHandler(BaseCache):
 
 
 if 'CACHE' in os.environ:
-    if os.environ['CACHE'] == 'mysql':
-        default_cache = MySQLCacheHandler(
-            user = os.getenv('MYSQL_USER'),
-            password = os.getenv('MYSQL_PWD'),
-            database = os.getenv('MYSQL_DB'),
-            host = os.getenv('MYSQL_HOST', 'localhost')
-        )
-
-    elif os.environ['CACHE'] == 'sqlite':
-        default_cache = SQLiteCache(
-            os.getenv('SQLITE_PATH', ':memory:')
-        )
-
-    elif os.environ['CACHE'] == 'redis':
+    if os.environ['CACHE'] == 'redis':
         default_cache = RedisCacheHandler(
             host = os.getenv('REDIS_HOST', 'localhost'),
             port = int(os.getenv('REDIS_PORT', 6379)),
@@ -205,7 +114,7 @@ if 'CACHE' in os.environ:
 
     elif os.environ['CACHE'] == 'diskcache':
         default_cache = DiskCacheHandler(
-            directory = os.getenv('DISKCAHE_DIR', '/tmp/morss-diskcache'),
+            directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
             size_limit = CACHE_SIZE # in Bytes
         )
 
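With the SQLite and MySQL backends removed above, the selectable caches are in-memory, Redis and diskcache, picked through environment variables when the module is first imported. A minimal sketch of selecting the diskcache backend, assuming the `[full]` install; the variable names come from the diff and the README, the values are examples:

```python
import os

os.environ['CACHE'] = 'diskcache'
os.environ['DISKCACHE_DIR'] = '/tmp/morss-diskcache'
os.environ['CACHE_SIZE'] = str(1024 * 1024 * 1024)  # max cache size in bytes for diskcache

import morss  # the caching module evaluates CACHE when it is first imported

print(morss.process('http://feeds.bbci.co.uk/news/rss.xml')[:50])
```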
(file header not captured; presumably morss/cli.py)

@@ -44,7 +44,7 @@ def cli_app():
     group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
     group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
     group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
-    group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss ohterwise shows the items by appearing order)')
+    group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
     group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
     group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
 
@@ -54,6 +54,7 @@ def cli_app():
     group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
     group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
     group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
+    group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
 
     group = parser.add_argument_group('misc')
     group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
(file header not captured; presumably morss/crawler.py)

@@ -38,12 +38,12 @@ try:
     from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                          Request, addinfourl, build_opener, parse_http_list,
                          parse_keqv_list)
-    from urlparse import urlparse, urlunparse
+    from urlparse import urlsplit
 except ImportError:
     # python 3
     from email import message_from_string
     from http.client import HTTPMessage
-    from urllib.parse import quote, urlparse, urlunparse
+    from urllib.parse import quote, urlsplit
     from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                 HTTPRedirectHandler, Request, addinfourl,
                                 build_opener, parse_http_list, parse_keqv_list)
@@ -59,7 +59,9 @@ except NameError:
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
     'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
+    'json': ['application/json'],
+    }
 
 
 DEFAULT_UAS = [
@@ -111,8 +113,6 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
 
 
 def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
-    handlers = []
-
     # as per urllib2 source code, these Handelers are added first
     # *unless* one of the custom handlers inherits from one of them
     #
@@ -130,16 +130,18 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     # http_error_* are run until sth is returned (other than None). If they all
     # return nothing, a python error is raised
 
-    #handlers.append(DebugHandler())
-    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
-    handlers.append(HTTPCookieProcessor())
-    handlers.append(GZIPHandler())
-    handlers.append(HTTPAllRedirectHandler())
-    handlers.append(HTTPEquivHandler())
-    handlers.append(HTTPRefreshHandler())
-    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
-    handlers.append(BrowserlyHeaderHandler())
-    handlers.append(EncodingFixHandler())
+    handlers = [
+        #DebugHandler(),
+        SizeLimitHandler(500*1024), # 500KiB
+        HTTPCookieProcessor(),
+        GZIPHandler(),
+        HTTPAllRedirectHandler(),
+        HTTPEquivHandler(),
+        HTTPRefreshHandler(),
+        UAHandler(random.choice(DEFAULT_UAS)),
+        BrowserlyHeaderHandler(),
+        EncodingFixHandler(),
+    ]
 
     if follow:
         handlers.append(AlternateHandler(MIMETYPE[follow]))
@@ -161,10 +163,20 @@ def is_ascii(string):
         return True
 
 
+def soft_quote(string):
+    " url-quote only when not a valid ascii string "
+
+    if is_ascii(string):
+        return string
+
+    else:
+        return quote(string.encode('utf-8'))
+
+
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
-        url = url.decode()
+        url = url.decode('utf-8')
 
     # make sure there's a protocol (http://)
     if url.split(':', 1)[0] not in PROTOCOL:
@@ -177,18 +189,19 @@ def sanitize_url(url):
     url = url.replace(' ', '%20')
 
     # escape non-ascii unicode characters
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
-
-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
-
-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
+    parts = urlsplit(url)
+
+    parts = parts._replace(
+        netloc=parts.netloc.replace(
+            parts.hostname,
+            parts.hostname.encode('idna').decode('ascii')
+        ),
+        path=soft_quote(parts.path),
+        query=soft_quote(parts.query),
+        fragment=soft_quote(parts.fragment),
+    )
+
+    return parts.geturl()
 
 
 class RespDataHandler(BaseHandler):
@@ -355,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
 def iter_html_tag(html_str, tag_name):
     " To avoid parsing whole pages when looking for a simple tag "
 
-    re_tag = r'<%s(\s*[^>])*>' % tag_name
+    re_tag = r'<%s\s+[^>]+>' % tag_name
     re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
 
     for tag_match in re.finditer(re_tag, html_str):
@@ -412,7 +425,7 @@ class HTTPRefreshHandler(BaseHandler):
     def http_response(self, req, resp):
         if 200 <= resp.code < 300:
             if resp.headers.get('refresh'):
-                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
+                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
                 match = re.search(regex, resp.headers.get('refresh'))
 
                 if match:
@@ -500,6 +513,8 @@ class CacheHandler(BaseHandler):
         self.cache[key] = pickle.dumps(data, 0)
 
     def cached_response(self, req, fallback=None):
+        req.from_morss_cache = True
+
         data = self.load(req.get_full_url())
 
         if data is not None:
@@ -512,6 +527,10 @@ class CacheHandler(BaseHandler):
             return fallback
 
     def save_response(self, req, resp):
+        if req.from_morss_cache:
+            # do not re-save (would reset the timing)
+            return resp
+
         data = resp.read()
 
         self.save(req.get_full_url(), {
@@ -530,6 +549,8 @@ class CacheHandler(BaseHandler):
         return resp
 
     def http_request(self, req):
+        req.from_morss_cache = False # to track whether it comes from cache
+
         data = self.load(req.get_full_url())
 
         if data is not None:
@@ -621,8 +642,7 @@ class CacheHandler(BaseHandler):
             return None
 
     def http_response(self, req, resp):
-        # code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
-        # NB. It might re-save requests pulled from cache, which will re-set the time() to the latest, i.e. lenghten its useful life
+        # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
 
         if resp.code == 304 and resp.url in self.cache:
             # we are hopefully the first after the HTTP handler, so no need
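For readers unfamiliar with the `urlsplit`/IDNA approach adopted in `sanitize_url()` above, here is a small self-contained sketch of the same idea outside of morss; the function names and the sample URL are made up for the illustration:

```python
from urllib.parse import quote, urlsplit

def soft_quote(string):
    # percent-encode only when the string is not plain ASCII, like the helper added above
    try:
        string.encode('ascii')
        return string
    except UnicodeEncodeError:
        return quote(string.encode('utf-8'))

def sanitize(url):
    parts = urlsplit(url)
    return parts._replace(
        # the hostname is IDNA-encoded, the other components are percent-encoded when needed
        netloc=parts.netloc.replace(parts.hostname, parts.hostname.encode('idna').decode('ascii')),
        path=soft_quote(parts.path),
        query=soft_quote(parts.query),
        fragment=soft_quote(parts.fragment),
    ).geturl()

print(sanitize('http://www.köln.example/straße?q=weiß'))
```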
(file header not captured; presumably morss/feedify.ini)

@@ -90,9 +90,6 @@ item_updated = updated
 [html]
 mode = html
 
-path =
-    http://localhost/
-
 title = //div[@id='header']/h1
 desc = //div[@id='header']/p
 items = //div[@id='content']/div
(file header not captured; presumably morss/feeds.py)

@@ -65,7 +65,8 @@ def parse_rules(filename=None):
             # for each rule
 
             if rules[section][arg].startswith('file:'):
-                file_raw = open(data_path(rules[section][arg][5:])).read()
+                path = data_path('www', rules[section][arg][5:])
+                file_raw = open(path).read()
                 file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
                 rules[section][arg] = file_clean
 
@@ -93,7 +94,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
         if 'path' in ruleset:
             for path in ruleset['path']:
                 if fnmatch(url, path):
-                    parser = [x for x in parsers if x.mode == ruleset['mode']][0]
+                    parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
                     return parser(data, ruleset, encoding=encoding)
 
     # 2) Try each and every parser
@@ -113,7 +114,7 @@ def parse(data, url=None, encoding=None, ruleset=None):
         else:
             # parsing worked, now we try the rulesets
 
-            ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
+            ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
             # 'path' as they should have been caught beforehands
             # try anyway if no 'mode' specified
 
@@ -186,11 +187,12 @@ class ParserBase(object):
         return self.convert(FeedHTML).tostring(**k)
 
     def convert(self, TargetParser):
-        if type(self) == TargetParser:
-            return self
-
         target = TargetParser()
 
+        if type(self) == TargetParser and self.rules == target.rules:
+            # check both type *AND* rules (e.g. when going from freeform xml to rss)
+            return self
+
         for attr in target.dic:
             if attr == 'items':
                 for item in self.items:
@@ -359,7 +361,13 @@ class ParserXML(ParserBase):
 
     def rule_search_all(self, rule):
         try:
-            return self.root.xpath(rule, namespaces=self.NSMAP)
+            match = self.root.xpath(rule, namespaces=self.NSMAP)
+
+            if isinstance(match, str):
+                # some xpath rules return a single string instead of an array (e.g. concatenate() )
+                return [match,]
+
+            else:
+                return match
 
         except etree.XPathEvalError:
             return []
@@ -422,7 +430,7 @@ class ParserXML(ParserBase):
 
         match = self.rule_search(rrule)
 
-        html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
+        html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
             and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
 
         if key is not None:
@@ -433,7 +441,7 @@ class ParserXML(ParserBase):
                 self._clean_node(match)
                 match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
 
-                if self.rules['mode'] == 'html':
+                if self.rules.get('mode') == 'html':
                     match.find('div').drop_tag() # not supported by lxml.etree
 
                 else: # i.e. if atom
@@ -482,7 +490,14 @@ class ParserHTML(ParserXML):
             repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
             rule = re.sub(pattern, repl, rule)
 
-            return self.root.xpath(rule)
+            match = self.root.xpath(rule)
+
+            if isinstance(match, str):
+                # for some xpath rules, see XML parser
+                return [match,]
+
+            else:
+                return match
 
         except etree.XPathEvalError:
             return []
@@ -684,7 +699,7 @@ class Feed(object):
             try:
                 setattr(item, attr, new[attr])
 
-            except (IndexError, TypeError):
+            except (KeyError, IndexError, TypeError):
                 pass
 
         return item
(file header not captured; presumably morss/morss.py)

@@ -17,6 +17,7 @@
 
 import os
 import re
+import sys
 import time
 from datetime import datetime
 from fnmatch import fnmatch
@@ -59,7 +60,7 @@ def log(txt):
 
     else:
         # when using internal server or cli
-        print(repr(txt))
+        print(repr(txt), file=sys.stderr)
 
 
 def len_html(txt):
@@ -286,11 +287,14 @@ def FeedFetch(url, options):
 
             ruleset['items'] = options.items
 
+            if options.mode:
+                ruleset['mode'] = options.mode
+
             ruleset['title'] = options.get('title', '//head/title')
             ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
 
             ruleset['item_title'] = options.get('item_title', '.')
-            ruleset['item_link'] = options.get('item_link', './@href|.//a/@href|ancestor::a/@href')
+            ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
 
             if options.item_content:
                 ruleset['item_content'] = options.item_content
@@ -328,14 +332,20 @@ def FeedGather(rss, url, options):
     if options.cache:
         max_time = 0
 
-    if options.newest:
-        # :newest take the newest items (instead of appearing order)
-        now = datetime.now(tz.tzutc())
-        sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
-
-    else:
-        # default behavior, take the first items (in appearing order)
-        sorted_items = list(rss.items)
+    # sort
+    sorted_items = list(rss.items)
+
+    if options.order == 'last':
+        # `first` does nothing from a practical standpoint, so only `last` needs
+        # to be addressed
+        sorted_items = reversed(sorted_items)
+
+    elif options.order in ['newest', 'oldest']:
+        now = datetime.now(tz.tzutc())
+        sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
+
+        if options.order == 'newest':
+            sorted_items = reversed(sorted_items)
 
     for i, item in enumerate(sorted_items):
         # hard cap
@@ -418,7 +428,7 @@ def process(url, cache=None, options=None):
         options = Options(options)
 
     if cache:
-        caching.default_cache = caching.SQLiteCache(cache)
+        caching.default_cache = caching.DiskCacheHandler(cache)
 
     url, rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
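The `FeedGather()` hunk above replaces the old `--newest` switch with four processing orders. A standalone sketch of that selection logic, using plain dicts instead of morss feed items (the item structure here is an assumption made only for the illustration):

```python
from datetime import datetime, timezone

def order_items(items, order='first'):
    # 'first' keeps feed order, 'last' reverses it,
    # 'newest'/'oldest' sort by date; a missing date counts as "now"
    items = list(items)
    now = datetime.now(timezone.utc)

    if order == 'last':
        return list(reversed(items))

    if order in ('newest', 'oldest'):
        items = sorted(items, key=lambda x: x.get('updated') or now)  # oldest to newest
        return list(reversed(items)) if order == 'newest' else items

    return items  # 'first'

feed = [{'title': 'a', 'updated': None},
        {'title': 'b', 'updated': datetime(2021, 1, 1, tzinfo=timezone.utc)}]
print([i['title'] for i in order_items(feed, 'newest')])
```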
(file header not captured; presumably morss/readabilite.py)

@@ -17,21 +17,20 @@
 
 import re
 
+import bs4.builder._lxml
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
+import lxml.html.soupparser
+
+
+class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
+    def default_parser(self, encoding):
+        return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
 
 
 def parse(data, encoding=None):
-    if encoding:
-        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
-
-    else:
-        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
-
-    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
-
-    return lxml.html.fromstring(data, parser=parser)
+    kwargs = {'from_encoding': encoding} if encoding else {}
+    return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs)
 
 
 def count_words(string):
@@ -155,15 +154,20 @@ def score_all(node):
 
     for child in node:
         score = score_node(child)
-        child.attrib['morss_own_score'] = str(float(score))
+        set_score(child, score, 'morss_own_score')
 
         if score > 0 or len(list(child.iterancestors())) <= 2:
            spread_score(child, score)
            score_all(child)
 
 
-def set_score(node, value):
-    node.attrib['morss_score'] = str(float(value))
+def set_score(node, value, label='morss_score'):
+    try:
+        node.attrib[label] = str(float(value))
+
+    except KeyError:
+        # catch issues with e.g. html comments
+        pass
 
 
 def get_score(node):
@@ -203,6 +207,12 @@ def clean_root(root, keep_threshold=None):
 def clean_node(node, keep_threshold=None):
     parent = node.getparent()
 
+    # remove comments
+    if (isinstance(node, lxml.html.HtmlComment)
+            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
+        parent.remove(node)
+        return
+
     if parent is None:
         # this is <html/> (or a removed element waiting for GC)
         return
@@ -234,11 +244,6 @@ def clean_node(node, keep_threshold=None):
         parent.remove(node)
         return
 
-    # remove comments
-    if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
-        parent.remove(node)
-        return
-
     # remove if too many kids & too high link density
     wc = count_words(node.text_content())
    if wc != 0 and len(list(node.iter())) > 3:
morss/util.py

@@ -15,35 +15,42 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.

+import os
 import os.path
 import sys


-def pkg_path(path=''):
-    return os.path.join(os.path.dirname(__file__), path)
+def pkg_path(*path_elements):
+    return os.path.join(os.path.dirname(__file__), *path_elements)


 data_path_base = None


-def data_path(path=''):
+def data_path(*path_elements):
     global data_path_base

+    path = os.path.join(*path_elements)
+
     if data_path_base is not None:
         return os.path.join(data_path_base, path)

     bases = [
-        os.path.join(sys.prefix, 'share/morss/www'),
-        os.path.join(pkg_path(), './../../../../share/morss/www'),
-        os.path.join(pkg_path(), '../www'),
-        os.path.join(pkg_path(), '../..')
+        os.path.join(sys.prefix, 'share/morss'), # when installed as root
+        pkg_path('../../../share/morss'),
+        pkg_path('../../../../share/morss'),
+        pkg_path('../share/morss'), # for `pip install --target=dir morss`
+        pkg_path('..'), # when running from source tree
     ]

+    if 'DATA_PATH' in os.environ:
+        bases.append(os.environ['DATA_PATH'])
+
     for base in bases:
         full_path = os.path.join(base, path)

         if os.path.isfile(full_path):
-            data_path_base = base
+            data_path_base = os.path.abspath(base)
             return data_path(path)

     else:
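A quick usage sketch of the reworked helpers (illustrative only, not part of the diff; it assumes morss is importable and that the shared assets exist under one of the candidate bases, otherwise the lookup falls through to the else branch):

    from morss.util import data_path, pkg_path

    # path elements are now joined internally, e.g. <base>/www/index.html
    print(data_path('www', 'index.html'))

    # pkg_path() resolves relative to the installed morss package itself
    print(pkg_path('feedify.ini'))

    # an extra search base can be supplied via the environment (appended last)
    # export DATA_PATH=/usr/local/share/morss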
morss/wsgi.py

@@ -36,6 +36,7 @@ except ImportError:
 from . import caching, crawler, readabilite
 from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
                     MorssException, Options, log)
+from .util import data_path

 PORT = int(os.getenv('PORT', 8000))

@@ -167,26 +168,21 @@ def cgi_file_handler(environ, start_response, app):
     if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
         # if it is a legitimate url (no funny relative paths)
-        paths = [
-            os.path.join(sys.prefix, 'share/morss/www', url),
-            os.path.join(os.path.dirname(__file__), '../www', url)
-        ]
-
-        for path in paths:
-            try:
-                f = open(path, 'rb')
-
-            except IOError:
-                # problem with file (cannot open or not found)
-                continue
-
-            else:
-                # file successfully open
-                headers = {}
-                headers['status'] = '200 OK'
-                headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
-                start_response(headers['status'], list(headers.items()))
-                return wsgiref.util.FileWrapper(f)
+        try:
+            path = data_path('www', url)
+            f = open(path, 'rb')
+
+        except IOError:
+            # problem with file (cannot open or not found)
+            pass
+
+        else:
+            # file successfully open
+            headers = {}
+            headers['status'] = '200 OK'
+            headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
+            start_response(headers['status'], list(headers.items()))
+            return wsgiref.util.FileWrapper(f)

     # regex didn't validate or no file found
     return app(environ, start_response)

@@ -196,32 +192,36 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)

     # get page
-    req = crawler.adv_get(url=url, timeout=TIMEOUT)
-
-    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        if options['get'] == 'page':
-            html = readabilite.parse(req['data'], encoding=req['encoding'])
-            html.make_links_absolute(req['url'])
-
-            kill_tags = ['script', 'iframe', 'noscript']
-
-            for tag in kill_tags:
-                for elem in html.xpath('//'+tag):
-                    elem.getparent().remove(elem)
-
-            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
-
-        elif options['get'] == 'article':
-            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
-
-        else:
-            raise MorssException('no :get option passed')
-
-    else:
-        output = req['data']
+    if options['get'] in ('page', 'article'):
+        req = crawler.adv_get(url=url, timeout=TIMEOUT)
+
+        if req['contenttype'] in crawler.MIMETYPE['html']:
+            if options['get'] == 'page':
+                html = readabilite.parse(req['data'], encoding=req['encoding'])
+                html.make_links_absolute(req['url'])
+
+                kill_tags = ['script', 'iframe', 'noscript']
+
+                for tag in kill_tags:
+                    for elem in html.xpath('//'+tag):
+                        elem.getparent().remove(elem)
+
+                output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
+
+            else: # i.e. options['get'] == 'article'
+                output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
+
+        elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
+            output = req['data']
+
+        else:
+            raise MorssException('unsupported mimetype')
+
+    else:
+        raise MorssException('no :get option passed')

     # return html page
-    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
+    headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
     start_response(headers['status'], list(headers.items()))
     return [output]

@@ -281,7 +281,7 @@ def cgi_handle_request():

 class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
     def get_environ(self):
-        env = super().get_environ()
+        env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
         env['REQUEST_URI'] = self.path
         return env
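As an illustration of the new branching in cgi_get() (assuming a local instance on the default PORT above and morss's usual `:option/<url>` calling convention, which is not shown in this diff):

    curl 'http://localhost:8000/:get=page/https://example.com/article'     # full page, script/iframe/noscript removed
    curl 'http://localhost:8000/:get=article/https://example.com/article'  # extracted article only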
43  setup.py

@@ -3,29 +3,58 @@ from glob import glob

 from setuptools import setup


+def get_version():
+    with open('morss/__init__.py', 'r+') as file:
+        lines = file.readlines()
+
+        # look for hard coded version number
+        for i in range(len(lines)):
+            if lines[i].startswith('__version__'):
+                version = lines[i].split('"')[1]
+                break
+
+        # create (& save) one if none found
+        if version == '':
+            version = datetime.now().strftime('%Y%m%d.%H%M')
+            lines[i] = '__version__ = "' + version + '"\n'
+
+            file.seek(0)
+            file.writelines(lines)
+
+        # return version number
+        return version
+
 package_name = 'morss'

 setup(
     name = package_name,
-    version = datetime.now().strftime('%Y%m%d.%H%M'),
+    version = get_version(),
     description = 'Get full-text RSS feeds',
+    long_description = open('README.md').read(),
+    long_description_content_type = 'text/markdown',
     author = 'pictuga',
     author_email = 'contact@pictuga.com',
     url = 'http://morss.it/',
-    download_url = 'https://git.pictuga.com/pictuga/morss',
+    project_urls = {
+        'Source': 'https://git.pictuga.com/pictuga/morss',
+        'Bug Tracker': 'https://github.com/pictuga/morss/issues',
+    },
     license = 'AGPL v3',
     packages = [package_name],
     install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
     extras_require = {
-        'full': ['pymysql', 'redis', 'diskcache', 'gunicorn'],
-        'dev': ['pylint']
+        'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
+        'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
     },
+    python_requires = '>=2.7',
     package_data = {package_name: ['feedify.ini']},
     data_files = [
         ('share/' + package_name, ['README.md', 'LICENSE']),
         ('share/' + package_name + '/www', glob('www/*.*')),
-        ('share/' + package_name + '/www/cgi', [])
     ],
     entry_points = {
-        'console_scripts': [package_name + '=' + package_name + '.__main__:main']
-    })
+        'console_scripts': [package_name + '=' + package_name + '.__main__:main'],
+    },
+    scripts = ['morss-helper'],
+)
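get_version() simply reads back (or writes) a hard-coded assignment in morss/__init__.py of the form (illustrative value only):

    __version__ = "20220101.0000"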
60  tests/conftest.py  (new file)

@@ -0,0 +1,60 @@
import os
import os.path
import threading

import pytest

try:
    # python2
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    from SimpleHTTPServer import SimpleHTTPRequestHandler

except:
    # python3
    from http.server import (BaseHTTPRequestHandler, HTTPServer,
                             SimpleHTTPRequestHandler)


class HTTPReplayHandler(SimpleHTTPRequestHandler):
    " Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "

    directory = os.path.join(os.path.dirname(__file__), './samples/')

    __init__ = BaseHTTPRequestHandler.__init__

    def do_GET(self):
        path = self.translate_path(self.path)

        if os.path.isdir(path):
            f = self.list_directory(path)

        else:
            f = open(path, 'rb')

        try:
            self.copyfile(f, self.wfile)

        finally:
            f.close()


class MuteHTTPServer(HTTPServer):
    def handle_error(self, request, client_address):
        # mute errors
        pass


def make_server(port=8888):
    print('Serving http://localhost:%s/' % port)
    return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)


@pytest.fixture
def replay_server():
    httpd = make_server()
    thread = threading.Thread(target=httpd.serve_forever)
    thread.start()

    yield

    httpd.shutdown()
    thread.join()


if __name__ == '__main__':
    httpd = make_server()
    httpd.serve_forever()
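Each file under tests/samples/ is a raw HTTP exchange (status line, headers, blank line, body). New samples can be recorded the way the handler's docstring suggests, and the replay server can also be started on its own:

    # record a sample, headers included (per the docstring above)
    curl --http1.1 -is http://example.com/ > tests/samples/example.txt

    # serve the samples manually on http://localhost:8888/
    python tests/conftest.py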
4  tests/samples/200-ok.txt  (new file)

HTTP/1.1 200 OK
content-type: text/plain

success

3  tests/samples/301-redirect-abs.txt  (new file)

HTTP/1.1 301 Moved Permanently
location: /200-ok.txt

3  tests/samples/301-redirect-rel.txt  (new file)

HTTP/1.1 301 Moved Permanently
location: ./200-ok.txt

3  tests/samples/301-redirect-url.txt  (new file)

HTTP/1.1 301 Moved Permanently
location: http://localhost:8888/200-ok.txt

4  tests/samples/308-redirect.txt  (new file)

HTTP/1.1 308 Permanent Redirect
location: /200-ok.txt

/200-ok.txt

8  tests/samples/alternate-abs.txt  (new file)

HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html>
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

4  tests/samples/enc-gb2312-header.txt  (new file)

HTTP/1.1 200 OK
content-type: text/plain; charset=gb2312

<EFBFBD>ɹ<EFBFBD>

10  tests/samples/enc-gb2312-meta.txt  (new file)

HTTP/1.1 200 OK
content-type: text/html


<!DOCTYPE html>
<html>
<head><meta charset="gb2312"/></head>
<body>
<EFBFBD>ɹ<EFBFBD>
</body></html>

4  tests/samples/enc-iso-8859-1-header.txt  (new file)

HTTP/1.1 200 OK
content-type: text/plain; charset=iso-8859-1

succ<EFBFBD>s

4  tests/samples/enc-iso-8859-1-missing.txt  (new file)

HTTP/1.1 200 OK
content-type: text/plain

succ<EFBFBD>s

4  tests/samples/enc-utf-8-header.txt  (new file)

HTTP/1.1 200 OK
content-type: text/plain; charset=UTF-8

succès

16  tests/samples/feed-atom-utf-8.txt  (new file)

HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8

<?xml version='1.0' encoding='utf-8'?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>!TITLE!</title>
    <subtitle>!DESC!</subtitle>
    <entry>
        <title>!ITEM_TITLE!</title>
        <summary>!ITEM_DESC!</summary>
        <content type="html">!ITEM_CONTENT!</content>
        <link href="!ITEM_LINK!"/>
        <updated>2022-01-01T00:00:01+01:00</updated>
        <published>2022-01-01T00:00:02+01:00</published>
    </entry>
</feed>

15  tests/samples/feed-atom03-utf-8.txt  (new file)

HTTP/1.1 200 OK
content-type: application/xml

<?xml version='1.0' encoding='utf-8' ?>
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
    <title>!TITLE!</title>
    <subtitle>!DESC!</subtitle>
    <entry>
        <title>!ITEM_TITLE!</title>
        <link rel='alternate' type='text/html' href='!ITEM_LINK!' />
        <summary>!ITEM_DESC!</summary>
        <content>!ITEM_CONTENT!</content>
        <issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
    </entry>
</feed>

22  tests/samples/feed-html-utf-8.txt  (new file)

HTTP/1.1 200 OK
Content-Type: text/html; charset=utf-8

<html>
<head></head>

<body>
    <div id="header">
        <h1>!TITLE!</h1>
        <p>!DESC!</p>
    </div>

    <div id="content">
        <div class="item">
            <a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
            <div class="desc">!ITEM_DESC!</div>
            <div class="content">!ITEM_CONTENT!</div>
        </div>
    </div>

</body>
</html>

16  tests/samples/feed-json-utf-8.txt  (new file)

HTTP/1.1 200 OK
Content-Type: application/json; charset=utf-8

{
    "title": "!TITLE!",
    "desc": "!DESC!",
    "items": [
        {
            "title": "!ITEM_TITLE!",
            "time": "2022-01-01T00:00:01+0100",
            "url": "!ITEM_LINK!",
            "desc": "!ITEM_DESC!",
            "content": "!ITEM_CONTENT!"
        }
    ]
}

17  tests/samples/feed-rss-channel-utf-8.txt  (new file)

HTTP/1.1 200 OK
Content-Type: text/xml; charset=utf-8

<?xml version='1.0' encoding='utf-8'?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
    <channel>
        <title>!TITLE!</title>
        <description>!DESC!</description>
        <item>
            <title>!ITEM_TITLE!</title>
            <pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
            <link>!ITEM_LINK!</link>
            <description>!ITEM_DESC!</description>
            <content:encoded>!ITEM_CONTENT!</content:encoded>
        </item>
    </channel>
</rss>

BIN  tests/samples/gzip.txt  (new file)

Binary file not shown.

3  tests/samples/header-refresh.txt  (new file)

HTTP/1.1 200 OK
refresh: 0;url=/200-ok.txt

8  tests/samples/meta-redirect-abs.txt  (new file)

HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
<body>meta redirect</body>
</html>

8  tests/samples/meta-redirect-rel.txt  (new file)

HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
<body>meta redirect</body>
</html>

8  tests/samples/meta-redirect-url.txt  (new file)

HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

9220  tests/samples/size-1MiB.txt  (new file)

File diff suppressed because it is too large
62  tests/test_crawler.py  (new file)

@@ -0,0 +1,62 @@
import pytest

from morss.crawler import *


def test_get(replay_server):
    assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'


def test_adv_get(replay_server):
    assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'


@pytest.mark.parametrize('before,after', [
    (b'http://localhost:8888/', 'http://localhost:8888/'),
    ('localhost:8888/', 'http://localhost:8888/'),
    ('http:/localhost:8888/', 'http://localhost:8888/'),
    ('http://localhost:8888/&/', 'http://localhost:8888/&/'),
    ('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
    ('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
    ('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
    assert sanitize_url(before) == after


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
def test_size_limit_handler(replay_server, opener):
    assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
def test_gzip_handler(replay_server, opener):
    assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
@pytest.mark.parametrize('url', [
    'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
    'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
    'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
    out = adv_get('http://localhost:8888/%s' % url)
    out = out['data'].decode(out['encoding'])
    assert 'succes' in out or 'succès' in out or '成功' in out


@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
def test_alternate_handler(replay_server, opener):
    assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
def test_http_equiv_handler(replay_server, opener):
    assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
    assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
    assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
def test_http_all_redirect_handler(replay_server, opener):
    assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
    assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
    assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
    assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'


@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
def test_http_refresh_handler(replay_server, opener):
    assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
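With the dev extras from setup.py above installed (pytest, pytest-cov), the replay-based suite runs as an ordinary pytest session, for example:

    pip install .[dev]
    pytest tests/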
108  tests/test_feeds.py  (new file)

@@ -0,0 +1,108 @@
import pytest

from morss.crawler import adv_get
from morss.feeds import *


def get_feed(url):
    url = 'http://localhost:8888/%s' % url
    out = adv_get(url)
    feed = parse(out['data'], url=url, encoding=out['encoding'])
    return feed


def check_feed(feed):
    # NB. time and updated not covered
    assert feed.title == '!TITLE!'
    assert feed.desc == '!DESC!'
    assert feed[0] == feed.items[0]
    assert feed[0].title == '!ITEM_TITLE!'
    assert feed[0].link == '!ITEM_LINK!'
    assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
    assert '!ITEM_CONTENT!' in feed[0].content


def check_output(feed):
    output = feed.tostring()
    assert '!TITLE!' in output
    assert '!DESC!' in output
    assert '!ITEM_TITLE!' in output
    assert '!ITEM_LINK!' in output
    assert '!ITEM_DESC!' in output
    assert '!ITEM_CONTENT!' in output


def check_change(feed):
    feed.title = '!TITLE2!'
    feed.desc = '!DESC2!'
    feed[0].title = '!ITEM_TITLE2!'
    feed[0].link = '!ITEM_LINK2!'
    feed[0].desc = '!ITEM_DESC2!'
    feed[0].content = '!ITEM_CONTENT2!'

    assert feed.title == '!TITLE2!'
    assert feed.desc == '!DESC2!'
    assert feed[0].title == '!ITEM_TITLE2!'
    assert feed[0].link == '!ITEM_LINK2!'
    assert '!ITEM_DESC2!' in feed[0].desc
    assert '!ITEM_CONTENT2!' in feed[0].content


def check_add(feed):
    feed.append({
        'title': '!ITEM_TITLE3!',
        'link': '!ITEM_LINK3!',
        'desc': '!ITEM_DESC3!',
        'content': '!ITEM_CONTENT3!',
    })

    assert feed[1].title == '!ITEM_TITLE3!'
    assert feed[1].link == '!ITEM_LINK3!'
    assert '!ITEM_DESC3!' in feed[1].desc
    assert '!ITEM_CONTENT3!' in feed[1].content


each_format = pytest.mark.parametrize('url', [
    'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
    'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
])

each_check = pytest.mark.parametrize('check', [
    check_feed, check_output, check_change, check_add,
])


@each_format
@each_check
def test_parse(replay_server, url, check):
    feed = get_feed(url)
    check(feed)


@each_format
@each_check
def test_convert_rss(replay_server, url, check):
    feed = get_feed(url)
    feed = feed.convert(FeedXML)
    check(feed)


@each_format
@each_check
def test_convert_json(replay_server, url, check):
    feed = get_feed(url)
    feed = feed.convert(FeedJSON)
    check(feed)


@each_format
@each_check
def test_convert_html(replay_server, url, check):
    feed = get_feed(url)
    feed = feed.convert(FeedHTML)
    if len(feed) > 1:
        # remove the 'blank' default html item
        del feed[0]
    check(feed)


@each_format
def test_convert_csv(replay_server, url):
    # only csv output, not a csv feed, so the check differs
    feed = get_feed(url)
    output = feed.tocsv()

    assert '!ITEM_TITLE!' in output
    assert '!ITEM_LINK!' in output
    assert '!ITEM_DESC!' in output
    assert '!ITEM_CONTENT!' in output
www/.htaccess  (deleted)

@@ -1,15 +0,0 @@
-Options -Indexes
-
-ErrorDocument 403 "Access forbidden"
-ErrorDocument 404 /cgi/main.py
-ErrorDocument 500 "A very nasty bug found his way onto this very server"
-
-# Uncomment below line to turn debug on for all requests
-#SetEnv DEBUG 1
-
-# Uncomment below line to turn debug on for requests with :debug in the url
-#SetEnvIf Request_URI :debug DEBUG=1
-
-<Files ~ "\.(py|pyc|db|log)$">
-	deny from all
-</Files>

www/cgi/.htaccess  (deleted)

@@ -1,9 +0,0 @@
-order allow,deny
-
-deny from all
-
-<Files main.py>
-	allow from all
-	AddHandler cgi-script .py
-	Options +ExecCGI
-</Files>
@@ -16,6 +16,7 @@
 <title>RSS feed by morss</title>
 <meta name="viewport" content="width=device-width; initial-scale=1.0;" />
 <meta name="robots" content="noindex" />
+<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />

 <style type="text/css">
	body * {

@@ -203,7 +204,9 @@
 link of the
 <select>
	<option value="">first</option>
-	<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+	<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
+	<option value=":order=last">last</option>
+	<option value=":order=oldest">oldest</option>
 </select>
 items and
 <select>
Block a user