Compare commits: master...7f4589c578 (1 commit)
.github/workflows/default.yml (vendored, 78 lines changed)

@@ -1,78 +0,0 @@
name: default
on:
  push:
    branches:
      - master

jobs:
  test-lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en

      - name: Install dependencies
        run: pip3 install .[full] .[dev]
      - run: isort --check-only --diff .
      - run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
      - run: pytest --cov=morss tests

  python-publish:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Prepare image
        run: apt-get -y update && apt-get -y install python3-pip python3-build

      - name: Build package
        run: python3 -m build

      - name: Publish package
        uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.pypi_api_token }}

  docker-publish-deploy:
    runs-on: ubuntu-latest
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: https://github.com/docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: https://github.com/docker/setup-buildx-action@v2

      - name: Login to Docker Hub
        uses: https://github.com/docker/login-action@v2
        with:
          username: ${{ secrets.docker_user }}
          password: ${{ secrets.docker_pwd }}

      - name: Build and push
        uses: https://github.com/docker/build-push-action@v4
        with:
          context: .
          platforms: linux/amd64,linux/arm64,linux/arm/v7
          push: true
          tags: ${{ secrets.docker_repo }}

      - name: Deploy on server
        uses: https://github.com/appleboy/ssh-action@v0.1.10
        with:
          host: ${{ secrets.ssh_host }}
          username: ${{ secrets.ssh_user }}
          key: ${{ secrets.ssh_key }}
          script: morss-update
.pylintrc (50 lines changed)

@@ -1,50 +0,0 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree

[MESSAGES CONTROL]
disable=missing-function-docstring,
        missing-class-docstring,
        missing-module-docstring,
        wrong-spelling-in-comment,

[REPORTS]
reports=yes
score=yes

[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss

[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes

[VARIABLES]
allow-global-unused-variables=no
init-import=no

[FORMAT]
expected-line-ending-format=LF
indent-string='    '
max-line-length=120
max-module-lines=1000

[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case

include-naming-hint=yes

bad-names=foo, bar
good-names=i, j, k
Dockerfile (18 lines changed)

@@ -1,16 +1,8 @@
FROM alpine:edge
FROM alpine:latest

RUN apk add python3 py3-lxml py3-gunicorn py3-pip git

ADD . /app
RUN pip3 install /app

RUN set -ex; \
    apk add --no-cache --virtual .run-deps python3 py3-lxml py3-setproctitle py3-setuptools; \
    apk add --no-cache --virtual .build-deps py3-pip py3-wheel; \
    pip3 install --no-cache-dir /app[full]; \
    apk del .build-deps

USER 1000:1000

ENTRYPOINT ["/bin/sh", "/app/morss-helper"]
CMD ["run"]

HEALTHCHECK CMD /bin/sh /app/morss-helper check
CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss:cgi_standalone_app
README.md (508 lines changed)

@@ -1,14 +1,10 @@
|
||||
# Morss - Get full-text RSS feeds
|
||||
|
||||
[Homepage](https://morss.it/) •
|
||||
[Upstream source code](https://git.pictuga.com/pictuga/morss) •
|
||||
[Github mirror](https://github.com/pictuga/morss) (for Issues & Pull requests)
|
||||
_GNU AGPLv3 code_
|
||||
|
||||
[](https://ci.pictuga.com/pictuga/morss)
|
||||
[](https://github.com/pictuga/morss/stargazers)
|
||||
[](https://github.com/pictuga/morss/network/members)
|
||||
[](https://git.pictuga.com/pictuga/morss/src/branch/master/LICENSE)
|
||||
[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
|
||||
Upstream source code: https://git.pictuga.com/pictuga/morss
|
||||
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
|
||||
Homepage: https://morss.it/
|
||||
|
||||
This tool's goal is to get full-text RSS feeds out of stripped RSS feeds,
commonly available on the internet. Indeed most newspapers only make a small
|
||||
@@ -22,7 +18,7 @@ Morss also provides additional features, such as: .csv and json export, extended
|
||||
control over output. A strength of morss is its ability to deal with broken
|
||||
feeds, and to replace tracking links with direct links to the actual content.
|
||||
|
||||
Morss can also generate feeds from html and json files (see `feeds.py`), which
|
||||
Morss can also generate feeds from html and json files (see `feedify.py`), which
|
||||
for instance makes it possible to get feeds for Facebook or Twitter, using
|
||||
hand-written rules (ie. there's no automatic detection of links to build feeds).
|
||||
Please mind that feeds based on html files may stop working unexpectedly, due to
|
||||
@@ -33,7 +29,6 @@ Additionally morss can detect rss feeds in html pages' `<meta>`.
|
||||
You can use this program online for free at **[morss.it](https://morss.it/)**.
|
||||
|
||||
Some features of morss:
|
||||
|
||||
- Read RSS/Atom feeds
|
||||
- Create RSS feeds from json/html pages
|
||||
- Export feeds as RSS/JSON/CSV/HTML
|
||||
@@ -41,213 +36,77 @@ Some features of morss:
|
||||
- Follow 301/meta redirects
|
||||
- Recover xml feeds with corrupt encoding
|
||||
- Supports gzip-compressed http content
|
||||
- HTTP caching with different backends (in-memory/redis/diskcache)
|
||||
- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
|
||||
- Works as server/cli tool
|
||||
- Deobfuscate various tracking links
|
||||
|
||||
## Install
|
||||
## Dependencies
|
||||
|
||||
### Python package
|
||||
You do need:
|
||||
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://pypi.org/project/morss/)
|
||||
[](https://pypistats.org/packages/morss)
|
||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
||||
- [lxml](http://lxml.de/) for xml parsing
|
||||
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
|
||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
||||
- [chardet](https://pypi.python.org/pypi/chardet)
|
||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
||||
- pymysql
|
||||
|
||||
Simple install (without optional dependencies)
|
||||
|
||||
From pip
|
||||
Simplest way to get these:
|
||||
|
||||
```shell
|
||||
pip install morss
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git@master
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git
|
||||
```
|
||||
|
||||
Full installation (including optional dependencies)
|
||||
|
||||
From pip
|
||||
|
||||
```shell
|
||||
pip install morss[full]
|
||||
```
|
||||
|
||||
From git
|
||||
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
|
||||
```
|
||||
|
||||
The full install includes all the cache backends. Otherwise, only in-memory
|
||||
cache is available. The full install also includes gunicorn (for more efficient
|
||||
HTTP handling).
|
||||
|
||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||
C code needs to be compiled). If possible on your distribution, try installing
|
||||
it with the system package manager.
|
||||
|
||||
### Docker
|
||||
You may also need:
|
||||
|
||||
].status&url=https://ci.pictuga.com/api/repos/pictuga/morss/builds/latest)
|
||||
[](https://hub.docker.com/r/pictuga/morss)
|
||||
[](https://hub.docker.com/r/pictuga/morss/tags)
|
||||
- Apache, with python-cgi support, to run on a server
|
||||
- a fast internet connection
|
||||
|
||||
From docker hub
|
||||
## Arguments
|
||||
|
||||
With cli
|
||||
morss accepts some arguments, to lightly alter the output of morss. Arguments
|
||||
may need to have a value (usually a string or a number). In the different "Use
|
||||
cases" below is detailed how to pass those arguments to morss.
|
||||
|
||||
```shell
|
||||
docker pull pictuga/morss
|
||||
```
|
||||
The arguments are:
|
||||
|
||||
With docker-compose **(recommended)**
|
||||
- Change what morss does
|
||||
- `json`: output as JSON
|
||||
- `html`: output as HTML
- `csv`: output as CSV
|
||||
- `proxy`: doesn't fill the articles
|
||||
- `clip`: stick the full article content under the original feed content (useful for twitter)
|
||||
- `search=STRING`: does a basic case-sensitive search in the feed
|
||||
- Advanced
|
||||
- `csv`: export to csv
|
||||
- `indent`: returns indented XML or JSON, takes more place, but human-readable
|
||||
- `nolink`: drop links, but keeps links' inner text
|
||||
- `noref`: drop items' link
|
||||
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
|
||||
- `debug`: to have some feedback from the script execution. Useful for debugging
|
||||
- `theforce`: force download the rss feed and ignore cached http errors
|
||||
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
|
||||
- http server only
|
||||
- `callback=NAME`: for JSONP calls
|
||||
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
|
||||
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
|
||||
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
|
||||
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
|
||||
- `item_link`: xpath rule relative to `items` to point to the entry's link
|
||||
- `item_title`: entry's title
|
||||
- `item_content`: entry's description
|
||||
- `item_time`: entry's date & time (accepts a wide range of time formats)
|
||||
|
||||
```yml
|
||||
services:
|
||||
app:
|
||||
image: pictuga/morss
|
||||
ports:
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Build from source
|
||||
|
||||
With cli
|
||||
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
|
||||
```
|
||||
|
||||
With docker-compose
|
||||
|
||||
```yml
|
||||
services:
|
||||
app:
|
||||
build: https://git.pictuga.com/pictuga/morss.git
|
||||
image: morss
|
||||
ports:
|
||||
- '8000:8000'
|
||||
```
|
||||
|
||||
Then execute
|
||||
|
||||
```shell
|
||||
docker-compose build --no-cache --pull
|
||||
```
|
||||
|
||||
### Cloud providers
|
||||
|
||||
One-click deployment:
|
||||
|
||||
[](https://heroku.com/deploy?template=https://github.com/pictuga/morss)
|
||||
[](https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git)
|
||||
|
||||
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
|
||||
|
||||
``` yml
|
||||
#cloud-config
|
||||
|
||||
packages:
|
||||
- python3-pip
|
||||
- python3-wheel
|
||||
- python3-lxml
|
||||
- python3-setproctitle
|
||||
- ca-certificates
|
||||
|
||||
write_files:
|
||||
- path: /etc/environment
|
||||
append: true
|
||||
content: |
|
||||
DEBUG=1
|
||||
CACHE=diskcache
|
||||
CACHE_SIZE=1073741824 # 1GiB
|
||||
- path: /var/lib/cloud/scripts/per-boot/morss.sh
|
||||
permissions: 744
|
||||
content: |
|
||||
#!/bin/sh
|
||||
/usr/local/bin/morss-helper daemon
|
||||
|
||||
runcmd:
|
||||
- source /etc/environment
|
||||
- update-ca-certificates
|
||||
- iptables -I INPUT 6 -m state --state NEW -p tcp --dport ${PORT:-8000} -j ACCEPT
|
||||
- netfilter-persistent save
|
||||
- pip install morss[full]
|
||||
```
|
||||
|
||||
## Run
|
||||
## Use cases
|
||||
|
||||
morss will auto-detect what "mode" to use.
|
||||
|
||||
### Running on/as a server
|
||||
|
||||
Set up the server as indicated below, then visit:
|
||||
|
||||
```
|
||||
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
|
||||
```
|
||||
|
||||
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
The `main.py` part is only needed if your server doesn't support the Apache
|
||||
redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
|
||||
other clients.
|
||||
|
||||
|
||||
#### Using Docker
|
||||
|
||||
From docker hub
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 pictuga/morss
|
||||
```
|
||||
|
||||
From source
|
||||
|
||||
```shell
|
||||
docker run -p 8000:8000 morss
|
||||
```
|
||||
|
||||
With docker-compose **(recommended)**
|
||||
|
||||
```shell
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
#### Using Gunicorn
|
||||
|
||||
```shell
|
||||
gunicorn --preload morss
|
||||
```
|
||||
|
||||
#### Using uWSGI
|
||||
|
||||
Running this command should do:
|
||||
|
||||
```shell
|
||||
uwsgi --http :8000 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
|
||||
The latter should start when you run morss without any argument, on port 8000.
|
||||
I'd highly recommend using gunicorn or something similar for better
|
||||
performance.
|
||||
|
||||
```shell
|
||||
morss
|
||||
```
|
||||
|
||||
You can change the port using environment variables like this `PORT=9000 morss`.
|
||||
|
||||
### Running on a server
|
||||
#### Via mod_cgi/FastCGI with Apache/nginx
|
||||
|
||||
For this, you'll want to change a bit the architecture of the files, for example
|
||||
@@ -276,49 +135,73 @@ For this, you need to make sure your host allows python script execution. This
|
||||
method uses HTTP calls to fetch the RSS feeds, which will be handled through
|
||||
`mod_cgi` for example on Apache severs.
|
||||
|
||||
Please pay attention to `main.py` permissions for it to be executable. See below
|
||||
some tips for the `.htaccess` file.
|
||||
Please pay attention to `main.py` permissions for it to be executable. Also
|
||||
ensure that the provided `/www/.htaccess` works well with your server.
|
||||
|
||||
```htaccess
|
||||
Options -Indexes
|
||||
#### Using uWSGI
|
||||
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
Running this command should do:
|
||||
|
||||
# Turn debug on for all requests
|
||||
SetEnv DEBUG 1
|
||||
|
||||
# Turn debug on for requests with :debug in the url
|
||||
SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
||||
```shell
|
||||
uwsgi --http :8080 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
#### Using Gunicorn
|
||||
|
||||
```shell
|
||||
gunicorn morss:cgi_standalone_app
|
||||
```
|
||||
|
||||
#### Using docker
|
||||
|
||||
Build & run
|
||||
|
||||
```shell
|
||||
docker build https://git.pictuga.com/pictuga/morss.git -t morss
|
||||
docker run -p 8080:8080 morss
|
||||
```
|
||||
|
||||
In one line
|
||||
|
||||
```shell
|
||||
docker run -p 8080:8080 $(docker build -q https://git.pictuga.com/pictuga/morss.git)
|
||||
```
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own HTTP server. The latter should start when you run morss
without any argument, on port 8080.
|
||||
|
||||
```shell
|
||||
morss
|
||||
```
|
||||
|
||||
You can change the port like this `morss 9000`.
|
||||
|
||||
#### Passing arguments
|
||||
|
||||
Then visit:
|
||||
```
|
||||
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
|
||||
```
|
||||
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
The `main.py` part is only needed if your server doesn't support the Apache redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other clients.
|
||||
|
||||
### As a CLI application
|
||||
|
||||
Run:
|
||||
|
||||
```
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
|
||||
For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
|
||||
For example: `morss debug http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
If using Docker:
|
||||
|
||||
```shell
|
||||
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
|
||||
```
|
||||
|
||||
### As a newsreader hook
|
||||
|
||||
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
|
||||
@@ -326,13 +209,10 @@ To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
|
||||
scripts can be run on top of the RSS feed, using its
|
||||
[output](http://lzone.de/liferea/scraping.htm) as an RSS feed.
|
||||
|
||||
To use this script, you have to enable "(Unix) command" in liferea feed
|
||||
settings, and use the command:
|
||||
|
||||
To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
|
||||
```
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
|
||||
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
@@ -340,7 +220,6 @@ For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
|
||||
### As a python library
|
||||
|
||||
Quickly get a full-text feed:
|
||||
|
||||
```python
|
||||
>>> import morss
|
||||
>>> xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
|
||||
@@ -349,11 +228,10 @@ Quickly get a full-text feed:
|
||||
```
|
||||
|
||||
Using cache and passing arguments:
|
||||
|
||||
```python
|
||||
>>> import morss
|
||||
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
||||
>>> cache = '/tmp/morss-cache' # diskcache cache location
|
||||
>>> cache = '/tmp/morss-cache.db' # sqlite cache location
|
||||
>>> options = {'csv':True}
|
||||
>>> xml_string = morss.process(url, cache, options)
|
||||
>>> xml_string[:50]
|
||||
@@ -365,165 +243,54 @@ possible to call the simpler functions, to have more control on what's happening
|
||||
under the hood.
|
||||
|
||||
Doing it step-by-step:
|
||||
|
||||
```python
|
||||
import morss
|
||||
import morss, morss.crawler
|
||||
|
||||
url = 'http://newspaper.example/feed.xml'
|
||||
options = morss.Options(csv=True) # arguments
|
||||
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
|
||||
|
||||
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
url = morss.UrlFix(url) # make sure the url is properly formatted
|
||||
rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
||||
|
||||
output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
|
||||
```
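
If the one-call `morss.process()` form shown earlier is enough, a small batch script can be built directly on it. This is only a sketch: it assumes morss is installed as described above, and the feed list and output file names are made up for the example.

```python
import morss

# Illustrative list of feeds to mirror locally (file names are made up)
feeds = {
    'bbc.xml': 'http://feeds.bbci.co.uk/news/rss.xml',
}

for filename, url in feeds.items():
    xml_string = morss.process(url)  # returns the full-text feed as a string
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(xml_string)
```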
|
||||
|
||||
## Arguments and settings
|
||||
## Cache information
|
||||
|
||||
### Arguments
|
||||
morss uses caching to make loading faster. There are 3 possible cache backends
|
||||
(visible in `morss/crawler.py`):
|
||||
|
||||
morss accepts some arguments, to lightly alter the output of morss. Arguments
|
||||
may need to have a value (usually a string or a number). How to pass those
|
||||
arguments to morss is explained in Run above.
|
||||
- `{}`: a simple python in-memory dict() object
|
||||
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
|
||||
be cleared every time the program is run
|
||||
- `MySQLCacheHandler`
|
||||
|
||||
The list of arguments can be obtained by running `morss --help`
|
||||
|
||||
```
|
||||
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy]
|
||||
[--order {first,last,newest,oldest}] [--firstlink] [--resolve]
|
||||
[--items XPATH] [--item_link XPATH] [--item_title XPATH]
|
||||
[--item_content XPATH] [--item_time XPATH]
|
||||
[--mode {xml,html,json}] [--nolink] [--noref] [--silent]
|
||||
url
|
||||
|
||||
Get full-text RSS feeds
|
||||
|
||||
positional arguments:
|
||||
url feed url
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--post STRING POST request
|
||||
--xpath XPATH xpath rule to manually detect the article
|
||||
|
||||
output:
|
||||
--format {rss,json,html,csv}
|
||||
output format
|
||||
--search STRING does a basic case-sensitive search in the feed
|
||||
--clip stick the full article content under the original feed
|
||||
content (useful for twitter)
|
||||
--indent returns indented XML or JSON, takes more place, but
|
||||
human-readable
|
||||
|
||||
action:
|
||||
--cache only take articles from the cache (ie. don't grab new
|
||||
articles' content), so as to save time
|
||||
--force force refetch the rss feed and articles
|
||||
--proxy doesn't fill the articles
|
||||
--order {first,last,newest,oldest}
|
||||
order in which to process items (which are however NOT
|
||||
sorted in the output)
|
||||
--firstlink pull the first article mentioned in the description
|
||||
instead of the default link
|
||||
--resolve replace tracking links with direct links to articles
|
||||
(not compatible with --proxy)
|
||||
|
||||
custom feeds:
|
||||
--items XPATH (mandatory to activate the custom feeds function)
|
||||
xpath rule to match all the RSS entries
|
||||
--item_link XPATH xpath rule relative to items to point to the entry's
|
||||
link
|
||||
--item_title XPATH entry's title
|
||||
--item_content XPATH entry's content
|
||||
--item_time XPATH entry's date & time (accepts a wide range of time
|
||||
formats)
|
||||
--mode {xml,html,json}
|
||||
parser to use for the custom feeds
|
||||
|
||||
misc:
|
||||
--nolink drop links, but keeps links' inner text
|
||||
--noref drop items' link
|
||||
--silent don't output the final RSS (useless on its own, but
|
||||
can be nice when debugging)
|
||||
|
||||
GNU AGPLv3 code
|
||||
```
|
||||
|
||||
Further HTTP-only options:
|
||||
|
||||
- `callback=NAME`: for JSONP calls
|
||||
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other
|
||||
servers)
|
||||
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
|
||||
|
||||
### Environment variables
|
||||
|
||||
To pass environment variables:
|
||||
|
||||
- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
|
||||
- docker-compose: add an `environment:` section in the .yml file
|
||||
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
|
||||
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
|
||||
- cloud-init: in the `/etc/environment` file
|
||||
|
||||
Generic:
|
||||
|
||||
- `DEBUG=1`: to have some feedback from the script execution. Useful for
|
||||
debugging.
|
||||
- `IGNORE_SSL=1`: to ignore SSL certs when fetch feeds and articles
|
||||
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `DATA_PATH`: to set custom file location for the `www` folder
|
||||
## Configuration
|
||||
### Length limitation
|
||||
|
||||
When parsing long feeds, with a lot of items (100+), morss might take a lot of
|
||||
time to parse it, or might even run into a memory overflow on some shared
|
||||
hosting plans (limits around 10Mb), in which case you might want to adjust the
|
||||
below settings via environment variables.
|
||||
different values at the top of the script.
|
||||
|
||||
Also, if the request takes too long to process, the http request might be
|
||||
discarded. See relevant config for
|
||||
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
|
||||
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
|
||||
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
|
||||
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the nexts settings.
|
||||
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether or not it's already cached). Articles beyond that limit will be dropped from the feed. `-1` for unlimited.
|
||||
- `LIM_ITEM` sets the maximum number of article checked, limiting both the number of articles fetched and taken from cache. Articles beyond that limit will be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
|
||||
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
articles, more time might be spent taking older articles from cache. `-1` for
unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
More articles will be taken from cache following the next settings.
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
(whether or not it's already cached). Articles beyond that limit will be dropped
from the feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the
number of articles fetched and taken from cache. Articles beyond that limit will
be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
### Other settings
|
||||
|
||||
morss uses caching to make loading faster. There are 3 possible cache backends:
|
||||
|
||||
- `(nothing/default)`: a simple python in-memory dict-like object.
|
||||
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
`DISKCACHE_DIR`.
|
||||
|
||||
To limit the size of the cache:
|
||||
|
||||
- `CACHE_SIZE` sets the target number of items in the cache (further items will
|
||||
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
|
||||
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
|
||||
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
|
||||
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
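
Normally these variables are simply exported in the shell, set in the docker-compose `environment:` section or in `.htaccess` (see "Environment variables" above). For a quick experiment from Python the sketch below also works, because the cache backend is picked at import time from `CACHE`, `DISKCACHE_DIR` and `CACHE_SIZE`; the path and size here are only illustrative.

```python
import os

# Must be set before importing morss, since the cache backend is chosen on import
os.environ['CACHE'] = 'diskcache'
os.environ['DISKCACHE_DIR'] = '/tmp/morss-diskcache'  # illustrative location
os.environ['CACHE_SIZE'] = str(1024 ** 3)             # 1 GiB on disk

import morss  # noqa: E402  (deliberately imported after setting the environment)

print(morss.process('http://feeds.bbci.co.uk/news/rss.xml')[:50])
```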
|
||||
|
||||
Gunicorn also accepts command line arguments via the `GUNICORN_CMD_ARGS`
|
||||
environment variable.
|
||||
- `DELAY` sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
|
||||
|
||||
### Content matching
|
||||
|
||||
The content of articles is grabbed with our own readability fork. This means
|
||||
that most of the time the right content is matched. However sometimes it fails,
|
||||
therefore some tweaking is required. Most of the time, what has to be done is to
|
||||
add some "rules" in the main script file in `readabilite.py` (not in morss).
|
||||
add some "rules" in the main script file in *readability* (not in morss).
|
||||
|
||||
Most of the time when hardly anything is matched, it means that the main content
|
||||
of the article is made of images, videos, pictures, etc., which readability
|
||||
@@ -534,3 +301,14 @@ morss will also try to figure out whether the full content is already in place
|
||||
(for those websites which understood the whole point of RSS feeds). However this
|
||||
detection is very simple, and only works if the actual content is put in the
|
||||
"content" section in the feed and not in the "summary" section.
|
||||
|
||||
***
|
||||
|
||||
## Todo
|
||||
|
||||
You can contribute to this project. If you're not sure what to do, you can pick
|
||||
from this list:
|
||||
|
||||
- Add ability to run morss.py as an update daemon
|
||||
- Add ability to use custom xpath rule instead of readability
|
||||
- More ideas here <https://github.com/pictuga/morss/issues/15>
|
||||
|
app.json (21 lines changed)

@@ -1,21 +0,0 @@
{
    "stack": "container",
    "env": {
        "DEBUG": {
            "value": 1,
            "required": false
        },
        "GUNICORN_CMD_ARGS": {
            "value": "",
            "required": false
        },
        "CACHE": {
            "value": "diskcache",
            "required": false
        },
        "CACHE_SIZE": {
            "value": 1073741824,
            "required": false
        }
    }
}
@@ -1,3 +0,0 @@
build:
  docker:
    web: Dockerfile
main.py (20 lines changed, Executable file → Normal file)

@@ -1,24 +1,6 @@
#!/usr/bin/env python

# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

from morss.__main__ import main
from morss.wsgi import application
from morss import main, cgi_standalone_app as application

if __name__ == '__main__':
    main()
morss-helper (47 lines changed)

@@ -1,47 +0,0 @@
#! /bin/sh
set -ex

if ! command -v python && command -v python3 ; then
    alias python='python3'
fi

run() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - morss
}

daemon() {
    gunicorn --bind 0.0.0.0:${PORT:-8000} --preload --access-logfile - --daemon morss
}

reload() {
    pid=$(pidof 'gunicorn: master [morss]' || true)
    # NB. requires python-setproctitle
    # `|| true` due to `set -e`

    if [ -z "$pid" ]; then
        # if gunicorn is not currently running
        daemon

    else
        kill -s USR2 $pid
        kill -s WINCH $pid
        sleep 1 # give gunicorn some time to reload
        kill -s TERM $pid

    fi
}

check() {
    python -m morss.crawler http://localhost:${PORT:-8000}/ > /dev/null 2>&1
}

if [ -z "$1" ]; then
    run

elif [ "$1" = "sh" ] || [ "$1" = "bash" ] || command -v "$1" ; then
    $@

else
    python -m morss $@

fi
@@ -1,13 +0,0 @@
[Unit]
Description=morss server (gunicorn)
After=network.target

[Service]
ExecStart=/usr/local/bin/morss-helper run
ExecReload=/usr/local/bin/morss-helper reload
KillMode=process
Restart=always
User=http

[Install]
WantedBy=multi-user.target
@@ -1,25 +1,2 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

# ran on `import morss`

# pylint: disable=unused-import,unused-variable

__version__ = ""

from .morss import *
from .wsgi import application
@@ -1,48 +1,5 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

# ran on `python -m morss`

import os
import sys

from . import cli, wsgi
from .morss import MorssException


def main():
    if 'REQUEST_URI' in os.environ:
        # mod_cgi (w/o file handler)
        wsgi.cgi_handle_request()

    elif len(sys.argv) <= 1:
        # start internal (basic) http server (w/ file handler)
        wsgi.cgi_start_server()

    else:
        # as a CLI app
        try:
            cli.cli_app()

        except (KeyboardInterrupt, SystemExit):
            raise

        except Exception as e:
            print('ERROR: %s' % e.message)
from .morss import main

if __name__ == '__main__':
    main()
morss/caching.py (122 lines changed)

@@ -1,122 +0,0 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, data)
|
||||
|
||||
|
||||
try:
|
||||
import redis # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class RedisCacheHandler(BaseCache):
|
||||
def __init__(self, host='localhost', port=6379, db=0, password=None):
|
||||
self.r = redis.Redis(host=host, port=port, db=db, password=password)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.r.get(key)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.r.set(key, data)
|
||||
|
||||
|
||||
try:
|
||||
import diskcache # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class DiskCacheHandler(BaseCache):
|
||||
def __init__(self, directory=None, **kwargs):
|
||||
self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)
|
||||
|
||||
def __del__(self):
|
||||
self.cache.close()
|
||||
|
||||
def trim(self):
|
||||
self.cache.cull()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.cache[key]
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.cache.set(key, data)
|
||||
|
||||
|
||||
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'redis':
|
||||
default_cache = RedisCacheHandler(
|
||||
host = os.getenv('REDIS_HOST', 'localhost'),
|
||||
port = int(os.getenv('REDIS_PORT', 6379)),
|
||||
db = int(os.getenv('REDIS_DB', 0)),
|
||||
password = os.getenv('REDIS_PWD', None)
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'diskcache':
|
||||
default_cache = DiskCacheHandler(
|
||||
directory = os.getenv('DISKCACHE_DIR', '/tmp/morss-diskcache'),
|
||||
size_limit = CACHE_SIZE # in Bytes
|
||||
)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
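A short usage sketch for the classes above, assuming the module is importable as `morss.caching` (as the file path suggests); it only exercises behaviour visible in the listing, and the feed URL and payload are made up.

```python
from morss.caching import CappedDict

cache = CappedDict()  # default in-memory backend, capped at CACHE_SIZE items
cache['http://example.com/feed.xml'] = b'<rss>...</rss>'

print('http://example.com/feed.xml' in cache)  # True, via BaseCache.__contains__

cache.trim()      # drop the oldest entries beyond CACHE_SIZE (default 1000)
cache.autotrim()  # repeat trim() every CACHE_LIFESPAN seconds via a daemon timer
```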
morss/cli.py (72 lines changed)

@@ -1,72 +0,0 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
from .morss import FeedFetch, FeedFormat, FeedGather, Options
|
||||
|
||||
|
||||
def cli_app():
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='morss',
|
||||
description='Get full-text RSS feeds',
|
||||
epilog='GNU AGPLv3 code'
|
||||
)
|
||||
|
||||
parser.add_argument('url', help='feed url')
|
||||
|
||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||
|
||||
group = parser.add_argument_group('output')
|
||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
|
||||
group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
|
||||
group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more place, but human-readable')
|
||||
|
||||
group = parser.add_argument_group('action')
|
||||
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
|
||||
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
|
||||
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
|
||||
group.add_argument('--order', default='first', choices=('first', 'last', 'newest', 'oldest'), help='order in which to process items (which are however NOT sorted in the output)')
|
||||
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
|
||||
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
|
||||
|
||||
group = parser.add_argument_group('custom feeds')
|
||||
group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
|
||||
group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
|
||||
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
||||
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
||||
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
||||
group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')
|
||||
|
||||
group = parser.add_argument_group('misc')
|
||||
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
||||
group.add_argument('--noref', action='store_true', help='drop items\' link')
|
||||
group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')
|
||||
|
||||
options = Options(vars(parser.parse_args()))
|
||||
url = options.url
|
||||
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options, 'unicode')
|
||||
|
||||
if not options.silent:
|
||||
print(out)
|
morss/crawler.py (702 lines changed)

@@ -1,52 +1,26 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
import zlib
|
||||
from cgi import parse_header
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import re
|
||||
import chardet
|
||||
|
||||
from .caching import default_cache
|
||||
from cgi import parse_header
|
||||
import lxml.html
|
||||
import time
|
||||
import random
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib import quote
|
||||
|
||||
from httplib import HTTPMessage
|
||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||
Request, addinfourl, build_opener, parse_http_list,
|
||||
parse_keqv_list)
|
||||
from urlparse import urlsplit
|
||||
from urlparse import urlparse, urlunparse
|
||||
import mimetools
|
||||
except ImportError:
|
||||
# python 3
|
||||
from email import message_from_string
|
||||
from http.client import HTTPMessage
|
||||
from urllib.parse import quote, urlsplit
|
||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||
HTTPRedirectHandler, Request, addinfourl,
|
||||
build_opener, parse_http_list, parse_keqv_list)
|
||||
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib.parse import quote
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import email
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -59,9 +33,7 @@ except NameError:
|
||||
MIMETYPE = {
|
||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
|
||||
'json': ['application/json'],
|
||||
}
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||
|
||||
|
||||
DEFAULT_UAS = [
|
||||
@@ -86,17 +58,14 @@ def get(*args, **kwargs):
|
||||
return adv_get(*args, **kwargs)['data']
|
||||
|
||||
|
||||
def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
def adv_get(url, timeout=None, *args, **kwargs):
|
||||
url = sanitize_url(url)
|
||||
|
||||
if post is not None:
|
||||
post = post.encode('utf-8')
|
||||
|
||||
if timeout is None:
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post)
|
||||
con = custom_handler(*args, **kwargs).open(url)
|
||||
|
||||
else:
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
|
||||
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
|
||||
|
||||
data = con.read()
|
||||
|
||||
@@ -104,7 +73,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
encoding= detect_encoding(data, con)
|
||||
|
||||
return {
|
||||
'data': data,
|
||||
'data':data,
|
||||
'url': con.geturl(),
|
||||
'con': con,
|
||||
'contenttype': contenttype,
|
||||
@@ -112,7 +81,9 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
}
|
||||
|
||||
|
||||
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||
def custom_handler(follow=None, delay=None, encoding=None):
|
||||
handlers = []
|
||||
|
||||
# as per urllib2 source code, these Handelers are added first
|
||||
# *unless* one of the custom handlers inherits from one of them
|
||||
#
|
||||
@@ -120,33 +91,21 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||
# HTTPDefaultErrorHandler, HTTPRedirectHandler,
|
||||
# FTPHandler, FileHandler, HTTPErrorProcessor]
|
||||
# & HTTPSHandler
|
||||
#
|
||||
# when processing a request:
|
||||
# (1) all the *_request are run
|
||||
# (2) the *_open are run until sth is returned (other than None)
|
||||
# (3) all the *_response are run
|
||||
#
|
||||
# During (3), if an http error occurs (i.e. not a 2XX response code), the
|
||||
# http_error_* are run until sth is returned (other than None). If they all
|
||||
# return nothing, a python error is raised
|
||||
|
||||
handlers = [
|
||||
#DebugHandler(),
|
||||
SizeLimitHandler(500*1024), # 500KiB
|
||||
HTTPCookieProcessor(),
|
||||
GZIPHandler(),
|
||||
HTTPAllRedirectHandler(),
|
||||
HTTPEquivHandler(),
|
||||
HTTPRefreshHandler(),
|
||||
UAHandler(random.choice(DEFAULT_UAS)),
|
||||
BrowserlyHeaderHandler(),
|
||||
EncodingFixHandler(),
|
||||
]
|
||||
#handlers.append(DebugHandler())
|
||||
handlers.append(SizeLimitHandler(100*1024)) # 100KiB
|
||||
handlers.append(HTTPCookieProcessor())
|
||||
handlers.append(GZIPHandler())
|
||||
handlers.append(HTTPEquivHandler())
|
||||
handlers.append(HTTPRefreshHandler())
|
||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||
handlers.append(BrowserlyHeaderHandler())
|
||||
handlers.append(EncodingFixHandler(encoding))
|
||||
|
||||
if follow:
|
||||
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
||||
|
||||
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
|
||||
handlers.append(CacheHandler(force_min=delay))
|
||||
|
||||
return build_opener(*handlers)
|
||||
|
||||
@@ -163,90 +122,28 @@ def is_ascii(string):
|
||||
return True
|
||||
|
||||
|
||||
def soft_quote(string):
|
||||
" url-quote only when not a valid ascii string "
|
||||
|
||||
if is_ascii(string):
|
||||
return string
|
||||
|
||||
else:
|
||||
return quote(string.encode('utf-8'))
|
||||
|
||||
|
||||
def sanitize_url(url):
|
||||
# make sure the url is unicode, i.e. not bytes
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode('utf-8')
|
||||
url = url.decode()
|
||||
|
||||
# make sure there's a protocol (http://)
|
||||
if url.split(':', 1)[0] not in PROTOCOL:
|
||||
url = 'http://' + url
|
||||
|
||||
# turns out some websites have really badly fomatted urls (fix http:/badurl)
|
||||
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||
|
||||
# escape spaces
|
||||
url = url.replace(' ', '%20')
|
||||
|
||||
# escape non-ascii unicode characters
|
||||
parts = urlsplit(url)
|
||||
# Escape non-ascii unicode characters
|
||||
# https://stackoverflow.com/a/4391299
|
||||
parts = list(urlparse(url))
|
||||
|
||||
parts = parts._replace(
|
||||
netloc=parts.netloc.replace(
|
||||
parts.hostname,
|
||||
parts.hostname.encode('idna').decode('ascii')
|
||||
),
|
||||
path=soft_quote(parts.path),
|
||||
query=soft_quote(parts.query),
|
||||
fragment=soft_quote(parts.fragment),
|
||||
)
|
||||
for i in range(len(parts)):
|
||||
if not is_ascii(parts[i]):
|
||||
if i == 1:
|
||||
parts[i] = parts[i].encode('idna').decode('ascii')
|
||||
|
||||
return parts.geturl()
|
||||
else:
|
||||
parts[i] = quote(parts[i].encode('utf-8'))
|
||||
|
||||
|
||||
class RespDataHandler(BaseHandler):
|
||||
" Make it easier to use the reponse body "
|
||||
|
||||
def data_reponse(self, req, resp, data):
|
||||
pass
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# read data
|
||||
data = resp.read()
|
||||
|
||||
# process data and use returned content (if any)
|
||||
data = self.data_response(req, resp, data) or data
|
||||
|
||||
# reformat the stuff
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class RespStrHandler(RespDataHandler):
|
||||
" Make it easier to use the _decoded_ reponse body "
|
||||
|
||||
def str_reponse(self, req, resp, data_str):
|
||||
pass
|
||||
|
||||
def data_response(self, req, resp, data):
|
||||
#decode
|
||||
enc = detect_encoding(data, resp)
|
||||
data_str = data.decode(enc, 'replace')
|
||||
|
||||
#process
|
||||
data_str = self.str_response(req, resp, data_str)
|
||||
|
||||
# return
|
||||
data = data_str.encode(enc) if data_str is not None else data
|
||||
|
||||
#return
|
||||
return data
|
||||
return urlunparse(parts)
|
||||
|
||||
|
||||
class DebugHandler(BaseHandler):
|
||||
@@ -269,7 +166,7 @@ class SizeLimitHandler(BaseHandler):
|
||||
|
||||
handler_order = 450
|
||||
|
||||
def __init__(self, limit=5*1024**2):
|
||||
def __init__(self, limit=5*1024^2):
|
||||
self.limit = limit
|
||||
|
||||
def http_response(self, req, resp):
|
||||
@@ -290,23 +187,35 @@ def UnGzip(data):
|
||||
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
||||
|
||||
|
||||
class GZIPHandler(RespDataHandler):
|
||||
class GZIPHandler(BaseHandler):
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||
return req
|
||||
|
||||
def data_response(self, req, resp, data):
|
||||
def http_response(self, req, resp):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||
data = resp.read()
|
||||
|
||||
data = UnGzip(data)
|
||||
|
||||
resp.headers['Content-Encoding'] = 'identity'
|
||||
|
||||
return UnGzip(data)
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
https_request = http_request
|
||||
|
||||
|
||||
def detect_encoding(data, resp=None):
|
||||
enc = detect_raw_encoding(data, resp)
|
||||
|
||||
if enc.lower() == 'gb2312':
|
||||
if enc == 'gb2312':
|
||||
enc = 'gbk'
|
||||
|
||||
return enc
|
||||
@@ -337,9 +246,32 @@ def detect_raw_encoding(data, resp=None):
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
class EncodingFixHandler(RespStrHandler):
|
||||
def str_response(self, req, resp, data_str):
|
||||
return data_str
|
||||
class EncodingFixHandler(BaseHandler):
|
||||
def __init__(self, encoding=None):
|
||||
self.encoding = encoding
|
||||
|
||||
def http_response(self, req, resp):
|
||||
maintype = resp.info().get('Content-Type', '').split('/')[0]
|
||||
if 200 <= resp.code < 300 and maintype == 'text':
|
||||
data = resp.read()
|
||||
|
||||
if not self.encoding:
|
||||
enc = detect_encoding(data, resp)
|
||||
else:
|
||||
enc = self.encoding
|
||||
|
||||
if enc:
|
||||
data = data.decode(enc, 'replace')
|
||||
data = data.encode(enc)
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class UAHandler(BaseHandler):
|
||||
@@ -365,58 +297,60 @@ class BrowserlyHeaderHandler(BaseHandler):
|
||||
https_request = http_request
|
||||
|
||||
|
||||
def iter_html_tag(html_str, tag_name):
|
||||
" To avoid parsing whole pages when looking for a simple tag "
|
||||
|
||||
re_tag = r'<%s\s+[^>]+>' % tag_name
|
||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||
|
||||
for tag_match in re.finditer(re_tag, html_str):
|
||||
attr_match = re.findall(re_attr, tag_match.group(0))
|
||||
|
||||
if attr_match is not None:
|
||||
yield dict(attr_match)
|
||||
|
||||
|
||||
class AlternateHandler(RespStrHandler):
|
||||
class AlternateHandler(BaseHandler):
|
||||
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
|
||||
|
||||
def __init__(self, follow=None):
|
||||
self.follow = follow or []
|
||||
|
||||
def str_response(self, req, resp, data_str):
|
||||
def http_response(self, req, resp):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
|
||||
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
|
||||
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
|
||||
|
||||
for link in iter_html_tag(data_str[:10000], 'link'):
|
||||
if (link.get('rel') == 'alternate'
|
||||
and link.get('type') in self.follow
|
||||
and 'href' in link):
|
||||
data = resp.read()
|
||||
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
|
||||
|
||||
for link in links:
|
||||
if link.get('type', '') in self.follow:
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
break
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
class HTTPEquivHandler(RespStrHandler):
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class HTTPEquivHandler(BaseHandler):
|
||||
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
|
||||
|
||||
handler_order = 600
|
||||
|
||||
def str_response(self, req, resp, data_str):
|
||||
def http_response(self, req, resp):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
|
||||
data = resp.read()
|
||||
|
||||
for meta in iter_html_tag(data_str[:10000], 'meta'):
|
||||
if 'http-equiv' in meta and 'content' in meta:
|
||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
|
||||
|
||||
for header in headers:
|
||||
resp.headers[header.get('http-equiv').lower()] = header.get('content')
|
||||
|
||||
class HTTPAllRedirectHandler(HTTPRedirectHandler):
|
||||
def http_error_308(self, req, fp, code, msg, headers):
|
||||
return self.http_error_301(req, fp, 301, msg, headers)
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class HTTPRefreshHandler(BaseHandler):
|
||||
@@ -425,7 +359,7 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
def http_response(self, req, resp):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('refresh'):
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
|
||||
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
|
||||
match = re.search(regex, resp.headers.get('refresh'))
|
||||
|
||||
if match:
|
||||
@@ -441,105 +375,139 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response


def parse_headers(text=u'\n\n'):
if sys.version_info[0] >= 3:
# python 3
return message_from_string(text, _class=HTTPMessage)

else:
# python 2
return HTTPMessage(StringIO(text))


def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg
return resp

default_cache = {}

class CacheHandler(BaseHandler):
" Cache based on etags/last-modified "

privacy = 'private' # Websites can indicate whether the page should be cached
# by CDNs (e.g. shouldn't be the case for
# private/confidential/user-specific pages. With this
# setting, decide whether you want the cache to behave
# like a CDN (i.e. don't cache private pages, 'public'),
# or to behave like a end-user private pages
# ('private'). If unsure, 'public' is the safest bet,
# but many websites abuse this feature...

# NB. This overrides all the other min/max/policy settings.
private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
handler_order = 499

def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache
self.force_min = force_min
self.force_max = force_max
self.policy = policy # can be cached/refresh/offline/None (default)

# Servers indicate how long they think their content is "valid". With
# this parameter (force_min/max, expressed in seconds), we can override
# the validity period (i.e. bypassing http headers)
# Special choices, via "policy":
# cached: use the cache no matter what (and fetch the page online if
# not present in cache)
# refresh: valid zero second, i.e. force refresh
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead
# None: just follow protocols

# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache

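A hedged sketch of how a handler with these parameters might be wired up, assuming it plugs into urllib like any standard handler (the handler_order/http_open/http_response methods above suggest exactly that); the actual opener construction used by morss may differ:

    try:
        # python 2
        from urllib2 import build_opener
    except ImportError:
        # python 3
        from urllib.request import build_opener

    # trust cached copies younger than 5 minutes, refetch anything older than
    # an hour, and follow HTTP caching headers in between
    opener = build_opener(CacheHandler(force_min=5*60, force_max=60*60))

    # never hit the network: answer from the cache or return the 409 error response
    offline_opener = build_opener(CacheHandler(policy='offline'))
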
def load(self, url):
|
||||
try:
|
||||
data = pickle.loads(self.cache[url])
|
||||
|
||||
out = list(self.cache[url])
|
||||
except KeyError:
|
||||
data = None
|
||||
out = [None, None, unicode(), bytes(), 0]
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
out[2] = email.message_from_string(out[2] or unicode()) # headers
|
||||
else:
|
||||
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
|
||||
|
||||
return out
|
||||
|
||||
def save(self, url, code, msg, headers, data, timestamp):
|
||||
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
|
||||
|
||||
def http_request(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
|
||||
if 'etag' in headers:
|
||||
req.add_unredirected_header('If-None-Match', headers['etag'])
|
||||
|
||||
if 'last-modified' in headers:
|
||||
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
|
||||
|
||||
return req
|
||||
|
||||
def http_open(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
cache_age = time.time() - timestamp
|
||||
|
||||
# list in a simple way what to do when
|
||||
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
|
||||
# we're just in the middle of a dirty trick, use cache
|
||||
pass
|
||||
|
||||
elif self.force_min == -2:
|
||||
if code is not None:
|
||||
# already in cache, perfect, use cache
|
||||
pass
|
||||
|
||||
else:
|
||||
headers['Morss'] = 'from_cache'
|
||||
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
|
||||
resp.msg = 'Conflict'
|
||||
return resp
|
||||
|
||||
elif code is None:
|
||||
# cache empty, refresh
|
||||
return None
|
||||
|
||||
elif self.force_min == -1:
|
||||
# force use cache
|
||||
pass
|
||||
|
||||
elif self.force_min == 0:
|
||||
# force refresh
|
||||
return None
|
||||
|
||||
elif code == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
|
||||
# use force_min=0 if you want to bypass this (needed for a proper refresh)
|
||||
pass
|
||||
|
||||
elif self.force_min is None and ('no-cache' in cc_list
|
||||
or 'no-store' in cc_list
|
||||
or ('private' in cc_list and not self.private_cache)):
|
||||
# kindly follow web servers indications, refresh
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust him, if not, use force_min=0), use cache
|
||||
pass
|
||||
|
||||
elif self.force_min is not None and self.force_min > cache_age:
|
||||
# still recent enough for us, use cache
|
||||
pass
|
||||
|
||||
else:
|
||||
data['headers'] = parse_headers(data['headers'] or unicode())
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
|
||||
return data
|
||||
# return the cache as a response
|
||||
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
|
||||
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
|
||||
resp.msg = msg
|
||||
|
||||
def save(self, key, data):
|
||||
data['headers'] = unicode(data['headers'])
|
||||
self.cache[key] = pickle.dumps(data, 0)
|
||||
return resp
|
||||
|
||||
def cached_response(self, req, fallback=None):
|
||||
req.from_morss_cache = True
|
||||
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
|
||||
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if data is not None:
|
||||
# return the cache as a response
|
||||
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
|
||||
resp.msg = data['msg']
|
||||
if resp.code == 304:
|
||||
return resp
|
||||
|
||||
else:
|
||||
return fallback
|
||||
if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
def save_response(self, req, resp):
|
||||
if req.from_morss_cache:
|
||||
# do not re-save (would reset the timing)
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
|
||||
# kindly follow web servers indications
|
||||
return resp
|
||||
|
||||
if resp.headers.get('Morss') == 'from_cache':
|
||||
# it comes from cache, so no need to save it again
|
||||
return resp
|
||||
|
||||
# save to disk
|
||||
data = resp.read()
|
||||
|
||||
self.save(req.get_full_url(), {
|
||||
'code': resp.code,
|
||||
'msg': resp.msg,
|
||||
'headers': resp.headers,
|
||||
'data': data,
|
||||
'timestamp': time.time()
|
||||
})
|
||||
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
@@ -548,140 +516,118 @@ class CacheHandler(BaseHandler):
|
||||
|
||||
return resp
|
||||
|
||||
def http_request(self, req):
|
||||
req.from_morss_cache = False # to track whether it comes from cache
|
||||
def http_error_304(self, req, fp, code, msg, headers):
|
||||
cache = list(self.load(req.get_full_url()))
|
||||
|
||||
data = self.load(req.get_full_url())
|
||||
if cache[0]:
|
||||
cache[-1] = time.time()
|
||||
self.save(req.get_full_url(), *cache)
|
||||
|
||||
if data is not None:
|
||||
if 'etag' in data['headers']:
|
||||
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
|
||||
new = Request(req.get_full_url(),
|
||||
headers=req.headers,
|
||||
unverifiable=True)
|
||||
|
||||
if 'last-modified' in data['headers']:
|
||||
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
|
||||
new.add_unredirected_header('Morss', 'from_304')
|
||||
|
||||
return req
|
||||
return self.parent.open(new, timeout=req.timeout)
|
||||
|
||||
def http_open(self, req):
|
||||
# Reminder of how/when this function is called by urllib2:
|
||||
# If 'None' is returned, try your chance with the next-available handler
|
||||
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
||||
|
||||
# Here, we try to see whether we want to use data from cache (i.e.
|
||||
# return 'resp'), or whether we want to refresh the content (return
|
||||
# 'None')
|
||||
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if data is not None:
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
|
||||
cache_control += parse_http_list(data['headers'].get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
cache_age = time.time() - data['timestamp']
|
||||
|
||||
# list in a simple way what to do in special cases
|
||||
|
||||
if data is not None and 'private' in cc_list and self.privacy == 'public':
|
||||
# private data but public cache, do not use cache
|
||||
# privacy concern, so handled first and foremost
|
||||
# (and doesn't need to be addressed anymore afterwards)
|
||||
return None
|
||||
|
||||
elif self.policy == 'offline':
|
||||
# use cache, or return an error
|
||||
return self.cached_response(
|
||||
req,
|
||||
error_response(409, 'Conflict', req.get_full_url())
|
||||
)
|
||||
|
||||
elif self.policy == 'cached':
|
||||
# use cache, or fetch online
|
||||
return self.cached_response(req, None)
|
||||
|
||||
elif self.policy == 'refresh':
|
||||
# force refresh
|
||||
return None
|
||||
|
||||
elif data is None:
|
||||
# we have already settled all the cases that don't need the cache.
|
||||
# all the following ones need the cached item
|
||||
return None
|
||||
|
||||
elif self.force_max is not None and cache_age > self.force_max:
|
||||
# older than we want, refresh
|
||||
return None
|
||||
|
||||
elif self.force_min is not None and cache_age < self.force_min:
|
||||
# recent enough, use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif data['code'] == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want
|
||||
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
||||
# if you want to bypass this (needed for a proper refresh)
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
|
||||
# kindly follow web servers indications, refresh if the same
|
||||
# settings are used all along, this section shouldn't be of any use,
|
||||
# since the page woudln't be cached in the first place the check is
|
||||
# only performed "just in case"
|
||||
# NB. NOT respected if force_min is set
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust him, if not, use overrides), use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
else:
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
|
||||
|
||||
if resp.code == 304 and resp.url in self.cache:
|
||||
# we are hopefully the first after the HTTP handler, so no need
|
||||
# to re-run all the *_response
|
||||
# here: cached page, returning from cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
|
||||
# kindly follow web servers indications (do not save & return)
|
||||
return resp
|
||||
|
||||
else:
|
||||
# save
|
||||
return self.save_response(req, resp)
|
||||
|
||||
else:
|
||||
return self.save_response(req, resp)
|
||||
return None
|
||||
|
||||
https_request = http_request
|
||||
https_open = http_open
|
||||
https_response = http_response
|
||||
|
||||
|
||||
if 'IGNORE_SSL' in os.environ:
|
||||
import ssl
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
class BaseCache:
""" Subclasses must behave like a dict """

def __contains__(self, url):
try:
self[url]

except KeyError:
return False

else:
return True


import sqlite3


class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)

with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')

def __del__(self):
self.con.close()

def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()

if not row:
raise KeyError

return row[1:]

def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
value = list(value)
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)

if url in self:
with self.con:
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
value + (url,))

else:
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)

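BaseCache only asks for dict-like access (__getitem__ raising KeyError, __setitem__, plus the inherited __contains__), so the simplest conceivable backend is a wrapper around a plain dict. A purely illustrative sketch (MemoryCache is not a name from this codebase):

    class MemoryCache(BaseCache):
        " Illustrative in-memory backend: delegate everything to a dict "

        def __init__(self):
            self.store = {}

        def __getitem__(self, url):
            return self.store[url]   # raises KeyError when missing, as expected

        def __setitem__(self, url, value):   # value = (code, msg, headers, data, timestamp)
            self.store[url] = value
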
import pymysql.cursors
|
||||
|
||||
|
||||
class MySQLCacheHandler(BaseCache):
|
||||
def __init__(self, user, password, database, host='localhost'):
|
||||
self.user = user
|
||||
self.password = password
|
||||
self.database = database
|
||||
self.host = host
|
||||
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
|
||||
|
||||
def cursor(self):
|
||||
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
||||
|
||||
def __getitem__(self, url):
|
||||
cursor = self.cursor()
|
||||
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
|
||||
if url in self:
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
|
||||
value + (url,))
|
||||
|
||||
else:
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `req`')
|
||||
|
||||
else:
|
||||
if not sys.flags.interactive:
|
||||
print(req['data'].decode(req['encoding']))
|
||||
|
@@ -73,7 +73,7 @@ item_updated = atom03:updated
|
||||
mode = json
|
||||
|
||||
mimetype = application/json
|
||||
timeformat = %Y-%m-%dT%H:%M:%S%z
|
||||
timeformat = %Y-%m-%dT%H:%M:%SZ
|
||||
base = {}
|
||||
|
||||
title = title
|
||||
@@ -90,6 +90,9 @@ item_updated = updated
|
||||
[html]
|
||||
mode = html
|
||||
|
||||
path =
|
||||
http://localhost/
|
||||
|
||||
title = //div[@id='header']/h1
|
||||
desc = //div[@id='header']/p
|
||||
items = //div[@id='content']/div
|
||||
|
251
morss/feeds.py
251
morss/feeds.py
@@ -1,45 +1,32 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
|
||||
import re
|
||||
import json
|
||||
import csv
|
||||
|
||||
from fnmatch import fnmatch
|
||||
|
||||
import dateutil.parser
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
from lxml import etree
|
||||
from dateutil import tz
|
||||
import dateutil.parser
|
||||
from copy import deepcopy
|
||||
|
||||
import lxml.html
|
||||
from .readabilite import parse as html_parse
|
||||
from .util import *
|
||||
|
||||
json.encoder.c_make_encoder = None
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from ConfigParser import RawConfigParser
|
||||
from StringIO import StringIO
|
||||
from ConfigParser import RawConfigParser
|
||||
except ImportError:
|
||||
# python 3
|
||||
from configparser import RawConfigParser
|
||||
from io import StringIO
|
||||
from configparser import RawConfigParser
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -51,7 +38,7 @@ except NameError:
|
||||
|
||||
def parse_rules(filename=None):
|
||||
if not filename:
|
||||
filename = pkg_path('feedify.ini')
|
||||
filename = os.path.join(os.path.dirname(__file__), 'feedify.ini')
|
||||
|
||||
config = RawConfigParser()
|
||||
config.read(filename)
|
||||
@@ -65,10 +52,18 @@ def parse_rules(filename=None):
|
||||
# for each rule
|
||||
|
||||
if rules[section][arg].startswith('file:'):
|
||||
path = data_path('www', rules[section][arg][5:])
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
elif '\n' in rules[section][arg]:
|
||||
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
||||
@@ -76,25 +71,20 @@ def parse_rules(filename=None):
|
||||
return rules
|
||||
|
||||
|
||||
def parse(data, url=None, encoding=None, ruleset=None):
|
||||
def parse(data, url=None, encoding=None):
|
||||
" Determine which ruleset to use "
|
||||
|
||||
if ruleset is not None:
|
||||
rulesets = [ruleset]
|
||||
|
||||
else:
|
||||
rulesets = parse_rules().values()
|
||||
|
||||
rulesets = parse_rules()
|
||||
parsers = [FeedXML, FeedHTML, FeedJSON]
|
||||
|
||||
# 1) Look for a ruleset based on path
|
||||
|
||||
if url is not None:
|
||||
for ruleset in rulesets:
|
||||
for ruleset in rulesets.values():
|
||||
if 'path' in ruleset:
|
||||
for path in ruleset['path']:
|
||||
if fnmatch(url, path):
|
||||
parser = [x for x in parsers if x.mode == ruleset.get('mode')][0] # FIXME what if no mode specified?
|
||||
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||
return parser(data, ruleset, encoding=encoding)
|
||||
|
||||
# 2) Try each and every parser
|
||||
@@ -104,6 +94,9 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||
# 3b) See if .items matches anything
|
||||
|
||||
for parser in parsers:
|
||||
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehands
|
||||
|
||||
try:
|
||||
feed = parser(data, encoding=encoding)
|
||||
|
||||
@@ -114,17 +107,13 @@ def parse(data, url=None, encoding=None, ruleset=None):
|
||||
else:
|
||||
# parsing worked, now we try the rulesets
|
||||
|
||||
ruleset_candidates = [x for x in rulesets if x.get('mode') in (parser.mode, None) and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehands
|
||||
# try anyway if no 'mode' specified
|
||||
|
||||
for ruleset in ruleset_candidates:
|
||||
feed.rules = ruleset
|
||||
|
||||
try:
|
||||
feed.items[0]
|
||||
|
||||
except (AttributeError, IndexError, TypeError):
|
||||
except (AttributeError, IndexError):
|
||||
# parsing and or item picking did not work out
|
||||
pass
|
||||
|
||||
@@ -187,12 +176,11 @@ class ParserBase(object):
|
||||
return self.convert(FeedHTML).tostring(**k)
|
||||
|
||||
def convert(self, TargetParser):
|
||||
target = TargetParser()
|
||||
|
||||
if type(self) == TargetParser and self.rules == target.rules:
|
||||
# check both type *AND* rules (e.g. when going from freeform xml to rss)
|
||||
if type(self) == TargetParser:
|
||||
return self
|
||||
|
||||
target = TargetParser()
|
||||
|
||||
for attr in target.dic:
|
||||
if attr == 'items':
|
||||
for item in self.items:
|
||||
@@ -331,7 +319,7 @@ class ParserXML(ParserBase):
|
||||
return self.root.getparent().remove(self.root)
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
return etree.tostring(self.root, encoding=encoding, method='xml', **k)
|
||||
return etree.tostring(self.root, encoding=encoding, **k)
|
||||
|
||||
def _rule_parse(self, rule):
|
||||
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
|
||||
@@ -361,13 +349,7 @@ class ParserXML(ParserBase):
|
||||
|
||||
def rule_search_all(self, rule):
|
||||
try:
|
||||
match = self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
if isinstance(match, str):
|
||||
# some xpath rules return a single string instead of an array (e.g. concatenate() )
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
return self.root.xpath(rule, namespaces=self.NSMAP)
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -430,7 +412,7 @@ class ParserXML(ParserBase):
|
||||
|
||||
match = self.rule_search(rrule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules.get('mode') == 'html') \
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if key is not None:
|
||||
@@ -441,7 +423,7 @@ class ParserXML(ParserBase):
|
||||
self._clean_node(match)
|
||||
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
||||
|
||||
if self.rules.get('mode') == 'html':
|
||||
if self.rules['mode'] == 'html':
|
||||
match.find('div').drop_tag() # not supported by lxml.etree
|
||||
|
||||
else: # i.e. if atom
|
||||
@@ -457,7 +439,7 @@ class ParserXML(ParserBase):
|
||||
def rule_str(self, rule):
|
||||
match = self.rule_search(rule)
|
||||
|
||||
html_rich = ('atom' in rule or self.mode == 'html') \
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if isinstance(match, etree._Element):
|
||||
@@ -481,7 +463,7 @@ class ParserHTML(ParserXML):
|
||||
return html_parse(raw, encoding=self.encoding)
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)
|
||||
return lxml.html.tostring(self.root, encoding=encoding, **k)
|
||||
|
||||
def rule_search_all(self, rule):
|
||||
try:
|
||||
@@ -490,14 +472,7 @@ class ParserHTML(ParserXML):
|
||||
repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
|
||||
rule = re.sub(pattern, repl, rule)
|
||||
|
||||
match = self.root.xpath(rule)
|
||||
|
||||
if isinstance(match, str):
|
||||
# for some xpath rules, see XML parser
|
||||
return [match,]
|
||||
|
||||
else:
|
||||
return match
|
||||
return self.root.xpath(rule)
|
||||
|
||||
except etree.XPathEvalError:
|
||||
return []
|
||||
@@ -516,31 +491,24 @@ class ParserHTML(ParserXML):
|
||||
|
||||
|
||||
def parse_time(value):
|
||||
# parsing per se
|
||||
if value is None or value == 0:
|
||||
time = None
|
||||
return None
|
||||
|
||||
elif isinstance(value, basestring):
|
||||
if re.match(r'^[0-9]+$', value):
|
||||
time = datetime.fromtimestamp(int(value))
|
||||
return datetime.fromtimestamp(int(value), tz.tzutc())
|
||||
|
||||
else:
|
||||
time = dateutil.parser.parse(value)
|
||||
return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
|
||||
|
||||
elif isinstance(value, int):
|
||||
time = datetime.fromtimestamp(value)
|
||||
return datetime.fromtimestamp(value, tz.tzutc())
|
||||
|
||||
elif isinstance(value, datetime):
|
||||
time = value
|
||||
return value
|
||||
|
||||
else:
|
||||
time = None
|
||||
|
||||
# add default time zone if none set
|
||||
if time is not None and time.tzinfo is None:
|
||||
time = time.replace(tzinfo=tz.tzutc())
|
||||
|
||||
return time
|
||||
return None
|
||||
|
||||
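Illustrative calls against the parse_time() shown above (input values are made up; in both variants the result ends up as a timezone-aware datetime, UTC by default):

    parse_time(None)                       # -> None
    parse_time(0)                          # -> None, 0 is treated as "no date"
    parse_time('1651363200')               # digit-only string -> datetime.fromtimestamp(...)
    parse_time('2022-05-01T00:00:00Z')     # any other string -> dateutil.parser.parse(...)
    parse_time(1651363200)                 # plain int timestamp
    parse_time(datetime.now(tz.tzutc()))   # datetime objects pass through
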
|
||||
class ParserJSON(ParserBase):
|
||||
@@ -641,41 +609,34 @@ class ParserJSON(ParserBase):
|
||||
return out.replace('\n', '<br/>') if out else out
|
||||
|
||||
|
||||
def wrap_uniq(wrapper_fn_name):
|
||||
" Wraps the output of the function with the specified function "
|
||||
# This is called when parsing "wrap_uniq('wrap_item')"
|
||||
class Uniq(object):
|
||||
_map = {}
|
||||
_id = None
|
||||
|
||||
def decorator(func):
|
||||
# This is called when parsing "@wrap_uniq('wrap_item')"
|
||||
def __new__(cls, *args, **kwargs):
|
||||
# check if a wrapper was already created for it
|
||||
# if so, reuse it
|
||||
# if not, create a new one
|
||||
# note that the item itself (the tree node) is created beforehands
|
||||
|
||||
def wrapped_func(self, *args, **kwargs):
|
||||
# This is called when the wrapped function is called
|
||||
tmp_id = cls._gen_id(*args, **kwargs)
|
||||
if tmp_id in cls._map:
|
||||
return cls._map[tmp_id]
|
||||
|
||||
output = func(self, *args, **kwargs)
|
||||
output_id = id(output)
|
||||
|
||||
try:
|
||||
return self._map[output_id]
|
||||
|
||||
except (KeyError, AttributeError):
|
||||
if not hasattr(self, '_map'):
|
||||
self._map = {}
|
||||
|
||||
wrapper_fn = getattr(self, wrapper_fn_name)
|
||||
obj = wrapper_fn(output)
|
||||
self._map[output_id] = obj
|
||||
|
||||
return obj
|
||||
|
||||
return wrapped_func
|
||||
|
||||
return decorator
|
||||
else:
|
||||
obj = object.__new__(cls) #, *args, **kwargs)
|
||||
cls._map[tmp_id] = obj
|
||||
return obj
|
||||
|
||||
|
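To make the memoising decorator above concrete, a hedged toy example outside the feed classes (Tree and its dict wrapper are invented; wrap_item and @wrap_uniq('wrap_item') mirror how the Feed class below uses it): repeated calls that resolve to the same underlying object get the same wrapper instance back.

    class Tree(object):
        def __init__(self, nodes):
            self.nodes = nodes            # the raw underlying objects

        def wrap_item(self, node):
            return {'wrapped': node}      # stand-in for ItemXML/ItemJSON/...

        @wrap_uniq('wrap_item')
        def __getitem__(self, key):
            return self.nodes[key]

    tree = Tree(['a', 'b'])
    assert tree[0] is tree[0]             # same raw object -> same cached wrapper
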
||||
class Feed(object):
|
||||
itemsClass = property(lambda x: Item) # because Item is define below, i.e. afterwards
|
||||
itemsClass = 'Item'
|
||||
dic = ('title', 'desc', 'items')
|
||||
|
||||
def wrap_items(self, items):
|
||||
itemsClass = globals()[self.itemsClass]
|
||||
return [itemsClass(x, self.rules, self) for x in items]
|
||||
|
||||
title = property(
|
||||
lambda f: f.get('title'),
|
||||
lambda f,x: f.set('title', x),
|
||||
@@ -691,7 +652,10 @@ class Feed(object):
|
||||
self.rule_create(self.rules['items'])
|
||||
item = self.items[-1]
|
||||
|
||||
for attr in self.itemsClass.dic:
|
||||
if new is None:
|
||||
return
|
||||
|
||||
for attr in globals()[self.itemsClass].dic:
|
||||
try:
|
||||
setattr(item, attr, getattr(new, attr))
|
||||
|
||||
@@ -699,17 +663,11 @@ class Feed(object):
|
||||
try:
|
||||
setattr(item, attr, new[attr])
|
||||
|
||||
except (KeyError, IndexError, TypeError):
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
return item
|
||||
|
||||
def wrap_item(self, item):
|
||||
return self.itemsClass(item, self.rules, self)
|
||||
|
||||
@wrap_uniq('wrap_item')
|
||||
def __getitem__(self, key):
|
||||
return self.get_raw('items')[key]
|
||||
return self.wrap_items(self.get_raw('items'))[key]
|
||||
|
||||
def __delitem__(self, key):
|
||||
self[key].remove()
|
||||
@@ -718,7 +676,7 @@ class Feed(object):
|
||||
return len(self.get_raw('items'))
|
||||
|
||||
|
||||
class Item(object):
|
||||
class Item(Uniq):
|
||||
dic = ('title', 'link', 'desc', 'content', 'time', 'updated')
|
||||
|
||||
def __init__(self, xml=None, rules=None, parent=None):
|
||||
@@ -757,45 +715,32 @@ class Item(object):
|
||||
lambda f: f.rmv('item_updated') )
|
||||
|
||||
|
||||
class FeedXML(Feed, ParserXML):
|
||||
itemsClass = 'ItemXML'
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
# override needed due to "getroottree" inclusion
|
||||
|
||||
if self.root.getprevious() is None:
|
||||
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
|
||||
|
||||
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
|
||||
|
||||
|
||||
class ItemXML(Item, ParserXML):
|
||||
pass
|
||||
|
||||
|
||||
class FeedXML(Feed, ParserXML):
|
||||
itemsClass = ItemXML
|
||||
|
||||
def root_siblings(self):
|
||||
out = []
|
||||
current = self.root.getprevious()
|
||||
|
||||
while current is not None:
|
||||
out.append(current)
|
||||
current = current.getprevious()
|
||||
|
||||
return out
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
# override needed due to "getroottree" inclusion
|
||||
# and to add stylesheet
|
||||
|
||||
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
|
||||
|
||||
for stylesheet in stylesheets:
|
||||
# remove all stylesheets present (be that ours or others')
|
||||
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
|
||||
self.root.remove(stylesheet)
|
||||
|
||||
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
|
||||
|
||||
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
|
||||
class FeedHTML(Feed, ParserHTML):
|
||||
itemsClass = 'ItemHTML'
|
||||
|
||||
|
||||
class ItemHTML(Item, ParserHTML):
|
||||
pass
|
||||
|
||||
|
||||
class FeedHTML(Feed, ParserHTML):
|
||||
itemsClass = ItemHTML
|
||||
class FeedJSON(Feed, ParserJSON):
|
||||
itemsClass = 'ItemJSON'
|
||||
|
||||
|
||||
class ItemJSON(Item, ParserJSON):
|
||||
@@ -810,21 +755,13 @@ class ItemJSON(Item, ParserJSON):
|
||||
|
||||
cur = cur[node]
|
||||
|
||||
class FeedJSON(Feed, ParserJSON):
|
||||
itemsClass = ItemJSON
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
|
||||
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `feed`')
|
||||
|
||||
else:
|
||||
if not sys.flags.interactive:
|
||||
for item in feed.items:
|
||||
print(item.title, item.link)
|
||||
|
532
morss/morss.py
532
morss/morss.py
@@ -1,66 +1,72 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import os.path
|
||||
|
||||
import time
|
||||
from datetime import datetime
|
||||
from dateutil import tz
|
||||
|
||||
from fnmatch import fnmatch
|
||||
import re
|
||||
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
|
||||
from . import caching, crawler, feeds, readabilite
|
||||
from . import feeds
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.handlers
|
||||
import cgitb
|
||||
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from httplib import HTTPException
|
||||
from urlparse import parse_qs, urljoin, urlparse
|
||||
from urllib import unquote
|
||||
from urlparse import urlparse, urljoin, parse_qs
|
||||
except ImportError:
|
||||
# python 3
|
||||
from http.client import HTTPException
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
from urllib.parse import unquote
|
||||
from urllib.parse import urlparse, urljoin, parse_qs
|
||||
|
||||
MAX_ITEM = 5 # cache-only beyond
|
||||
MAX_TIME = 2 # cache-only after (in sec)
|
||||
|
||||
LIM_ITEM = 10 # deletes what's beyond
|
||||
LIM_TIME = 2.5 # deletes what's after
|
||||
|
||||
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
|
||||
TIMEOUT = 4 # http timeout (in sec)
|
||||
|
||||
DEBUG = False
|
||||
PORT = 8080
|
||||
|
||||
|
||||
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
||||
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
|
||||
def filterOptions(options):
|
||||
return options
|
||||
|
||||
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
|
||||
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
|
||||
# example of filtering code below
|
||||
|
||||
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
|
||||
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
|
||||
#allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug']
|
||||
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
|
||||
|
||||
#return filtered
|
||||
|
||||
|
||||
class MorssException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def log(txt):
|
||||
if 'DEBUG' in os.environ:
|
||||
def log(txt, force=False):
|
||||
if DEBUG or force:
|
||||
if 'REQUEST_URI' in os.environ:
|
||||
# when running on Apache
|
||||
open('morss.log', 'a').write("%s\n" % repr(txt))
|
||||
|
||||
else:
|
||||
# when using internal server or cli
|
||||
print(repr(txt), file=sys.stderr)
|
||||
print(repr(txt))
|
||||
|
||||
|
||||
def len_html(txt):
|
||||
@@ -87,12 +93,12 @@ class Options:
|
||||
else:
|
||||
self.options = options or {}
|
||||
|
||||
def __getattr__(self, key, default=None):
|
||||
def __getattr__(self, key):
|
||||
if key in self.options:
|
||||
return self.options[key]
|
||||
|
||||
else:
|
||||
return default
|
||||
return False
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.options[key] = value
|
||||
@@ -100,10 +106,31 @@ class Options:
|
||||
def __contains__(self, key):
|
||||
return key in self.options
|
||||
|
||||
get = __getitem__ = __getattr__
|
||||
|
||||
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}

for option in options:
split = option.split('=', 1)

if len(split) > 1:
if split[0].lower() == 'true':
out[split[0]] = True

elif split[0].lower() == 'false':
out[split[0]] = False

else:
out[split[0]] = split[1]

else:
out[split[0]] = True

return out

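For illustration, the round trip from raw option tokens (as they arrive from the CLI arguments or the ':'-prefixed URL options) to the Options object defined above:

    raw = ['clip', 'format=json', 'indent']   # made-up input, e.g. sys.argv[1:-1]
    options = Options(parseOptions(raw))

    options.clip     # True: a bare token becomes a boolean flag
    options.format   # 'json': a key=value token keeps its string value
    options.proxy    # absent keys fall back to the default (None or False, depending on the version shown)
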
def ItemFix(item, options, feedurl='/'):
|
||||
def ItemFix(item, feedurl='/'):
|
||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||
|
||||
# check unwanted uppercase title
|
||||
@@ -122,13 +149,6 @@ def ItemFix(item, options, feedurl='/'):
|
||||
item.link = match[0]
|
||||
log(item.link)
|
||||
|
||||
# at user's election, use first <a>
|
||||
if options.firstlink and (item.desc or item.content):
|
||||
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
|
||||
if len(match):
|
||||
item.link = match[0]
|
||||
log(item.link)
|
||||
|
||||
# check relative urls
|
||||
item.link = urljoin(feedurl, item.link)
|
||||
|
||||
@@ -190,25 +210,45 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
|
||||
if not item.link:
|
||||
log('no link')
|
||||
return True
|
||||
return item
|
||||
|
||||
log(item.link)
|
||||
|
||||
link = item.link
|
||||
|
||||
# twitter
|
||||
if urlparse(feedurl).netloc == 'twitter.com':
|
||||
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
|
||||
if len(match):
|
||||
link = match[0]
|
||||
log(link)
|
||||
|
||||
else:
|
||||
link = None
|
||||
|
||||
# facebook
|
||||
if urlparse(feedurl).netloc == 'graph.facebook.com':
|
||||
match = lxml.html.fromstring(item.content).xpath('//a/@href')
|
||||
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
|
||||
link = match[0]
|
||||
log(link)
|
||||
|
||||
else:
|
||||
link = None
|
||||
|
||||
if link is None:
|
||||
log('no used link')
|
||||
return True
|
||||
|
||||
# download
|
||||
delay = -1
|
||||
|
||||
if fast or options.cache:
|
||||
# force cache, don't fetch
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
# force refresh
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
policy = None
|
||||
if fast:
|
||||
# super-fast mode
|
||||
delay = -2
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)
|
||||
req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException) as e:
|
||||
log('http error')
|
||||
@@ -218,18 +258,11 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
log('non-text page')
|
||||
return True
|
||||
|
||||
if not req['data']:
|
||||
log('empty page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
|
||||
if options.resolve:
|
||||
item.link = req['url']
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -246,7 +279,7 @@ def ItemBefore(item, options):
|
||||
|
||||
def ItemAfter(item, options):
|
||||
if options.clip and item.desc and item.content:
|
||||
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content
|
||||
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
|
||||
del item.desc
|
||||
|
||||
if options.nolink and item.content:
|
||||
@@ -254,7 +287,7 @@ def ItemAfter(item, options):
|
||||
for link in content.xpath('//a'):
|
||||
log(link.text_content())
|
||||
link.drop_tag()
|
||||
item.content = lxml.etree.tostring(content, method='html')
|
||||
item.content = lxml.etree.tostring(content)
|
||||
|
||||
if options.noref:
|
||||
item.link = ''
|
||||
@@ -266,43 +299,33 @@ def FeedFetch(url, options):
|
||||
# fetch feed
|
||||
delay = DELAY
|
||||
|
||||
if options.cache:
|
||||
policy = 'offline'
|
||||
|
||||
elif options.force:
|
||||
policy = 'refresh'
|
||||
|
||||
else:
|
||||
policy = None
|
||||
if options.theforce:
|
||||
delay = 0
|
||||
|
||||
try:
|
||||
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT)
|
||||
req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
|
||||
|
||||
except (IOError, HTTPException):
|
||||
raise MorssException('Error downloading feed')
|
||||
|
||||
if options.items:
|
||||
# using custom rules
|
||||
ruleset = {}
|
||||
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
|
||||
|
||||
ruleset['items'] = options.items
|
||||
rss.rules['title'] = options.title if options.title else '//head/title'
|
||||
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
|
||||
|
||||
if options.mode:
|
||||
ruleset['mode'] = options.mode
|
||||
rss.rules['items'] = options.items
|
||||
|
||||
ruleset['title'] = options.get('title', '//head/title')
|
||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
||||
|
||||
ruleset['item_title'] = options.get('item_title', '.')
|
||||
ruleset['item_link'] = options.get('item_link', '(.|.//a|ancestor::a)/@href')
|
||||
rss.rules['item_title'] = options.item_title if options.item_title else './/a|.'
|
||||
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href'
|
||||
|
||||
if options.item_content:
|
||||
ruleset['item_content'] = options.item_content
|
||||
rss.rules['item_content'] = options.item_content
|
||||
|
||||
if options.item_time:
|
||||
ruleset['item_time'] = options.item_time
|
||||
rss.rules['item_time'] = options.item_time
|
||||
|
||||
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
|
||||
rss = rss.convert(feeds.FeedXML)
|
||||
|
||||
else:
|
||||
@@ -316,7 +339,7 @@ def FeedFetch(url, options):
|
||||
log(req['contenttype'])
|
||||
raise MorssException('Link provided is not a valid feed')
|
||||
|
||||
return req['url'], rss
|
||||
return rss
|
||||
|
||||
|
||||
def FeedGather(rss, url, options):
|
||||
@@ -332,23 +355,9 @@ def FeedGather(rss, url, options):
|
||||
if options.cache:
|
||||
max_time = 0
|
||||
|
||||
# sort
|
||||
sorted_items = list(rss.items)
|
||||
|
||||
if options.order == 'last':
|
||||
# `first` does nothing from a practical standpoint, so only `last` needs
|
||||
# to be addressed
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
elif options.order in ['newest', 'oldest']:
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(sorted_items, key=lambda x:x.updated or x.time or now) # oldest to newest
|
||||
|
||||
if options.order == 'newest':
|
||||
sorted_items = reversed(sorted_items)
|
||||
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
|
||||
for i, item in enumerate(sorted_items):
|
||||
# hard cap
|
||||
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
|
||||
log('dropped')
|
||||
item.remove()
|
||||
@@ -359,9 +368,8 @@ def FeedGather(rss, url, options):
|
||||
if item is None:
|
||||
continue
|
||||
|
||||
item = ItemFix(item, options, url)
|
||||
item = ItemFix(item, url)
|
||||
|
||||
# soft cap
|
||||
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
||||
if not options.proxy:
|
||||
if ItemFill(item, options, url, True) is False:
|
||||
@@ -396,24 +404,24 @@ def FeedFormat(rss, options, encoding='utf-8'):
|
||||
else:
|
||||
raise MorssException('Invalid callback var name')
|
||||
|
||||
elif options.format == 'json':
|
||||
elif options.json:
|
||||
if options.indent:
|
||||
return rss.tojson(encoding=encoding, indent=4)
|
||||
|
||||
else:
|
||||
return rss.tojson(encoding=encoding)
|
||||
|
||||
elif options.format == 'csv':
|
||||
elif options.csv:
|
||||
return rss.tocsv(encoding=encoding)
|
||||
|
||||
elif options.format == 'html':
|
||||
elif options.html:
|
||||
if options.indent:
|
||||
return rss.tohtml(encoding=encoding, pretty_print=True)
|
||||
|
||||
else:
|
||||
return rss.tohtml(encoding=encoding)
|
||||
|
||||
else: # i.e. format == 'rss'
|
||||
else:
|
||||
if options.indent:
|
||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
|
||||
|
||||
@@ -428,9 +436,307 @@ def process(url, cache=None, options=None):
|
||||
options = Options(options)
|
||||
|
||||
if cache:
|
||||
caching.default_cache = caching.DiskCacheHandler(cache)
|
||||
crawler.default_cache = crawler.SQLiteCache(cache)
|
||||
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
|
||||
return FeedFormat(rss, options, 'unicode')
|
||||
|
||||
|
||||
def cgi_parse_environ(environ):
|
||||
# get options
|
||||
|
||||
if 'REQUEST_URI' in environ:
|
||||
url = environ['REQUEST_URI'][1:]
|
||||
else:
|
||||
url = environ['PATH_INFO'][1:]
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
|
||||
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
split = url.split('/', 1)
|
||||
|
||||
raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
|
||||
|
||||
if len(split) > 1:
|
||||
url = split[1]
|
||||
else:
|
||||
url = ''
|
||||
|
||||
else:
|
||||
raw_options = []
|
||||
|
||||
# init
|
||||
options = Options(filterOptions(parseOptions(raw_options)))
|
||||
|
||||
global DEBUG
|
||||
DEBUG = options.debug
|
||||
|
||||
return (url, options)
|
||||
|
||||
|
||||
def cgi_app(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
headers = {}
|
||||
|
||||
# headers
|
||||
headers['status'] = '200 OK'
|
||||
headers['cache-control'] = 'max-age=%s' % DELAY
|
||||
|
||||
if options.cors:
|
||||
headers['access-control-allow-origin'] = '*'
|
||||
|
||||
if options.html:
|
||||
headers['content-type'] = 'text/html'
|
||||
elif options.txt or options.silent:
|
||||
headers['content-type'] = 'text/plain'
|
||||
elif options.json:
|
||||
headers['content-type'] = 'application/json'
|
||||
elif options.callback:
|
||||
headers['content-type'] = 'application/javascript'
|
||||
elif options.csv:
|
||||
headers['content-type'] = 'text/csv'
|
||||
headers['content-disposition'] = 'attachment; filename="feed.csv"'
|
||||
else:
|
||||
headers['content-type'] = 'text/xml'
|
||||
|
||||
headers['content-type'] += '; charset=utf-8'
|
||||
|
||||
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
|
||||
|
||||
# get the work done
|
||||
rss = FeedFetch(url, options)
|
||||
|
||||
if headers['content-type'] == 'text/xml':
|
||||
headers['content-type'] = rss.mimetype[0]
|
||||
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options)
|
||||
|
||||
if options.silent:
|
||||
return ['']
|
||||
|
||||
else:
|
||||
return [out]
|
||||
|
||||
|
||||
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code

def app_builder(app):
# This is called when doing app = cgi_wrapper(app)

def app_wrap(environ, start_response):
# This is called when a http request is being processed

return func(environ, start_response, app)

return app_wrap

return app_builder

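A hedged example of the decorator above in action: a tiny middleware that appends a response header (the header name is made up for illustration). It follows the same pattern as cgi_dispatcher, cgi_error_handler and cgi_encode further down.

    @middleware
    def add_server_header(environ, start_response, app):
        # wrap start_response so every response gains one extra header
        def patched_start_response(status, headers, exc_info=None):
            return start_response(status, headers + [('X-Powered-By', 'morss')], exc_info)

        return app(environ, patched_start_response)

    # app = add_server_header(app)   # used exactly like the other middlewares below
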
@middleware
|
||||
def cgi_file_handler(environ, start_response, app):
|
||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||
|
||||
files = {
|
||||
'': 'text/html',
|
||||
'index.html': 'text/html',
|
||||
'sheet.xsl': 'text/xsl'}
|
||||
|
||||
if 'REQUEST_URI' in environ:
|
||||
url = environ['REQUEST_URI'][1:]
|
||||
|
||||
else:
|
||||
url = environ['PATH_INFO'][1:]
|
||||
|
||||
if url in files:
|
||||
headers = {}
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
paths = [os.path.join(sys.prefix, 'share/morss/www', url),
|
||||
os.path.join(os.path.dirname(__file__), '../www', url)]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
body = open(path, 'rb').read()
|
||||
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = files[url]
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [body]
|
||||
|
||||
except IOError:
|
||||
continue
|
||||
|
||||
else:
|
||||
# the for loop did not return, so here we are, i.e. no file found
|
||||
headers['status'] = '404 Not found'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return ['Error %s' % headers['status']]
|
||||
|
||||
else:
|
||||
return app(environ, start_response)
|
||||
|
||||
|
||||
def cgi_get(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
# get page
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
|
||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||
if options.get == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
|
||||
|
||||
elif options.get == 'article':
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
|
||||
else:
|
||||
raise MorssException('no :get option passed')
|
||||
|
||||
else:
|
||||
output = req['data']
|
||||
|
||||
# return html page
|
||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [output]
|
||||
|
||||
|
||||
dispatch_table = {
|
||||
'get': cgi_get,
|
||||
}
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_dispatcher(environ, start_response, app):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
for key in dispatch_table.keys():
|
||||
if key in options:
|
||||
return dispatch_table[key](environ, start_response)
|
||||
|
||||
return app(environ, start_response)
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_error_handler(environ, start_response, app):
|
||||
try:
|
||||
return app(environ, start_response)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
headers = {'status': '500 Oops', 'content-type': 'text/html'}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR: %s' % repr(e), force=True)
|
||||
return [cgitb.html(sys.exc_info())]
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_encode(environ, start_response, app):
|
||||
out = app(environ, start_response)
|
||||
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
|
||||
|
||||
|
||||
cgi_standalone_app = cgi_encode(cgi_error_handler(cgi_dispatcher(cgi_file_handler(cgi_app))))
|
||||
|
||||
|
||||
def cli_app():
|
||||
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
|
||||
url = sys.argv[-1]
|
||||
|
||||
global DEBUG
|
||||
DEBUG = options.debug
|
||||
|
||||
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
|
||||
|
||||
rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options, 'unicode')
|
||||
|
||||
if not options.silent:
|
||||
print(out)
|
||||
|
||||
log('done')
|
||||
|
||||
|
||||
def isInt(string):
|
||||
try:
|
||||
int(string)
|
||||
return True
|
||||
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
if 'REQUEST_URI' in os.environ:
|
||||
# mod_cgi
|
||||
|
||||
app = cgi_app
|
||||
app = cgi_dispatcher(app)
|
||||
app = cgi_error_handler(app)
|
||||
app = cgi_encode(app)
|
||||
|
||||
wsgiref.handlers.CGIHandler().run(app)
|
||||
|
||||
elif len(sys.argv) <= 1 or isInt(sys.argv[1]):
|
||||
# start internal (basic) http server
|
||||
|
||||
if len(sys.argv) > 1 and isInt(sys.argv[1]):
|
||||
argPort = int(sys.argv[1])
|
||||
if argPort > 0:
|
||||
port = argPort
|
||||
|
||||
else:
|
||||
raise MorssException('Port must be positive integer')
|
||||
|
||||
else:
|
||||
port = PORT
|
||||
|
||||
app = cgi_app
|
||||
app = cgi_file_handler(app)
|
||||
app = cgi_dispatcher(app)
|
||||
app = cgi_error_handler(app)
|
||||
app = cgi_encode(app)
|
||||
|
||||
print('Serving http://localhost:%s/' % port)
|
||||
httpd = wsgiref.simple_server.make_server('', port, app)
|
||||
httpd.serve_forever()
|
||||
|
||||
else:
|
||||
# as a CLI app
|
||||
try:
|
||||
cli_app()
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
print('ERROR: %s' % e)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,36 +1,19 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
import bs4.builder._lxml
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
import lxml.html.soupparser
|
||||
|
||||
|
||||
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
|
||||
def default_parser(self, encoding):
|
||||
return lxml.html.HTMLParser(target=self, remove_comments=True, remove_pis=True, encoding=encoding)
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
|
||||
def parse(data, encoding=None):
|
||||
kwargs = {'from_encoding': encoding} if encoding else {}
|
||||
return lxml.html.soupparser.fromstring(data, builder=CustomTreeBuilder, **kwargs)
|
||||
if encoding:
|
||||
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
|
||||
|
||||
else:
|
||||
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
||||
|
||||
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
|
||||
|
||||
return lxml.html.fromstring(data, parser=parser)
|
||||
|
||||
|
||||
def count_words(string):
|
||||
@@ -43,8 +26,6 @@ def count_words(string):
|
||||
if string is None:
|
||||
return 0
|
||||
|
||||
string = string.strip()
|
||||
|
||||
i = 0
|
||||
count = 0
|
||||
|
||||
@@ -66,6 +47,12 @@ def count_content(node):
|
||||
return count_words(node.text_content()) + len(node.findall('.//img'))
|
||||
|
||||
|
||||
def percentile(N, P):
|
||||
# https://stackoverflow.com/a/7464107
|
||||
n = max(int(round(P * len(N) + 0.5)), 2)
|
||||
return N[n-2]
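
Illustration only (not part of the commit): percentile() just indexes into a list the caller has already sorted, so the direction is the caller's choice; get_article() below feeds it the descending score ranking, where P=0.1 lands near the top scores.

from morss.readabilite import percentile   # assumed import path for the module shown in this diff

ranked = [9.0, 7.5, 4.2, 3.1, 1.0]          # descending, like the ranked scores
print(percentile(ranked, 0.1))              # n = max(round(0.1*5 + 0.5), 2) = 2  ->  ranked[0] = 9.0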
|
||||
|
||||
|
||||
class_bad = ['comment', 'community', 'extra', 'foot',
|
||||
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
|
||||
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
|
||||
@@ -114,7 +101,7 @@ def score_node(node):
|
||||
" Score individual node "
|
||||
|
||||
score = 0
|
||||
class_id = (node.get('class') or '') + (node.get('id') or '')
|
||||
class_id = node.get('class', '') + node.get('id', '')
|
||||
|
||||
if (isinstance(node, lxml.html.HtmlComment)
|
||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||
@@ -144,7 +131,7 @@ def score_node(node):
|
||||
|
||||
if wc != 0:
|
||||
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
|
||||
score = score * ( 1 - 2 * float(wca)/wc )
|
||||
score = score * ( 1 - float(wca)/wc )
|
||||
|
||||
return score
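
A worked number (illustration only) for the link-density penalty in either variant above: with wc = 100 words in the node and wca = 40 of them inside <a> tags, the milder factor keeps 60% of the score, while the doubled one keeps only 20% and turns negative once more than half the words are link text.

wc, wca = 100, 40
print(1 - float(wca)/wc)        # 0.6  -> keeps 60% of the node score
print(1 - 2 * float(wca)/wc)    # 0.2  -> harsher variant, negative when wca/wc > 0.5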
|
||||
|
||||
@@ -154,20 +141,15 @@ def score_all(node):
|
||||
|
||||
for child in node:
|
||||
score = score_node(child)
|
||||
set_score(child, score, 'morss_own_score')
|
||||
child.attrib['morss_own_score'] = str(float(score))
|
||||
|
||||
if score > 0 or len(list(child.iterancestors())) <= 2:
|
||||
spread_score(child, score)
|
||||
score_all(child)
|
||||
|
||||
|
||||
def set_score(node, value, label='morss_score'):
|
||||
try:
|
||||
node.attrib[label] = str(float(value))
|
||||
|
||||
except KeyError:
|
||||
# catch issues with e.g. html comments
|
||||
pass
|
||||
def set_score(node, value):
|
||||
node.attrib['morss_score'] = str(float(value))
|
||||
|
||||
|
||||
def get_score(node):
|
||||
@@ -207,12 +189,6 @@ def clean_root(root, keep_threshold=None):
|
||||
def clean_node(node, keep_threshold=None):
|
||||
parent = node.getparent()
|
||||
|
||||
# remove comments
|
||||
if (isinstance(node, lxml.html.HtmlComment)
|
||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
if parent is None:
|
||||
# this is <html/> (or a removed element waiting for GC)
|
||||
return
|
||||
@@ -222,8 +198,8 @@ def clean_node(node, keep_threshold=None):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# high score, so keep
|
||||
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
||||
# high score, so keep
|
||||
return
|
||||
|
||||
gdparent = parent.getparent()
|
||||
@@ -244,6 +220,11 @@ def clean_node(node, keep_threshold=None):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove comments
|
||||
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# remove if too many kids & too high link density
|
||||
wc = count_words(node.text_content())
|
||||
if wc != 0 and len(list(node.iter())) > 3:
|
||||
@@ -301,95 +282,74 @@ def clean_node(node, keep_threshold=None):
|
||||
gdparent.insert(gdparent.index(parent)+1, new_node)
|
||||
|
||||
|
||||
def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
||||
ancestors_a = list(node_a.iterancestors())
|
||||
ancestors_b = list(node_b.iterancestors())
|
||||
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
||||
ancestorsA = list(nodeA.iterancestors())
|
||||
ancestorsB = list(nodeB.iterancestors())
|
||||
|
||||
if max_depth is not None:
|
||||
ancestors_a = ancestors_a[:max_depth]
|
||||
ancestors_b = ancestors_b[:max_depth]
|
||||
ancestorsA = ancestorsA[:max_depth]
|
||||
ancestorsB = ancestorsB[:max_depth]
|
||||
|
||||
ancestors_a.insert(0, node_a)
|
||||
ancestors_b.insert(0, node_b)
|
||||
ancestorsA.insert(0, nodeA)
|
||||
ancestorsB.insert(0, nodeB)
|
||||
|
||||
for ancestor_a in ancestors_a:
|
||||
if ancestor_a in ancestors_b:
|
||||
return ancestor_a
|
||||
for ancestorA in ancestorsA:
|
||||
if ancestorA in ancestorsB:
|
||||
return ancestorA
|
||||
|
||||
return node_a # should always find one tho, at least <html/>, but needed for max_depth
|
||||
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
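
A self-contained sketch (not part of the commit) of what the helper returns; both spellings in this hunk keep the same function name, only the argument names differ:

import lxml.html
from morss.readabilite import lowest_common_ancestor   # assumed import path

doc = lxml.html.fromstring('<div><p><b>one</b></p><p><i>two</i></p></div>')
b = doc.findall('.//b')[0]
i = doc.findall('.//i')[0]
print(lowest_common_ancestor(b, i).tag)   # 'div' -- the first ancestor shared by both nodes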
|
||||
|
||||
|
||||
def get_best_node(html, threshold=5):
|
||||
# score all nodes
|
||||
score_all(html)
|
||||
|
||||
# rank all nodes (largest to smallest)
|
||||
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
|
||||
|
||||
# minimum threshold
|
||||
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
|
||||
return None
|
||||
|
||||
# take common ancestor or the two highest rated nodes
|
||||
if len(ranked_nodes) > 1:
|
||||
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
|
||||
|
||||
else:
|
||||
best = ranked_nodes[0]
|
||||
|
||||
return best
|
||||
def rank_grades(grades):
|
||||
# largest score to smallest
|
||||
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||
def get_best_node(ranked_grades):
|
||||
" To pick the best (raw) node. Another function will clean it "
|
||||
|
||||
if len(ranked_grades) == 1:
|
||||
return ranked_grades[0]
|
||||
|
||||
lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
|
||||
|
||||
return lowest
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
score_all(html)
|
||||
scores = rank_grades(get_all_scores(html))
|
||||
|
||||
if xpath is not None:
|
||||
xpath_match = html.xpath(xpath)
|
||||
|
||||
if len(xpath_match):
|
||||
best = xpath_match[0]
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
if best is None:
|
||||
# if threshold not met
|
||||
if not len(scores) or scores[0][1] < threshold:
|
||||
return None
|
||||
|
||||
# clean up
|
||||
best = get_best_node(scores)
|
||||
|
||||
if not debug:
|
||||
keep_threshold = get_score(best) * 3/4
|
||||
keep_threshold = percentile([x[1] for x in scores], 0.1)
|
||||
clean_root(best, keep_threshold)
|
||||
|
||||
# check for spammy content (links only)
|
||||
wc = count_words(best.text_content())
|
||||
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
||||
|
||||
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
|
||||
return None
|
||||
|
||||
# fix urls
|
||||
if url:
|
||||
best.make_links_absolute(url)
|
||||
|
||||
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)
|
||||
return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `article`')
|
||||
|
||||
else:
|
||||
if not sys.flags.interactive:
|
||||
print(article)
|
||||
|
@@ -1,57 +0,0 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
|
||||
def pkg_path(*path_elements):
|
||||
return os.path.join(os.path.dirname(__file__), *path_elements)
|
||||
|
||||
|
||||
data_path_base = None
|
||||
|
||||
|
||||
def data_path(*path_elements):
|
||||
global data_path_base
|
||||
|
||||
path = os.path.join(*path_elements)
|
||||
|
||||
if data_path_base is not None:
|
||||
return os.path.join(data_path_base, path)
|
||||
|
||||
bases = [
|
||||
os.path.join(sys.prefix, 'share/morss'), # when installed as root
|
||||
pkg_path('../../../share/morss'),
|
||||
pkg_path('../../../../share/morss'),
|
||||
pkg_path('../share/morss'), # for `pip install --target=dir morss`
|
||||
pkg_path('..'), # when running from source tree
|
||||
]
|
||||
|
||||
if 'DATA_PATH' in os.environ:
|
||||
bases.append(os.environ['DATA_PATH'])
|
||||
|
||||
for base in bases:
|
||||
full_path = os.path.join(base, path)
|
||||
|
||||
if os.path.isfile(full_path):
|
||||
data_path_base = os.path.abspath(base)
|
||||
return data_path(path)
|
||||
|
||||
else:
|
||||
raise IOError()
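
Usage sketch (illustration only): the first successful lookup caches data_path_base, so later calls skip the search through the candidate bases.

from morss import util   # assumed import path for the module shown in this diff

print(util.data_path('www', 'index.html'))   # e.g. <sys.prefix>/share/morss/www/index.html when installed
print(util.data_path_base)                   # the base that was found, reused by later calls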
|
298
morss/wsgi.py
@@ -1,298 +0,0 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import cgitb
|
||||
import mimetypes
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
import wsgiref.handlers
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.util
|
||||
|
||||
import lxml.etree
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib import unquote
|
||||
except ImportError:
|
||||
# python 3
|
||||
from urllib.parse import unquote
|
||||
|
||||
from . import caching, crawler, readabilite
|
||||
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
|
||||
MorssException, Options, log)
|
||||
from .util import data_path
|
||||
|
||||
PORT = int(os.getenv('PORT', 8000))
|
||||
|
||||
|
||||
def parse_options(options):
|
||||
""" Turns ['md=True'] into {'md':True} """
|
||||
out = {}
|
||||
|
||||
for option in options:
|
||||
split = option.split('=', 1)
|
||||
|
||||
if len(split) > 1:
|
||||
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
|
||||
|
||||
else:
|
||||
out[split[0]] = True
|
||||
|
||||
return out
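
A quick illustration (not part of the commit) of the shapes this helper produces, including the %-unquoting and the legacy | -> / substitution:

from morss.wsgi import parse_options   # assumed import path

print(parse_options(['clip', 'format=json', 'items=%2F%2Fitem', 'search=foo|bar']))
# {'clip': True, 'format': 'json', 'items': '//item', 'search': 'foo/bar'}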
|
||||
|
||||
|
||||
def request_uri(environ):
|
||||
if 'REQUEST_URI' in environ:
|
||||
# when running on Apache/uwsgi
|
||||
url = environ['REQUEST_URI']
|
||||
|
||||
elif 'RAW_URI' in environ:
|
||||
# gunicorn
|
||||
url = environ['RAW_URI']
|
||||
|
||||
else:
|
||||
# when using other servers
|
||||
url = environ['PATH_INFO']
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
|
||||
return url
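
A minimal sketch (illustration only) with hand-built environ dicts of the kind wsgiref produces:

from morss.wsgi import request_uri   # assumed import path

print(request_uri({'PATH_INFO': '/:get=page/http://example.com', 'QUERY_STRING': ''}))
# '/:get=page/http://example.com'
print(request_uri({'PATH_INFO': '/feed', 'QUERY_STRING': 'order=newest'}))
# '/feed?order=newest'
print(request_uri({'REQUEST_URI': '/feed?order=newest', 'QUERY_STRING': ''}))
# '/feed?order=newest' -- REQUEST_URI wins when Apache/uwsgi provide it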
|
||||
|
||||
|
||||
def cgi_parse_environ(environ):
|
||||
# get options
|
||||
|
||||
url = request_uri(environ)[1:]
|
||||
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
parts = url.split('/', 1)
|
||||
raw_options = parts[0].split(':')[1:]
|
||||
url = parts[1] if len(parts) > 1 else ''
|
||||
|
||||
else:
|
||||
raw_options = []
|
||||
|
||||
# init
|
||||
options = Options(parse_options(raw_options))
|
||||
|
||||
return (url, options)
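
An illustration (not part of the commit) of how the leading-colon option syntax is split away from the target URL:

from morss.wsgi import cgi_parse_environ   # assumed import path

url, options = cgi_parse_environ({'PATH_INFO': '/:format=json:cors/https://example.com/feed',
                                  'QUERY_STRING': ''})
print(url)                                  # 'https://example.com/feed'
print('cors' in options, options.format)    # True 'json' -- via the Options wrapper from .morss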
|
||||
|
||||
|
||||
def cgi_app(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
headers = {}
|
||||
|
||||
# headers
|
||||
headers['status'] = '200 OK'
|
||||
headers['cache-control'] = 'max-age=%s' % DELAY
|
||||
headers['x-content-type-options'] = 'nosniff' # safari work around
|
||||
|
||||
if options.cors:
|
||||
headers['access-control-allow-origin'] = '*'
|
||||
|
||||
if options.format == 'html':
|
||||
headers['content-type'] = 'text/html'
|
||||
elif options.txt or options.silent:
|
||||
headers['content-type'] = 'text/plain'
|
||||
elif options.format == 'json':
|
||||
headers['content-type'] = 'application/json'
|
||||
elif options.callback:
|
||||
headers['content-type'] = 'application/javascript'
|
||||
elif options.format == 'csv':
|
||||
headers['content-type'] = 'text/csv'
|
||||
headers['content-disposition'] = 'attachment; filename="feed.csv"'
|
||||
else:
|
||||
headers['content-type'] = 'text/xml'
|
||||
|
||||
headers['content-type'] += '; charset=utf-8'
|
||||
|
||||
# get the work done
|
||||
url, rss = FeedFetch(url, options)
|
||||
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options)
|
||||
|
||||
if options.silent:
|
||||
return ['']
|
||||
|
||||
else:
|
||||
return [out]
|
||||
|
||||
|
||||
def middleware(func):
|
||||
" Decorator to turn a function into a wsgi middleware "
|
||||
# This is called when parsing the "@middleware" code
|
||||
|
||||
def app_builder(app):
|
||||
# This is called when doing app = cgi_wrapper(app)
|
||||
|
||||
def app_wrap(environ, start_response):
|
||||
# This is called when a http request is being processed
|
||||
|
||||
return func(environ, start_response, app)
|
||||
|
||||
return app_wrap
|
||||
|
||||
return app_builder
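
A toy composition sketch (illustration only; add_header and some_wsgi_app are hypothetical): the decorator turns a 3-argument function into an app -> app wrapper, which is exactly how cgi_standalone_app and application are stacked elsewhere in this diff.

@middleware
def add_header(environ, start_response, app):
    # hypothetical demo middleware, not part of morss
    def patched_start(status, headers, exc_info=None):
        return start_response(status, headers + [('x-demo', '1')], exc_info)

    return app(environ, patched_start)

wrapped = add_header(some_wsgi_app)   # some_wsgi_app: any WSGI callable you already have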
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_file_handler(environ, start_response, app):
|
||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||
|
||||
url = request_uri(environ)[1:]
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
|
||||
# if it is a legitimate url (no funny relative paths)
|
||||
try:
|
||||
path = data_path('www', url)
|
||||
f = open(path, 'rb')
|
||||
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
pass
|
||||
|
||||
else:
|
||||
# file successfully open
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
|
||||
# regex didn't validate or no file found
|
||||
return app(environ, start_response)
|
||||
|
||||
|
||||
def cgi_get(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
# get page
|
||||
if options['get'] in ('page', 'article'):
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
|
||||
if req['contenttype'] in crawler.MIMETYPE['html']:
|
||||
if options['get'] == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
|
||||
else: # i.e. options['get'] == 'article'
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
|
||||
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
|
||||
output = req['data']
|
||||
|
||||
else:
|
||||
raise MorssException('unsupported mimetype')
|
||||
|
||||
else:
|
||||
raise MorssException('no :get option passed')
|
||||
|
||||
# return html page
|
||||
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [output]
|
||||
|
||||
|
||||
dispatch_table = {
|
||||
'get': cgi_get,
|
||||
}
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_dispatcher(environ, start_response, app):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
for key in dispatch_table.keys():
|
||||
if key in options:
|
||||
return dispatch_table[key](environ, start_response)
|
||||
|
||||
return app(environ, start_response)
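
For illustration: the dispatcher reuses the same URL parsing, so a request carrying a :get option short-circuits to cgi_get while every other request falls through to the wrapped feed app.

url, options = cgi_parse_environ({'PATH_INFO': '/:get=page/http://example.com', 'QUERY_STRING': ''})
print('get' in options)   # True -> dispatch_table['get'] (cgi_get) handles the request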
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_error_handler(environ, start_response, app):
|
||||
try:
|
||||
return app(environ, start_response)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR: %s' % repr(e))
|
||||
return [cgitb.html(sys.exc_info())]
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_encode(environ, start_response, app):
|
||||
out = app(environ, start_response)
|
||||
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
|
||||
|
||||
|
||||
application = cgi_app
|
||||
application = cgi_file_handler(application)
|
||||
application = cgi_dispatcher(application)
|
||||
application = cgi_error_handler(application)
|
||||
application = cgi_encode(application)
|
||||
|
||||
|
||||
def cgi_handle_request():
|
||||
app = cgi_app
|
||||
app = cgi_dispatcher(app)
|
||||
app = cgi_error_handler(app)
|
||||
app = cgi_encode(app)
|
||||
|
||||
wsgiref.handlers.CGIHandler().run(app)
|
||||
|
||||
|
||||
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
|
||||
def get_environ(self):
|
||||
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
|
||||
env['REQUEST_URI'] = self.path
|
||||
return env
|
||||
|
||||
|
||||
def cgi_start_server():
|
||||
caching.default_cache.autotrim()
|
||||
|
||||
print('Serving http://localhost:%s/' % PORT)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
||||
httpd.serve_forever()
|
||||
|
||||
|
||||
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
||||
caching.default_cache.autotrim()
|
52
setup.py
@@ -1,60 +1,24 @@
|
||||
from datetime import datetime
|
||||
from glob import glob
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def get_version():
|
||||
with open('morss/__init__.py', 'r+') as file:
|
||||
lines = file.readlines()
|
||||
|
||||
# look for hard coded version number
|
||||
for i in range(len(lines)):
|
||||
if lines[i].startswith('__version__'):
|
||||
version = lines[i].split('"')[1]
|
||||
break
|
||||
|
||||
# create (& save) one if none found
|
||||
if version == '':
|
||||
version = datetime.now().strftime('%Y%m%d.%H%M')
|
||||
lines[i] = '__version__ = "' + version + '"\n'
|
||||
|
||||
file.seek(0)
|
||||
file.writelines(lines)
|
||||
|
||||
# return version number
|
||||
return version
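
Illustration of the fallback stamp written when no hard-coded version is found (same strftime format as above):

from datetime import datetime

print(datetime.now().strftime('%Y%m%d.%H%M'))   # e.g. '20240101.1200', stored back into morss/__init__.py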
|
||||
from glob import glob
|
||||
|
||||
package_name = 'morss'
|
||||
|
||||
setup(
|
||||
name = package_name,
|
||||
version = get_version(),
|
||||
description = 'Get full-text RSS feeds',
|
||||
long_description = open('README.md').read(),
|
||||
long_description_content_type = 'text/markdown',
|
||||
author = 'pictuga',
|
||||
author_email = 'contact@pictuga.com',
|
||||
author = 'pictuga, Samuel Marks',
|
||||
author_email = 'contact at pictuga dot com',
|
||||
url = 'http://morss.it/',
|
||||
project_urls = {
|
||||
'Source': 'https://git.pictuga.com/pictuga/morss',
|
||||
'Bug Tracker': 'https://github.com/pictuga/morss/issues',
|
||||
},
|
||||
download_url = 'https://git.pictuga.com/pictuga/morss',
|
||||
license = 'AGPL v3',
|
||||
packages = [package_name],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||
extras_require = {
|
||||
'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
|
||||
'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
|
||||
},
|
||||
python_requires = '>=2.7',
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
|
||||
package_data = {package_name: ['feedify.ini']},
|
||||
data_files = [
|
||||
('share/' + package_name, ['README.md', 'LICENSE']),
|
||||
('share/' + package_name + '/www', glob('www/*.*')),
|
||||
('share/' + package_name + '/www/cgi', [])
|
||||
],
|
||||
entry_points = {
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main'],
|
||||
},
|
||||
scripts = ['morss-helper'],
|
||||
)
|
||||
'console_scripts': [package_name + '=' + package_name + ':main']
|
||||
})
|
||||
|
@@ -1,60 +0,0 @@
|
||||
import os
|
||||
import os.path
|
||||
import threading
|
||||
|
||||
import pytest
|
||||
|
||||
try:
|
||||
# python2
|
||||
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
|
||||
from SimpleHTTPServer import SimpleHTTPRequestHandler
|
||||
except:
|
||||
# python3
|
||||
from http.server import (BaseHTTPRequestHandler, HTTPServer,
|
||||
SimpleHTTPRequestHandler)
|
||||
|
||||
class HTTPReplayHandler(SimpleHTTPRequestHandler):
|
||||
" Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
|
||||
|
||||
directory = os.path.join(os.path.dirname(__file__), './samples/')
|
||||
|
||||
__init__ = BaseHTTPRequestHandler.__init__
|
||||
|
||||
def do_GET(self):
|
||||
path = self.translate_path(self.path)
|
||||
|
||||
if os.path.isdir(path):
|
||||
f = self.list_directory(path)
|
||||
|
||||
else:
|
||||
f = open(path, 'rb')
|
||||
|
||||
try:
|
||||
self.copyfile(f, self.wfile)
|
||||
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
class MuteHTTPServer(HTTPServer):
|
||||
def handle_error(self, request, client_address):
|
||||
# mute errors
|
||||
pass
|
||||
|
||||
def make_server(port=8888):
|
||||
print('Serving http://localhost:%s/' % port)
|
||||
return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
|
||||
|
||||
@pytest.fixture
|
||||
def replay_server():
|
||||
httpd = make_server()
|
||||
thread = threading.Thread(target=httpd.serve_forever)
|
||||
thread.start()
|
||||
|
||||
yield
|
||||
|
||||
httpd.shutdown()
|
||||
thread.join()
|
||||
|
||||
if __name__ == '__main__':
|
||||
httpd = make_server()
|
||||
httpd.serve_forever()
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
success
|
@@ -1,3 +0,0 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: /200-ok.txt
|
||||
|
@@ -1,3 +0,0 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: ./200-ok.txt
|
||||
|
@@ -1,3 +0,0 @@
|
||||
HTTP/1.1 301 Moved Permanently
|
||||
location: http://localhost:8888/200-ok.txt
|
||||
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 308 Permanent Redirect
|
||||
location: /200-ok.txt
|
||||
|
||||
/200-ok.txt
|
@@ -1,8 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=gb2312
|
||||
|
||||
<EFBFBD>ɹ<EFBFBD>
|
@@ -1,10 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html
|
||||
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta charset="gb2312"/></head>
|
||||
<body>
|
||||
<EFBFBD>ɹ<EFBFBD>
|
||||
</body></html>
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=iso-8859-1
|
||||
|
||||
succ<EFBFBD>s
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain
|
||||
|
||||
succ<EFBFBD>s
|
@@ -1,4 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/plain; charset=UTF-8
|
||||
|
||||
succès
|
@@ -1,16 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content type="html">!ITEM_CONTENT!</content>
|
||||
<link href="!ITEM_LINK!"/>
|
||||
<updated>2022-01-01T00:00:01+01:00</updated>
|
||||
<published>2022-01-01T00:00:02+01:00</published>
|
||||
</entry>
|
||||
</feed>
|
@@ -1,15 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: application/xml
|
||||
|
||||
<?xml version='1.0' encoding='utf-8' ?>
|
||||
<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
|
||||
<title>!TITLE!</title>
|
||||
<subtitle>!DESC!</subtitle>
|
||||
<entry>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
|
||||
<summary>!ITEM_DESC!</summary>
|
||||
<content>!ITEM_CONTENT!</content>
|
||||
<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
|
||||
</entry>
|
||||
</feed>
|
@@ -1,22 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/html; charset=utf-8
|
||||
|
||||
<html>
|
||||
<head></head>
|
||||
|
||||
<body>
|
||||
<div id="header">
|
||||
<h1>!TITLE!</h1>
|
||||
<p>!DESC!</p>
|
||||
</div>
|
||||
|
||||
<div id="content">
|
||||
<div class="item">
|
||||
<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
|
||||
<div class="desc">!ITEM_DESC!</div>
|
||||
<div class="content">!ITEM_CONTENT!</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
@@ -1,16 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json; charset=utf-8
|
||||
|
||||
{
|
||||
"title": "!TITLE!",
|
||||
"desc": "!DESC!",
|
||||
"items": [
|
||||
{
|
||||
"title": "!ITEM_TITLE!",
|
||||
"time": "2022-01-01T00:00:01+0100",
|
||||
"url": "!ITEM_LINK!",
|
||||
"desc": "!ITEM_DESC!",
|
||||
"content": "!ITEM_CONTENT!"
|
||||
}
|
||||
]
|
||||
}
|
@@ -1,17 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: text/xml; charset=utf-8
|
||||
|
||||
<?xml version='1.0' encoding='utf-8'?>
|
||||
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
<channel>
|
||||
<title>!TITLE!</title>
|
||||
<description>!DESC!</description>
|
||||
<item>
|
||||
<title>!ITEM_TITLE!</title>
|
||||
<pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
|
||||
<link>!ITEM_LINK!</link>
|
||||
<description>!ITEM_DESC!</description>
|
||||
<content:encoded>!ITEM_CONTENT!</content:encoded>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
Binary file not shown.
@@ -1,3 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
refresh: 0;url=/200-ok.txt
|
||||
|
@@ -1,8 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
@@ -1,8 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
@@ -1,8 +0,0 @@
|
||||
HTTP/1.1 200 OK
|
||||
content-type: text/html; charset=UTF-8
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
|
||||
<body>meta redirect</body>
|
||||
</html>
|
File diff suppressed because it is too large
@@ -1,62 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from morss.crawler import *
|
||||
|
||||
|
||||
def test_get(replay_server):
|
||||
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
|
||||
|
||||
def test_adv_get(replay_server):
|
||||
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
|
||||
|
||||
@pytest.mark.parametrize('before,after', [
|
||||
(b'http://localhost:8888/', 'http://localhost:8888/'),
|
||||
('localhost:8888/', 'http://localhost:8888/'),
|
||||
('http:/localhost:8888/', 'http://localhost:8888/'),
|
||||
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
|
||||
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
|
||||
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
|
||||
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
|
||||
])
|
||||
def test_sanitize_url(before, after):
|
||||
assert sanitize_url(before) == after
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
|
||||
def test_size_limit_handler(replay_server, opener):
|
||||
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
|
||||
def test_gzip_handler(replay_server, opener):
|
||||
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
|
||||
@pytest.mark.parametrize('url', [
|
||||
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
|
||||
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
|
||||
'enc-utf-8-header.txt',
|
||||
])
|
||||
def test_encoding_fix_handler(replay_server, opener, url):
|
||||
out = adv_get('http://localhost:8888/%s' % url)
|
||||
out = out['data'].decode(out['encoding'])
|
||||
assert 'succes' in out or 'succès' in out or '成功' in out
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
|
||||
def test_alternate_handler(replay_server, opener):
|
||||
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
|
||||
def test_http_equiv_handler(replay_server, opener):
|
||||
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
|
||||
def test_http_all_redirect_handler(replay_server, opener):
|
||||
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
||||
|
||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
|
||||
def test_http_refresh_handler(replay_server, opener):
|
||||
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
@@ -1,108 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from morss.crawler import adv_get
|
||||
from morss.feeds import *
|
||||
|
||||
|
||||
def get_feed(url):
|
||||
url = 'http://localhost:8888/%s' % url
|
||||
out = adv_get(url)
|
||||
feed = parse(out['data'], url=url, encoding=out['encoding'])
|
||||
return feed
|
||||
|
||||
def check_feed(feed):
|
||||
# NB. time and updated not covered
|
||||
assert feed.title == '!TITLE!'
|
||||
assert feed.desc == '!DESC!'
|
||||
assert feed[0] == feed.items[0]
|
||||
assert feed[0].title == '!ITEM_TITLE!'
|
||||
assert feed[0].link == '!ITEM_LINK!'
|
||||
assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
|
||||
assert '!ITEM_CONTENT!' in feed[0].content
|
||||
|
||||
def check_output(feed):
|
||||
output = feed.tostring()
|
||||
assert '!TITLE!' in output
|
||||
assert '!DESC!' in output
|
||||
assert '!ITEM_TITLE!' in output
|
||||
assert '!ITEM_LINK!' in output
|
||||
assert '!ITEM_DESC!' in output
|
||||
assert '!ITEM_CONTENT!' in output
|
||||
|
||||
def check_change(feed):
|
||||
feed.title = '!TITLE2!'
|
||||
feed.desc = '!DESC2!'
|
||||
feed[0].title = '!ITEM_TITLE2!'
|
||||
feed[0].link = '!ITEM_LINK2!'
|
||||
feed[0].desc = '!ITEM_DESC2!'
|
||||
feed[0].content = '!ITEM_CONTENT2!'
|
||||
|
||||
assert feed.title == '!TITLE2!'
|
||||
assert feed.desc == '!DESC2!'
|
||||
assert feed[0].title == '!ITEM_TITLE2!'
|
||||
assert feed[0].link == '!ITEM_LINK2!'
|
||||
assert '!ITEM_DESC2!' in feed[0].desc
|
||||
assert '!ITEM_CONTENT2!' in feed[0].content
|
||||
|
||||
def check_add(feed):
|
||||
feed.append({
|
||||
'title': '!ITEM_TITLE3!',
|
||||
'link': '!ITEM_LINK3!',
|
||||
'desc': '!ITEM_DESC3!',
|
||||
'content': '!ITEM_CONTENT3!',
|
||||
})
|
||||
|
||||
assert feed[1].title == '!ITEM_TITLE3!'
|
||||
assert feed[1].link == '!ITEM_LINK3!'
|
||||
assert '!ITEM_DESC3!' in feed[1].desc
|
||||
assert '!ITEM_CONTENT3!' in feed[1].content
|
||||
|
||||
each_format = pytest.mark.parametrize('url', [
|
||||
'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
|
||||
'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
|
||||
])
|
||||
|
||||
each_check = pytest.mark.parametrize('check', [
|
||||
check_feed, check_output, check_change, check_add,
|
||||
])
|
||||
|
||||
@each_format
|
||||
@each_check
|
||||
def test_parse(replay_server, url, check):
|
||||
feed = get_feed(url)
|
||||
check(feed)
|
||||
|
||||
@each_format
|
||||
@each_check
|
||||
def test_convert_rss(replay_server, url, check):
|
||||
feed = get_feed(url)
|
||||
feed = feed.convert(FeedXML)
|
||||
check(feed)
|
||||
|
||||
@each_format
|
||||
@each_check
|
||||
def test_convert_json(replay_server, url, check):
|
||||
feed = get_feed(url)
|
||||
feed = feed.convert(FeedJSON)
|
||||
check(feed)
|
||||
|
||||
@each_format
|
||||
@each_check
|
||||
def test_convert_html(replay_server, url, check):
|
||||
feed = get_feed(url)
|
||||
feed = feed.convert(FeedHTML)
|
||||
if len(feed) > 1:
|
||||
# remove the 'blank' default html item
|
||||
del feed[0]
|
||||
check(feed)
|
||||
|
||||
@each_format
|
||||
def test_convert_csv(replay_server, url):
|
||||
# only csv output, not csv feed, therefore the check is different
|
||||
feed = get_feed(url)
|
||||
output = feed.tocsv()
|
||||
|
||||
assert '!ITEM_TITLE!' in output
|
||||
assert '!ITEM_LINK!' in output
|
||||
assert '!ITEM_DESC!' in output
|
||||
assert '!ITEM_CONTENT!' in output
|
9
www/.htaccess
Normal file
@@ -0,0 +1,9 @@
|
||||
Options -Indexes
|
||||
|
||||
ErrorDocument 403 "Access forbidden"
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
ErrorDocument 500 "A very nasty bug found his way onto this very server"
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
9
www/cgi/.htaccess
Normal file
@@ -0,0 +1,9 @@
|
||||
order allow,deny
|
||||
|
||||
deny from all
|
||||
|
||||
<Files main.py>
|
||||
allow from all
|
||||
AddHandler cgi-script .py
|
||||
Options +ExecCGI
|
||||
</Files>
|
@@ -4,7 +4,6 @@
|
||||
<title>morss</title>
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
|
||||
<style type="text/css">
|
||||
body
|
||||
{
|
||||
|
17
www/logo.svg
@@ -1,17 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg width="16" height="16" viewBox="0 0 16 16" shape-rendering="crispEdges" fill="black" version="1.1" xmlns="http://www.w3.org/2000/svg">
|
||||
<rect x="2" y="4" width="2" height="2" />
|
||||
<rect x="5" y="4" width="6" height="2" />
|
||||
<rect x="12" y="4" width="2" height="2" />
|
||||
|
||||
<rect x="2" y="7" width="2" height="2" />
|
||||
<rect x="7" y="7" width="2" height="2" />
|
||||
<rect x="12" y="7" width="2" height="2" />
|
||||
|
||||
<rect x="2" y="10" width="2" height="2" />
|
||||
<rect x="7" y="10" width="2" height="2" />
|
||||
<rect x="12" y="10" width="2" height="2" />
|
||||
</svg>
|
||||
|
||||
<!-- This work by pictuga is licensed under CC BY-NC-SA 4.0. To view a copy of
|
||||
this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0 -->
|
@@ -14,23 +14,14 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>RSS feed by morss</title>
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||
<meta name="robots" content="noindex" />
|
||||
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
|
||||
|
||||
<style type="text/css">
|
||||
body * {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
overflow-wrap: anywhere;
|
||||
word-wrap: anywhere;
|
||||
word-break: break-word;
|
||||
|
||||
font-family: sans-serif;
|
||||
|
||||
-webkit-tap-highlight-color: transparent; /* safari work around */
|
||||
font-family: sans;
|
||||
}
|
||||
|
||||
input, select {
|
||||
@@ -40,8 +31,7 @@
|
||||
}
|
||||
|
||||
header {
|
||||
text-align: justify;
|
||||
text-align-last: center;
|
||||
text-align: center;
|
||||
border-bottom: 1px solid silver;
|
||||
}
|
||||
|
||||
@@ -120,6 +110,7 @@
|
||||
}
|
||||
|
||||
header > form {
|
||||
text-align: center;
|
||||
margin: 1%;
|
||||
}
|
||||
|
||||
@@ -140,10 +131,6 @@
|
||||
padding: 1%;
|
||||
}
|
||||
|
||||
.item > *:empty {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.item > :not(:last-child) {
|
||||
border-bottom: 1px solid silver;
|
||||
}
|
||||
@@ -187,28 +174,16 @@
|
||||
<select>
|
||||
<option value="">full-text</option>
|
||||
<option value=":proxy">original</option>
|
||||
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
|
||||
<option value=":clip">original + full-text</option>
|
||||
</select>
|
||||
feed as
|
||||
<select>
|
||||
<option value="">RSS</option>
|
||||
<option value=":format=json:cors">JSON</option>
|
||||
<option value=":format=html">HTML</option>
|
||||
<option value=":format=csv">CSV</option>
|
||||
<option value=":json:cors">JSON</option>
|
||||
<option value=":html">HTML</option>
|
||||
<option value=":csv">CSV</option>
|
||||
</select>
|
||||
using the
|
||||
<select>
|
||||
<option value="">standard</option>
|
||||
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
|
||||
</select>
|
||||
link of the
|
||||
<select>
|
||||
<option value="">first</option>
|
||||
<option value=":order=newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
|
||||
<option value=":order=last">last</option>
|
||||
<option value=":order=oldest">oldest</option>
|
||||
</select>
|
||||
items and
|
||||
and
|
||||
<select>
|
||||
<option value="">keep</option>
|
||||
<option value=":nolink:noref">remove</option>
|
||||
@@ -217,11 +192,10 @@
|
||||
<input type="hidden" value="" name="extra_options"/>
|
||||
</form>
|
||||
|
||||
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p>
|
||||
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
|
||||
<p>Click <a href="/">here</a> to go back to morss</p>
|
||||
</header>
|
||||
|
||||
<div id="header" dir="auto">
|
||||
<div id="header">
|
||||
<h1>
|
||||
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
|
||||
</h1>
|
||||
@@ -233,8 +207,8 @@
|
||||
|
||||
<div id="content">
|
||||
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
|
||||
<div class="item" dir="auto">
|
||||
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
|
||||
<div class="item">
|
||||
<a href="/" target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
|
||||
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
|
||||
</a>
|
||||
|
||||
@@ -250,12 +224,11 @@
|
||||
</div>
|
||||
|
||||
<script>
|
||||
//<![CDATA[
|
||||
document.getElementById("url").value = window.location.href
|
||||
|
||||
if (!/:html/.test(window.location.href))
|
||||
for (var content of document.querySelectorAll(".desc,.content"))
|
||||
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
|
||||
content.innerHTML = (content.innerText.match(/>/g) || []).length > 10 ? content.innerText : content.innerHTML
|
||||
|
||||
var options = parse_location()[0]
|
||||
|
||||
@@ -263,11 +236,12 @@
|
||||
for (var select of document.forms[0].elements)
|
||||
if (select.tagName == 'SELECT')
|
||||
for (var option of select)
|
||||
if (option.value && options.match(option.value)) {
|
||||
select.value = option.value
|
||||
options = options.replace(option.value, '')
|
||||
break
|
||||
}
|
||||
if (option.value)
|
||||
if (options.match(option.value)) {
|
||||
select.value = option.value
|
||||
options = options.replace(option.value, '')
|
||||
break
|
||||
}
|
||||
|
||||
document.forms[0]['extra_options'].value = options
|
||||
}
|
||||
@@ -276,7 +250,6 @@
|
||||
input.focus()
|
||||
input.select()
|
||||
document.execCommand('copy')
|
||||
input.blur()
|
||||
}
|
||||
|
||||
function copy_link() {
|
||||
@@ -296,7 +269,6 @@
|
||||
if (target != window.location.pathname)
|
||||
window.location.href = target
|
||||
}
|
||||
//]]>
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|