Compare commits
251 Commits
v1.1...0365232a73
Author | SHA1 | Date | |
---|---|---|---|
0365232a73 | |||
a523518ae8 | |||
52c48b899f | |||
9649cabb1b | |||
0c29102788 | |||
10535a17c5 | |||
7d86972e58 | |||
62e04549ac | |||
5da7121a77 | |||
bb82902ad1 | |||
04afa28fe7 | |||
75bb69f0fd | |||
97d9dda547 | |||
0c31d9f6db | |||
49e29208ef | |||
d8d608a4de | |||
5437e40a15 | |||
6c1f8da692 | |||
a1a26d8209 | |||
edbb580f33 | |||
4fd730b983 | |||
198353d6b9 | |||
0b3e6d7749 | |||
06e0ada95b | |||
71d9c7a027 | |||
37f5a92b05 | |||
24c26d3850 | |||
8f24214915 | |||
d5942fe5a7 | |||
6f50443995 | |||
5582fbef31 | |||
da5442a1dc | |||
f9d7794bcc | |||
e37c8346d0 | |||
3a1d564992 | |||
6880a443e0 | |||
7342ab26d2 | |||
981da9e66a | |||
6ea9d012a2 | |||
95d6143636 | |||
03cad120d0 | |||
01a7667032 | |||
3e886caaab | |||
ad927e03a7 | |||
0efb096fa7 | |||
9ab2e488ef | |||
b525ab0d26 | |||
fb19b1241f | |||
9d062ef24b | |||
447f62dc45 | |||
18ec10fe44 | |||
891c385b69 | |||
0629bb98fb | |||
ae7ba458ce | |||
bd0bca69fc | |||
8abd951d40 | |||
2514fabd38 | |||
8cb7002fe6 | |||
6966e03bef | |||
03a122c41f | |||
5cd6c22d73 | |||
e1b41b5f64 | |||
9ce6acba20 | |||
6192ff4081 | |||
056a1b143f | |||
eed949736a | |||
2fc7cd391c | |||
d9f46b23a6 | |||
bbada0436a | |||
039a672f4e | |||
b290568e14 | |||
9ecf856f10 | |||
504ede624d | |||
0d89f0e6f2 | |||
56e0c2391d | |||
679f406a12 | |||
f6d641eeef | |||
2456dd9bbc | |||
0f33db248a | |||
d57f543c7b | |||
fba112147c | |||
8697c3f0df | |||
75935114e4 | |||
5bd2557619 | |||
598a2591f1 | |||
e76ab2b631 | |||
aa9143302b | |||
0d62a7625b | |||
bd0efb1529 | |||
47a17614ef | |||
4dfebe78f7 | |||
dcd3e4a675 | |||
e968b2ea7f | |||
0ac590c798 | |||
fa1b5aef09 | |||
7f6309f618 | |||
f65fb45030 | |||
6dd40e5cc4 | |||
0acfce5a22 | |||
97ccc15db0 | |||
7a560181f7 | |||
baccd3b22b | |||
f79938ab11 | |||
5b8bd47829 | |||
b5b355aa6e | |||
94097f481a | |||
8161baa7ae | |||
bd182bcb85 | |||
c7c2c5d749 | |||
c6b52e625f | |||
c6d3a0eb53 | |||
c628ee802c | |||
6021b912ff | |||
f18a128ee6 | |||
64af86c11e | |||
15951d228c | |||
c1b1f5f58a | |||
985185f47f | |||
3190d1ec5a | |||
9815794a97 | |||
758b6861b9 | |||
ce4cf01aa6 | |||
dcfdb75a15 | |||
4ccc0dafcd | |||
2fe3e0b8ee | |||
ad3ba9de1a | |||
68c46a1823 | |||
91be2d229e | |||
038f267ea2 | |||
22005065e8 | |||
7d0d416610 | |||
5dac4c69a1 | |||
36e2a1c3fd | |||
83dd2925d3 | |||
e09d0abf54 | |||
ff26a560cb | |||
74d7a1eca2 | |||
eba295cba8 | |||
f27631954e | |||
c74abfa2f4 | |||
1d5272c299 | |||
f685139137 | |||
73b477665e | |||
b425992783 | |||
271ac8f80f | |||
64e41b807d | |||
a2c4691090 | |||
b6000923bc | |||
27a42c47aa | |||
c27c38f7c7 | |||
a1dc96cb50 | |||
749acc87fc | |||
c186188557 | |||
cb69e3167f | |||
c3f06da947 | |||
44a3e0edc4 | |||
4a9b505499 | |||
818cdaaa9b | |||
2806c64326 | |||
d39d7bb19d | |||
e5e3746fc6 | |||
960c9d10d6 | |||
0e7a5b9780 | |||
186bedcf62 | |||
5847e18e42 | |||
f6bc23927f | |||
c86572374e | |||
59ef5af9e2 | |||
6a0531ca03 | |||
8187876a06 | |||
325a373e3e | |||
2719bd6776 | |||
285e1e5f42 | |||
41a63900c2 | |||
ec8edb02f1 | |||
d01b943597 | |||
b361aa2867 | |||
4ce3c7cb32 | |||
7e45b2611d | |||
036e5190f1 | |||
e99c5b3b71 | |||
4f44df8d63 | |||
497c14db81 | |||
a4e1dba8b7 | |||
7375adce33 | |||
663212de0a | |||
4a2ea1bce9 | |||
fe82b19c91 | |||
0b31e97492 | |||
b0ad7c259d | |||
bffb23f884 | |||
59139272fd | |||
39b0a1d7cc | |||
65803b328d | |||
e6b7c0eb33 | |||
67c096ad5b | |||
f018437544 | |||
8e5e8d24a4 | |||
ee78a7875a | |||
9e7b9d95ee | |||
987a719c4e | |||
47b33f4baa | |||
3c7f512583 | |||
a32f5a8536 | |||
63a06524b7 | |||
b0f80c6d3c | |||
78cea10ead | |||
e5a82ff1f4 | |||
f3d1f92b39 | |||
7691df5257 | |||
0ae0dbc175 | |||
f1d0431e68 | |||
a09831415f | |||
bfad6b7a4a | |||
6b8c3e51e7 | |||
dc9e425247 | |||
2f48e18bb1 | |||
31cac921c7 | |||
a82ec96eb7 | |||
aad2398e69 | |||
eeac630855 | |||
e136b0feb2 | |||
6cf32af6c0 | |||
568e7d7dd2 | |||
3617f86e9d | |||
d90756b337 | |||
40c69f17d2 | |||
99461ea185 | |||
bf86c1e962 | |||
d20f6237bd | |||
8a4d68d72c | |||
e6811138fd | |||
35b702fffd | |||
4a88886767 | |||
1653394cf7 | |||
a8a90cf414 | |||
bdbaf0f8a7 | |||
d0e447a2a6 | |||
e6817e01b4 | |||
7c3091d64c | |||
37b4e144a9 | |||
bd4b7b5bb2 | |||
68d920d4b5 | |||
758ff404a8 | |||
463530f02c | |||
ec0a28a91d | |||
421acb439d | |||
42c5d09ccb | |||
056de12484 | |||
961a31141f | |||
a7b01ee85e |
15
.drone.yml
Normal file
@@ -0,0 +1,15 @@
kind: pipeline
name: default

steps:
- name: isort
  image: python:alpine
  commands:
    - pip install isort
    - isort --check-only --diff .
- name: pylint
  image: alpine
  commands:
    - apk add --no-cache python3 py3-lxml py3-pip py3-wheel py3-pylint py3-enchant hunspell-en
    - pip3 install --no-cache-dir .[full]
    - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
50
.pylintrc
Normal file
@@ -0,0 +1,50 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree

[MESSAGES CONTROL]
disable=missing-function-docstring,
        missing-class-docstring,
        missing-module-docstring,
        wrong-spelling-in-comment,

[REPORTS]
reports=yes
score=yes

[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss

[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes

[VARIABLES]
allow-global-unused-variables=no
init-import=no

[FORMAT]
expected-line-ending-format=LF
indent-string='    '
max-line-length=120
max-module-lines=1000

[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case

include-naming-hint=yes

bad-names=foo, bar
good-names=i, j, k
8
Dockerfile
Normal file
@@ -0,0 +1,8 @@
FROM alpine:latest

RUN apk add --no-cache python3 py3-lxml py3-pip py3-wheel git

ADD . /app
RUN pip3 install --no-cache-dir /app[full] gunicorn

CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload --access-logfile - morss
360
README.md
@@ -1,6 +1,9 @@
|
||||
# Morss - Get full-text RSS feeds
|
||||
|
||||
_GNU AGPLv3 code_
|
||||
[](https://ci.pictuga.com/pictuga/morss)
|
||||
|
||||
_GNU AGPLv3 code_
|
||||
_Provided logo is CC BY-NC-SA 4.0_
|
||||
|
||||
Upstream source code: https://git.pictuga.com/pictuga/morss
|
||||
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
|
||||
@@ -18,21 +21,20 @@ Morss also provides additional features, such as: .csv and json export, extended
|
||||
control over output. A strength of morss is its ability to deal with broken
|
||||
feeds, and to replace tracking links with direct links to the actual content.
|
||||
|
||||
Morss can also generate feeds from html and json files (see `feedify.py`), which
|
||||
Morss can also generate feeds from html and json files (see `feeds.py`), which
|
||||
for instance makes it possible to get feeds for Facebook or Twitter, using
|
||||
hand-written rules (ie. there's no automatic detection of links to build feeds).
|
||||
Please mind that feeds based on html files may stop working unexpectedly, due to
|
||||
html structure changes on the target website.
|
||||
|
||||
Additionally morss can grab the source xml feed of iTunes podcast, and detect
|
||||
rss feeds in html pages' `<meta>`.
|
||||
Additionally morss can detect rss feeds in html pages' `<meta>`.
|
||||
|
||||
You can use this program online for free at **[morss.it](https://morss.it/)**.
|
||||
|
||||
Some features of morss:
|
||||
|
||||
- Read RSS/Atom feeds
|
||||
- Create RSS feeds from json/html pages
|
||||
- Convert iTunes podcast links into xml links
|
||||
- Export feeds as RSS/JSON/CSV/HTML
|
||||
- Fetch full-text content of feed items
|
||||
- Follow 301/meta redirects
|
||||
@@ -42,76 +44,115 @@ Some features of morss:
|
||||
- Works as server/cli tool
|
||||
- Deobfuscate various tracking links
|
||||
|
||||
## Dependencies
|
||||
## Install
|
||||
|
||||
You do need:
|
||||
### Python package
|
||||
|
||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
||||
- [lxml](http://lxml.de/) for xml parsing
|
||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
||||
- [chardet](https://pypi.python.org/pypi/chardet)
|
||||
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
|
||||
- pymysql
|
||||
|
||||
Simplest way to get these:
|
||||
Simple install (without optional dependencies)
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git
|
||||
```
|
||||
|
||||
You may also need:
|
||||
Full installation (including optional dependencies)
|
||||
|
||||
- Apache, with python-cgi support, to run on a server
|
||||
- a fast internet connection
|
||||
```shell
|
||||
pip install git+https://git.pictuga.com/pictuga/morss.git#[full]
|
||||
```
|
||||
|
||||
## Arguments
|
||||
The full install includes mysql and redis (possible cache backends). Otherwise,
|
||||
only in-memory and sqlite3 caches are available.
|
||||
|
||||
morss accepts some arguments, to lightly alter the output of morss. Arguments
|
||||
may need to have a value (usually a string or a number). In the different "Use
|
||||
cases" below is detailed how to pass those arguments to morss.
|
||||
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
|
||||
C code needs to be compiled). If possible on your distribution, try installing
|
||||
it with the system package manager.
|
||||
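For example, a pre-built lxml package is usually available from the system package manager (a sketch; exact package names vary by distribution):

```shell
# Debian/Ubuntu
apt-get install python3-lxml

# Alpine (the same package the provided Dockerfile uses)
apk add py3-lxml
```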
|
||||
The arguments are:
|
||||
### Docker
|
||||
|
||||
- Change what morss does
|
||||
- `json`: output as JSON
|
||||
- `proxy`: doesn't fill the articles
|
||||
- `clip`: stick the full article content under the original feed content (useful for twitter)
|
||||
- `keep`: by default, morss drops the feed description whenever the full content is found (so as not to mislead users of Firefox, which only shows the description in the feed preview and might lead them to believe morss doesn't work); with this argument, the description is kept
|
||||
- `search=STRING`: does a basic case-sensitive search in the feed
|
||||
- Advanced
|
||||
- `csv`: export to csv
|
||||
- `indent`: returns indented XML or JSON, takes more space, but is human-readable
|
||||
- `nolink`: drop links, but keeps links' inner text
|
||||
- `noref`: drop items' link
|
||||
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
|
||||
- `debug`: to have some feedback from the script execution. Useful for debugging
|
||||
- `mono`: disable multithreading while fetching, makes debugging easier
|
||||
- `theforce`: force download the rss feed and ignore cached http errors
|
||||
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
|
||||
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
|
||||
- http server only
|
||||
- `callback=NAME`: for JSONP calls
|
||||
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
|
||||
- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
|
||||
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
|
||||
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
|
||||
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
|
||||
- `item_link`: xpath rule relative to `items` to point to the entry's link
|
||||
- `item_title`: entry's title
|
||||
- `item_content`: entry's description
|
||||
- `item_time`: entry's date & time (accepts a wide range of time formats)
|
||||
Build & run
|
||||
|
||||
## Use cases
|
||||
```shell
|
||||
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
|
||||
docker run -p 8080:8080 morss
|
||||
```
|
||||
|
||||
With docker-compose:
|
||||
|
||||
```yml
|
||||
services:
|
||||
app:
|
||||
build: https://git.pictuga.com/pictuga/morss.git
|
||||
image: morss
|
||||
ports:
|
||||
- '8080:8080'
|
||||
```
|
||||
|
||||
Then execute
|
||||
|
||||
```shell
|
||||
docker-compose build --no-cache --pull
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
morss will auto-detect what "mode" to use.
|
||||
|
||||
### Running on a server
|
||||
### Running on/as a server
|
||||
|
||||
Set up the server as indicated below, then visit:
|
||||
|
||||
```
|
||||
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
|
||||
```
|
||||
|
||||
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
The `main.py` part is only needed if your server doesn't support the Apache
|
||||
redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
|
||||
other clients.
|
||||
|
||||
|
||||
#### Via Docker
|
||||
|
||||
See above (in Install)
|
||||
|
||||
#### Using Gunicorn
|
||||
|
||||
```shell
|
||||
gunicorn --preload morss
|
||||
```
|
||||
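For a more production-like setup, the same invocation used in the provided Dockerfile binds to a port and starts several workers:

```shell
gunicorn --bind 0.0.0.0:8080 -w 4 --preload --access-logfile - morss
```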
|
||||
#### Using uWSGI
|
||||
|
||||
Running this command should do:
|
||||
|
||||
```shell
|
||||
uwsgi --http :8080 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
|
||||
The latter should start when you run morss without any argument, on port 8080.
|
||||
I'd highly recommend using gunicorn or something similar for better
|
||||
performance.
|
||||
|
||||
```shell
|
||||
morss
|
||||
```
|
||||
|
||||
You can change the port using environment variables like this `PORT=9000 morss`.
|
||||
|
||||
#### Via mod_cgi/FastCGI with Apache/nginx
|
||||
|
||||
For this, you'll want to rearrange the file layout a bit, for example
|
||||
into something like this.
|
||||
|
||||
|
||||
```
|
||||
/
|
||||
├── cgi
|
||||
@@ -138,47 +179,15 @@ method uses HTTP calls to fetch the RSS feeds, which will be handled through
|
||||
Please pay attention to `main.py` permissions for it to be executable. Also
|
||||
ensure that the provided `/www/.htaccess` works well with your server.
|
||||
|
||||
#### Using uWSGI
|
||||
|
||||
Running this command should do:
|
||||
|
||||
```shell
|
||||
uwsgi --http :9090 --plugin python --wsgi-file main.py
|
||||
```
|
||||
|
||||
However, one problem might be how to serve the provided `index.html` file if it
|
||||
isn't in the same directory. Therefore you can add this at the end of the
|
||||
command to point to another directory `--pyargv '--root ../../www/'`.
|
||||
|
||||
|
||||
#### Using morss' internal HTTP server
|
||||
|
||||
Morss can run its own HTTP server. The latter should start when you run morss
|
||||
without any argument, on port 8080.
|
||||
|
||||
You can change the port and the location of the `www/` folder like this `python -m morss 9000 --root ../../www`.
|
||||
|
||||
#### Passing arguments
|
||||
|
||||
Then visit:
|
||||
```
|
||||
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
|
||||
```
|
||||
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
The `main.py` part is only needed if your server doesn't support the Apache redirect rule set in the provided `.htaccess`.
|
||||
|
||||
Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other clients.
|
||||
|
||||
### As a CLI application
|
||||
|
||||
Run:
|
||||
|
||||
```
|
||||
python[2.7] -m morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
For example: `python -m morss debug http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
@@ -189,17 +198,21 @@ To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
|
||||
scripts can be run on top of the RSS feed, using its
|
||||
[output](http://lzone.de/liferea/scraping.htm) as an RSS feed.
|
||||
|
||||
To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
|
||||
To use this script, you have to enable "(Unix) command" in liferea feed
|
||||
settings, and use the command:
|
||||
|
||||
```
|
||||
[python[2.7]] PATH/TO/MORSS/main.py [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
|
||||
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
|
||||
```
|
||||
For example: `python2.7 PATH/TO/MORSS/main.py http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
|
||||
|
||||
*(Brackets indicate optional text)*
|
||||
|
||||
### As a python library
|
||||
|
||||
Quickly get a full-text feed:
|
||||
|
||||
```python
|
||||
>>> import morss
|
||||
>>> xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
|
||||
@@ -208,6 +221,7 @@ Quickly get a full-text feed:
|
||||
```
|
||||
|
||||
Using cache and passing arguments:
|
||||
|
||||
```python
|
||||
>>> import morss
|
||||
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
|
||||
@@ -223,6 +237,7 @@ possible to call the simpler functions, to have more control on what's happening
|
||||
under the hood.
|
||||
|
||||
Doing it step-by-step:
|
||||
|
||||
```python
|
||||
import morss, morss.crawler
|
||||
|
||||
@@ -230,46 +245,152 @@ url = 'http://newspaper.example/feed.xml'
|
||||
options = morss.Options(csv=True) # arguments
|
||||
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
|
||||
|
||||
rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
|
||||
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
|
||||
|
||||
output = morss.Format(rss, options) # formats final feed
|
||||
output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
|
||||
```
|
||||
|
||||
## Cache information
|
||||
## Arguments and settings
|
||||
|
||||
morss uses caching to make loading faster. There are 2 possible cache backends
|
||||
(visible in `morss/crawler.py`):
|
||||
### Arguments
|
||||
|
||||
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
|
||||
be cleared every time the program is run
|
||||
- `MySQLCacheHandler`: /!\ Does NOT support multi-threading
|
||||
morss accepts some arguments, to lightly alter the output of morss. Arguments
|
||||
may need to have a value (usually a string or a number). How to pass those
|
||||
arguments to morss is explained in Run above.
|
||||
|
||||
## Configuration
|
||||
### Length limitation
|
||||
The list of arguments can be obtained by running `morss --help`
|
||||
|
||||
```
|
||||
usage: morss [-h] [--post STRING] [--xpath XPATH]
|
||||
[--format {rss,json,html,csv}] [--search STRING] [--clip]
|
||||
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
|
||||
[--resolve] [--items XPATH] [--item_link XPATH]
|
||||
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
|
||||
[--nolink] [--noref] [--silent]
|
||||
url
|
||||
|
||||
Get full-text RSS feeds
|
||||
|
||||
positional arguments:
|
||||
url feed url
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--post STRING POST request
|
||||
--xpath XPATH xpath rule to manually detect the article
|
||||
|
||||
output:
|
||||
--format {rss,json,html,csv}
|
||||
output format
|
||||
--search STRING does a basic case-sensitive search in the feed
|
||||
--clip stick the full article content under the original feed
|
||||
content (useful for twitter)
|
||||
--indent returns indented XML or JSON, takes more space, but
|
||||
human-readable
|
||||
|
||||
action:
|
||||
--cache only take articles from the cache (ie. don't grab new
|
||||
articles' content), so as to save time
|
||||
--force force refetch the rss feed and articles
|
||||
--proxy doesn't fill the articles
|
||||
--newest return the feed items in chronological order (morss
|
||||
otherwise shows the items in order of appearance)
|
||||
--firstlink pull the first article mentioned in the description
|
||||
instead of the default link
|
||||
--resolve replace tracking links with direct links to articles
|
||||
(not compatible with --proxy)
|
||||
|
||||
custom feeds:
|
||||
--items XPATH (mandatory to activate the custom feeds function)
|
||||
xpath rule to match all the RSS entries
|
||||
--item_link XPATH xpath rule relative to items to point to the entry's
|
||||
link
|
||||
--item_title XPATH entry's title
|
||||
--item_content XPATH entry's content
|
||||
--item_time XPATH entry's date & time (accepts a wide range of time
|
||||
formats)
|
||||
|
||||
misc:
|
||||
--nolink drop links, but keeps links' inner text
|
||||
--noref drop items' link
|
||||
--silent don't output the final RSS (useless on its own, but
|
||||
can be nice when debugging)
|
||||
|
||||
GNU AGPLv3 code
|
||||
```
|
||||
|
||||
Further HTTP-only options:
|
||||
|
||||
- `callback=NAME`: for JSONP calls
|
||||
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other
|
||||
servers)
|
||||
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
|
||||
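For example, combining these with the URL pattern shown in Run above (hypothetical host):

```
http://morss.example/:cors:txt/https://twitter.com/pictuga
```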
|
||||
### Environment variables
|
||||
|
||||
To pass environment variables:
|
||||
|
||||
- Docker-cli: `docker run -p 8080:8080 --env KEY=value morss`
|
||||
- docker-compose: add an `environment:` section in the .yml file
|
||||
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
|
||||
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
|
||||
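For instance, with Gunicorn the variables are simply prepended to the command (a sketch using variables described below; values are illustrative):

```shell
DEBUG=1 TIMEOUT=5 gunicorn --preload morss
```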
|
||||
Generic:
|
||||
|
||||
- `DEBUG=1`: to have some feedback from the script execution. Useful for
|
||||
debugging.
|
||||
- `IGNORE_SSL=1`: to ignore SSL certs when fetching feeds and articles
|
||||
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
|
||||
|
||||
When parsing long feeds, with a lot of items (100+), morss might take a lot of
|
||||
time to parse it, or might even run into a memory overflow on some shared
|
||||
hosting plans (limits around 10Mb), in which case you might want to adjust the
|
||||
different values at the top of the script.
|
||||
below settings via environment variables.
|
||||
|
||||
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
|
||||
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the settings below.
|
||||
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether or not it's already cached). Articles beyond that limit will be dropped from the feed. `-1` for unlimited.
|
||||
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the number of articles fetched and taken from cache. Articles beyond that limit will be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
Also, if the request takes too long to process, the http request might be
|
||||
discarded. See relevant config for
|
||||
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
|
||||
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
|
||||
|
||||
### Other settings
|
||||
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
|
||||
articles, more time might be spent taking older articles from cache. `-1` for
|
||||
unlimited.
|
||||
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
|
||||
More articles will be taken from cache following the settings below.
|
||||
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
|
||||
(whether or not it's already cached). Articles beyond that limit will be dropped
|
||||
from the feed. `-1` for unlimited.
|
||||
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the
|
||||
number of articles fetched and taken from cache. Articles beyond that limit will
|
||||
be dropped from the feed, even if they're cached. `-1` for unlimited.
|
||||
|
||||
- `DELAY` sets the browser cache delay, only for HTTP clients
|
||||
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
|
||||
- `THREADS` sets the number of threads to use. `1` makes no use of multithreading.
|
||||
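As an illustration, the limits above are passed like any other environment variable (the values here are arbitrary):

```shell
MAX_ITEM=50 LIM_ITEM=200 THREADS=4 morss http://feeds.bbci.co.uk/news/rss.xml
```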
morss uses caching to make loading faster. There are 3 possible cache backends:
|
||||
|
||||
- `(nothing/default)`: a simple python in-memory dict-like object.
|
||||
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
|
||||
will be cleared every time the program is run). Path can be defined with
|
||||
`SQLITE_PATH`.
|
||||
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
|
||||
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
|
||||
- `CACHE=redis`: Redis cache. Connection can be defined with the following
|
||||
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
|
||||
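For example, to use the sqlite backend with a file on disk (the path is illustrative, matching the one used in the Python example above):

```shell
CACHE=sqlite SQLITE_PATH=/tmp/morss-cache.db gunicorn --preload morss
```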
|
||||
To limit the size of the cache:
|
||||
|
||||
- `CACHE_SIZE` sets the target number of items in the cache (further items will
|
||||
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
|
||||
entries.
|
||||
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
|
||||
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
|
||||
|
||||
### Content matching
|
||||
|
||||
The content of articles is grabbed with our own readability fork. This means
|
||||
that most of the time the right content is matched. However sometimes it fails,
|
||||
therefore some tweaking is required. Most of the time, what has to be done is to
|
||||
add some "rules" in the main script file in *readability* (not in morss).
|
||||
add some "rules" in the main script file in `readabilite.py` (not in morss).
|
||||
|
||||
Most of the time, when almost nothing is matched, it means that the main content
|
||||
of the article is made of images, videos, pictures, etc., which readability
|
||||
@@ -280,14 +401,3 @@ morss will also try to figure out whether the full content is already in place
|
||||
(for those websites which understood the whole point of RSS feeds). However this
|
||||
detection is very simple, and only works if the actual content is put in the
|
||||
"content" section in the feed and not in the "summary" section.
|
||||
|
||||
***
|
||||
|
||||
## Todo
|
||||
|
||||
You can contribute to this project. If you're not sure what to do, you can pick
|
||||
from this list:
|
||||
|
||||
- Add ability to run morss.py as an update daemon
|
||||
- Add ability to use custom xpath rule instead of readability
|
||||
- More ideas here <https://github.com/pictuga/morss/issues/15>
|
||||
|
20
main.py
@@ -1,6 +1,24 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from morss import main, cgi_wrapper as application
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
from morss.__main__ import main
|
||||
from morss.wsgi import application
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,2 +1,23 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
# ran on `import morss`
|
||||
|
||||
# pylint: disable=unused-import,unused-variable
|
||||
|
||||
from .morss import *
|
||||
from .wsgi import application
|
||||
|
@@ -1,5 +1,48 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
# ran on `python -m morss`
|
||||
from .morss import main
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from . import cli, wsgi
|
||||
from .morss import MorssException
|
||||
|
||||
|
||||
def main():
|
||||
if 'REQUEST_URI' in os.environ:
|
||||
# mod_cgi (w/o file handler)
|
||||
wsgi.cgi_handle_request()
|
||||
|
||||
elif len(sys.argv) <= 1:
|
||||
# start internal (basic) http server (w/ file handler)
|
||||
wsgi.cgi_start_server()
|
||||
|
||||
else:
|
||||
# as a CLI app
|
||||
try:
|
||||
cli.cli_app()
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
print('ERROR: %s' % e.message)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
188
morss/caching.py
Normal file
@@ -0,0 +1,188 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import threading
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
|
||||
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
|
||||
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
|
||||
|
||||
|
||||
class BaseCache:
|
||||
""" Subclasses must behave like a dict """
|
||||
|
||||
def trim(self):
|
||||
pass
|
||||
|
||||
def autotrim(self, delay=CACHE_LIFESPAN):
|
||||
# trim the cache every so often
|
||||
|
||||
self.trim()
|
||||
|
||||
t = threading.Timer(delay, self.autotrim)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
try:
|
||||
import sqlite3 # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class SQLiteCache(BaseCache):
|
||||
def __init__(self, filename=':memory:'):
|
||||
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
|
||||
self.con.execute('pragma journal_mode=WAL')
|
||||
|
||||
self.trim()
|
||||
|
||||
def __del__(self):
|
||||
self.con.close()
|
||||
|
||||
def trim(self):
|
||||
with self.con:
|
||||
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, key):
|
||||
row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1]
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
with self.con:
|
||||
self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))
|
||||
|
||||
|
||||
try:
|
||||
import pymysql.cursors # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class MySQLCacheHandler(BaseCache):
|
||||
def __init__(self, user, password, database, host='localhost'):
|
||||
self.user = user
|
||||
self.password = password
|
||||
self.database = database
|
||||
self.host = host
|
||||
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')
|
||||
|
||||
self.trim()
|
||||
|
||||
def cursor(self):
|
||||
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
|
||||
|
||||
def trim(self):
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
|
||||
|
||||
def __getitem__(self, key):
|
||||
cursor = self.cursor()
|
||||
cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1]
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
with self.cursor() as cursor:
|
||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
|
||||
(key, data, time.time(), data, time.time()))
|
||||
|
||||
|
||||
class CappedDict(OrderedDict, BaseCache):
|
||||
def trim(self):
|
||||
if CACHE_SIZE >= 0:
|
||||
for i in range( max( len(self) - CACHE_SIZE , 0 )):
|
||||
self.popitem(False)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
|
||||
if key in self:
|
||||
del self[key]
|
||||
OrderedDict.__setitem__(self, key, data)
|
||||
|
||||
|
||||
try:
|
||||
import redis # isort:skip
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class RedisCacheHandler(BaseCache):
|
||||
def __init__(self, host='localhost', port=6379, db=0, password=None):
|
||||
self.r = redis.Redis(host=host, port=port, db=db, password=password)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.r.get(key)
|
||||
|
||||
def __setitem__(self, key, data):
|
||||
self.r.set(key, data)
|
||||
|
||||
|
||||
if 'CACHE' in os.environ:
|
||||
if os.environ['CACHE'] == 'mysql':
|
||||
default_cache = MySQLCacheHandler(
|
||||
user = os.getenv('MYSQL_USER'),
|
||||
password = os.getenv('MYSQL_PWD'),
|
||||
database = os.getenv('MYSQL_DB'),
|
||||
host = os.getenv('MYSQL_HOST', 'localhost')
|
||||
)
|
||||
|
||||
elif os.environ['CACHE'] == 'sqlite':
|
||||
if 'SQLITE_PATH' in os.environ:
|
||||
path = os.getenv('SQLITE_PATH')
|
||||
|
||||
else:
|
||||
path = ':memory:'
|
||||
|
||||
default_cache = SQLiteCache(path)
|
||||
|
||||
elif os.environ['CACHE'] == 'redis':
|
||||
default_cache = RedisCacheHandler(
|
||||
host = os.getenv('REDIS_HOST', 'localhost'),
|
||||
port = int(os.getenv('REDIS_PORT', 6379)),
|
||||
db = int(os.getenv('REDIS_DB', 0)),
|
||||
password = os.getenv('REDIS_PWD', None)
|
||||
)
|
||||
|
||||
else:
|
||||
default_cache = CappedDict()
|
71
morss/cli.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
from .morss import FeedFetch, FeedFormat, FeedGather, Options
|
||||
|
||||
|
||||
def cli_app():
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='morss',
|
||||
description='Get full-text RSS feeds',
|
||||
epilog='GNU AGPLv3 code'
|
||||
)
|
||||
|
||||
parser.add_argument('url', help='feed url')
|
||||
|
||||
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
|
||||
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
|
||||
|
||||
group = parser.add_argument_group('output')
|
||||
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
|
||||
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
|
||||
group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
|
||||
group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more space, but human-readable')
|
||||
|
||||
group = parser.add_argument_group('action')
|
||||
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
|
||||
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
|
||||
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
|
||||
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss otherwise shows the items in order of appearance)')
|
||||
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
|
||||
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
|
||||
|
||||
group = parser.add_argument_group('custom feeds')
|
||||
group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
|
||||
group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
|
||||
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
|
||||
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
|
||||
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
|
||||
|
||||
group = parser.add_argument_group('misc')
|
||||
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
|
||||
group.add_argument('--noref', action='store_true', help='drop items\' link')
|
||||
group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')
|
||||
|
||||
options = Options(vars(parser.parse_args()))
|
||||
url = options.url
|
||||
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options, 'unicode')
|
||||
|
||||
if not options.silent:
|
||||
print(out)
|
695
morss/crawler.py
@@ -1,21 +1,51 @@
|
||||
import sys
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import zlib
|
||||
from io import BytesIO, StringIO
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
import chardet
|
||||
from cgi import parse_header
|
||||
import lxml.html
|
||||
import sys
|
||||
import time
|
||||
import zlib
|
||||
from cgi import parse_header
|
||||
from collections import OrderedDict
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import chardet
|
||||
|
||||
from .caching import default_cache
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
from urllib import quote
|
||||
|
||||
import mimetools
|
||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||
Request, addinfourl, build_opener, parse_http_list,
|
||||
parse_keqv_list)
|
||||
from urlparse import urlparse, urlunparse
|
||||
except ImportError:
|
||||
# python 3
|
||||
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
|
||||
import email
|
||||
from urllib.parse import quote, urlparse, urlunparse
|
||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||
HTTPRedirectHandler, Request, addinfourl,
|
||||
build_opener, parse_http_list, parse_keqv_list)
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -27,13 +57,59 @@ except NameError:
|
||||
|
||||
MIMETYPE = {
|
||||
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
|
||||
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
|
||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||
|
||||
|
||||
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
|
||||
DEFAULT_UAS = [
|
||||
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
|
||||
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
|
||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
|
||||
]
|
||||
|
||||
|
||||
def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
|
||||
PROTOCOL = ['http', 'https']
|
||||
|
||||
|
||||
def get(*args, **kwargs):
|
||||
return adv_get(*args, **kwargs)['data']
|
||||
|
||||
|
||||
def adv_get(url, post=None, timeout=None, *args, **kwargs):
|
||||
url = sanitize_url(url)
|
||||
|
||||
if post is not None:
|
||||
post = post.encode('utf-8')
|
||||
|
||||
if timeout is None:
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post)
|
||||
|
||||
else:
|
||||
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
|
||||
|
||||
data = con.read()
|
||||
|
||||
contenttype = con.info().get('Content-Type', '').split(';')[0]
|
||||
encoding= detect_encoding(data, con)
|
||||
|
||||
return {
|
||||
'data':data,
|
||||
'url': con.geturl(),
|
||||
'con': con,
|
||||
'contenttype': contenttype,
|
||||
'encoding': encoding
|
||||
}
|
||||
|
||||
|
||||
def custom_opener(follow=None, delay=None):
|
||||
handlers = []
|
||||
|
||||
# as per urllib2 source code, these Handlers are added first
|
||||
@@ -43,28 +119,122 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
|
||||
# HTTPDefaultErrorHandler, HTTPRedirectHandler,
|
||||
# FTPHandler, FileHandler, HTTPErrorProcessor]
|
||||
# & HTTPSHandler
|
||||
#
|
||||
# when processing a request:
|
||||
# (1) all the *_request are run
|
||||
# (2) the *_open are run until sth is returned (other than None)
|
||||
# (3) all the *_response are run
|
||||
#
|
||||
# During (3), if an http error occurs (i.e. not a 2XX response code), the
|
||||
# http_error_* are run until sth is returned (other than None). If they all
|
||||
# return nothing, a python error is raised
|
||||
|
||||
#handlers.append(DebugHandler())
|
||||
handlers.append(SizeLimitHandler(100*1024)) # 100KiB
|
||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||
handlers.append(HTTPCookieProcessor())
|
||||
handlers.append(GZIPHandler())
|
||||
handlers.append(HTTPAllRedirectHandler())
|
||||
handlers.append(HTTPEquivHandler())
|
||||
handlers.append(HTTPRefreshHandler())
|
||||
handlers.append(UAHandler(DEFAULT_UA))
|
||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||
handlers.append(BrowserlyHeaderHandler())
|
||||
handlers.append(EncodingFixHandler())
|
||||
|
||||
if not basic:
|
||||
handlers.append(AutoRefererHandler())
|
||||
|
||||
handlers.append(EncodingFixHandler(encoding))
|
||||
|
||||
if accept:
|
||||
handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
|
||||
if follow:
|
||||
handlers.append(AlternateHandler(MIMETYPE[follow]))
|
||||
|
||||
handlers.append(CacheHandler(force_min=delay))
|
||||
|
||||
return build_opener(*handlers)
|
||||
|
||||
|
||||
def is_ascii(string):
|
||||
# there's a native function in py3, but home-made fix for backward compatibility
|
||||
try:
|
||||
string.encode('ascii')
|
||||
|
||||
except UnicodeError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def sanitize_url(url):
|
||||
# make sure the url is unicode, i.e. not bytes
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode()
|
||||
|
||||
# make sure there's a protocol (http://)
|
||||
if url.split(':', 1)[0] not in PROTOCOL:
|
||||
url = 'http://' + url
|
||||
|
||||
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
||||
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||
|
||||
# escape spaces
|
||||
url = url.replace(' ', '%20')
|
||||
|
||||
# escape non-ascii unicode characters
|
||||
# https://stackoverflow.com/a/4391299
|
||||
parts = list(urlparse(url))
|
||||
|
||||
for i in range(len(parts)):
|
||||
if not is_ascii(parts[i]):
|
||||
if i == 1:
|
||||
parts[i] = parts[i].encode('idna').decode('ascii')
|
||||
|
||||
else:
|
||||
parts[i] = quote(parts[i].encode('utf-8'))
|
||||
|
||||
return urlunparse(parts)
|
||||
|
||||
|
||||
class RespDataHandler(BaseHandler):
|
||||
" Make it easier to use the reponse body "
|
||||
|
||||
def data_reponse(self, req, resp, data):
|
||||
pass
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# read data
|
||||
data = resp.read()
|
||||
|
||||
# process data and use returned content (if any)
|
||||
data = self.data_response(req, resp, data) or data
|
||||
|
||||
# reformat the stuff
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
|
||||
|
||||
class RespStrHandler(RespDataHandler):
|
||||
" Make it easier to use the _decoded_ reponse body "
|
||||
|
||||
def str_reponse(self, req, resp, data_str):
|
||||
pass
|
||||
|
||||
def data_response(self, req, resp, data):
|
||||
#decode
|
||||
enc = detect_encoding(data, resp)
|
||||
data_str = data.decode(enc, 'replace')
|
||||
|
||||
#process
|
||||
data_str = self.str_response(req, resp, data_str)
|
||||
|
||||
# return
|
||||
data = data_str.encode(enc) if data_str is not None else data
|
||||
|
||||
#return
|
||||
return data
|
||||
|
||||
|
||||
class DebugHandler(BaseHandler):
|
||||
handler_order = 2000
|
||||
|
||||
@@ -85,7 +255,7 @@ class SizeLimitHandler(BaseHandler):
|
||||
|
||||
handler_order = 450
|
||||
|
||||
def __init__(self, limit=5*1024^2):
|
||||
def __init__(self, limit=5*1024**2):
|
||||
self.limit = limit
|
||||
|
||||
def http_response(self, req, resp):
|
||||
@@ -106,32 +276,29 @@ def UnGzip(data):
|
||||
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
||||
|
||||
|
||||
class GZIPHandler(BaseHandler):
|
||||
class GZIPHandler(RespDataHandler):
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||
return req
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def data_response(self, req, resp, data):
|
||||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||
data = resp.read()
|
||||
|
||||
data = UnGzip(data)
|
||||
|
||||
resp.headers['Content-Encoding'] = 'identity'
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
https_request = http_request
|
||||
return UnGzip(data)
|
||||
|
||||
|
||||
def detect_encoding(data, resp=None):
|
||||
enc = detect_raw_encoding(data, resp)
|
||||
|
||||
if enc.lower() == 'gb2312':
|
||||
enc = 'gbk'
|
||||
|
||||
return enc
|
||||
|
||||
|
||||
def detect_raw_encoding(data, resp=None):
|
||||
if resp is not None:
|
||||
enc = resp.headers.get('charset')
|
||||
if enc is not None:
|
||||
@@ -156,32 +323,9 @@ def detect_encoding(data, resp=None):
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
class EncodingFixHandler(BaseHandler):
|
||||
def __init__(self, encoding=None):
|
||||
self.encoding = encoding
|
||||
|
||||
def http_response(self, req, resp):
|
||||
maintype = resp.info().get('Content-Type', '').split('/')[0]
|
||||
if 200 <= resp.code < 300 and maintype == 'text':
|
||||
data = resp.read()
|
||||
|
||||
if not self.encoding:
|
||||
enc = detect_encoding(data, resp)
|
||||
else:
|
||||
enc = self.encoding
|
||||
|
||||
if enc:
|
||||
data = data.decode(enc, 'replace')
|
||||
data = data.encode(enc)
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class EncodingFixHandler(RespStrHandler):
|
||||
def str_response(self, req, resp, data_str):
|
||||
return data_str
|
||||
|
||||
|
||||
class UAHandler(BaseHandler):
|
||||
@@ -196,83 +340,69 @@ class UAHandler(BaseHandler):
|
||||
https_request = http_request
|
||||
|
||||
|
||||
class AutoRefererHandler(BaseHandler):
|
||||
class BrowserlyHeaderHandler(BaseHandler):
|
||||
""" Add more headers to look less suspicious """
|
||||
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Referer', 'http://%s' % req.host)
|
||||
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
|
||||
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
|
||||
return req
|
||||
|
||||
https_request = http_request
|
||||
|
||||
|
||||
class ContentNegociationHandler(BaseHandler):
|
||||
" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
|
||||
def iter_html_tag(html_str, tag_name):
|
||||
" To avoid parsing whole pages when looking for a simple tag "
|
||||
|
||||
def __init__(self, accept=None, strict=False):
|
||||
self.accept = accept
|
||||
self.strict = strict
|
||||
re_tag = r'<%s(\s*[^>])*>' % tag_name
|
||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||
|
||||
def http_request(self, req):
|
||||
if self.accept is not None:
|
||||
if isinstance(self.accept, basestring):
|
||||
self.accept = (self.accept,)
|
||||
for tag_match in re.finditer(re_tag, html_str):
|
||||
attr_match = re.findall(re_attr, tag_match.group(0))
|
||||
|
||||
string = ','.join(self.accept)
|
||||
if attr_match is not None:
|
||||
yield dict(attr_match)
|
||||
|
||||
if self.strict:
|
||||
string += ',*/*;q=0.9'
|
||||
|
||||
req.add_unredirected_header('Accept', string)
|
||||
class AlternateHandler(RespStrHandler):
|
||||
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
|
||||
|
||||
return req
|
||||
def __init__(self, follow=None):
|
||||
self.follow = follow or []
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
|
||||
|
||||
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
|
||||
# oops, not what we were looking for, let's see if the html page suggests an alternative page of the right type
|
||||
|
||||
data = resp.read()
|
||||
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
|
||||
|
||||
for link in links:
|
||||
if link.get('type', '') in self.accept:
|
||||
for link in iter_html_tag(data_str[:10000], 'link'):
|
||||
if (link.get('rel') == 'alternate'
|
||||
and link.get('type') in self.follow
|
||||
and 'href' in link):
|
||||
resp.code = 302
|
||||
resp.msg = 'Moved Temporarily'
|
||||
resp.headers['location'] = link.get('href')
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_request = http_request
|
||||
https_response = http_response
|
||||
break
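A hedged sketch of wiring AlternateHandler into an opener; the URL is illustrative, and whether the injected 302 is then followed relies on urllib's default redirect handling:
# Sketch only: follow <link rel='alternate'> hints towards RSS/Atom
# when an HTML page comes back instead of a feed.
try:
    from urllib.request import build_opener  # python 3
except ImportError:
    from urllib2 import build_opener  # python 2

opener = build_opener(AlternateHandler(follow=['application/rss+xml', 'application/atom+xml']))
resp = opener.open('https://example.com/')  # illustrative URL
print(resp.url, resp.code)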
|
||||
|
||||
|
||||
class HTTPEquivHandler(BaseHandler):
|
||||
class HTTPEquivHandler(RespStrHandler):
|
||||
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
|
||||
|
||||
handler_order = 600
|
||||
|
||||
def http_response(self, req, resp):
|
||||
def str_response(self, req, resp, data_str):
|
||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
|
||||
data = resp.read()
|
||||
|
||||
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
|
||||
for meta in iter_html_tag(data_str[:10000], 'meta'):
|
||||
if 'http-equiv' in meta and 'content' in meta:
|
||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||
|
||||
for header in headers:
|
||||
resp.headers[header.get('http-equiv').lower()] = header.get('content')
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||
resp.msg = old_resp.msg
|
||||
|
||||
return resp
|
||||
|
||||
https_response = http_response
|
||||
class HTTPAllRedirectHandler(HTTPRedirectHandler):
|
||||
def http_error_308(self, req, fp, code, msg, headers):
|
||||
return self.http_error_301(req, fp, 301, msg, headers)
|
||||
|
||||
|
||||
class HTTPRefreshHandler(BaseHandler):
|
||||
@@ -297,139 +427,74 @@ class HTTPRefreshHandler(BaseHandler):
|
||||
https_response = http_response
|
||||
|
||||
|
||||
default_cache = {}
|
||||
|
||||
|
||||
class CacheHandler(BaseHandler):
|
||||
" Cache based on etags/last-modified "
|
||||
|
||||
private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
|
||||
private_cache = False # Websites can indicate whether the page should be
# cached by CDNs (e.g. this shouldn't be the case for
# private/confidential/user-specific pages).
# With this setting, decide whether you want the cache
# to behave like a CDN (False, i.e. don't cache private
# pages) or like an end-user cache (True, i.e. do cache
# private pages). If unsure, False is the safest bet.
|
||||
handler_order = 499
|
||||
|
||||
def __init__(self, cache=None, force_min=None):
|
||||
self.cache = cache or default_cache
|
||||
self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
|
||||
self.force_min = force_min
|
||||
# Servers indicate how long they think their content is "valid".
|
||||
# With this parameter (force_min, expressed in seconds), we can
|
||||
# override the validity period (i.e. bypassing http headers)
|
||||
# Special values:
|
||||
# -1: valid forever, i.e. use the cache no matter what (and fetch
|
||||
# the page online if not present in cache)
|
||||
# 0: valid zero second, i.e. force refresh
|
||||
# -2: same as -1, i.e. use the cache no matter what, but do NOT
|
||||
# fetch the page online if not present in cache, throw an
|
||||
# error instead
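For illustration, the force_min special values described above translate into constructor calls along these lines (the cache dict is a stand-in):
# Sketch only: the three common force_min configurations.
cache = {}  # any dict-like cache works

always_refresh = CacheHandler(cache, force_min=0)   # 0: bypass the cache, force a refresh
cache_forever  = CacheHandler(cache, force_min=-1)  # -1: use the cache no matter what,
                                                    #     fetch online only when missing
cache_only     = CacheHandler(cache, force_min=-2)  # -2: use the cache, never go online,
                                                    #     raise an error when missing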
|
||||
|
||||
def load(self, url):
|
||||
try:
|
||||
out = list(self.cache[url])
|
||||
data = pickle.loads(self.cache[url])
|
||||
|
||||
except KeyError:
|
||||
out = [None, None, unicode(), bytes(), 0]
|
||||
data = None
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
out[2] = email.message_from_string(out[2] or unicode()) # headers
|
||||
else:
|
||||
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
|
||||
|
||||
return out
|
||||
|
||||
def save(self, url, code, msg, headers, data, timestamp):
|
||||
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
|
||||
|
||||
def http_request(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
|
||||
if 'etag' in headers:
|
||||
req.add_unredirected_header('If-None-Match', headers['etag'])
|
||||
|
||||
if 'last-modified' in headers:
|
||||
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
|
||||
|
||||
return req
|
||||
|
||||
def http_open(self, req):
|
||||
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
|
||||
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
cache_age = time.time() - timestamp
|
||||
|
||||
# list in a simple way what to do when
|
||||
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
|
||||
# we're just in the middle of a dirty trick, use cache
|
||||
pass
|
||||
|
||||
elif self.force_min == -2:
|
||||
if code is not None:
|
||||
# already in cache, perfect, use cache
|
||||
pass
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
data['headers'] = email.message_from_string(data['headers'] or unicode()) # headers
|
||||
else:
|
||||
headers['Morss'] = 'from_cache'
|
||||
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
|
||||
resp.msg = 'Conflict'
|
||||
return resp
|
||||
data['headers'] = mimetools.Message(StringIO(data['headers'] or unicode()))
|
||||
|
||||
elif code is None:
|
||||
# cache empty, refresh
|
||||
return None
|
||||
return data
|
||||
|
||||
elif self.force_min == -1:
|
||||
# force use cache
|
||||
pass
|
||||
def save(self, key, data):
|
||||
data['headers'] = unicode(data['headers'])
|
||||
self.cache[key] = pickle.dumps(data, 0)
|
||||
|
||||
elif self.force_min == 0:
|
||||
# force refresh
|
||||
return None
|
||||
def is_cached(self, key):
|
||||
return self.load(key) is not None
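A sketch of the cache entry implied by the new load()/save() pair; the sample values are hypothetical, the keys follow the diff:
# Sketch only: what save() pickles under the URL key (headers are
# stringified on save and re-parsed into a message object by load()).
entry = {
    'code': 200,                               # HTTP status code
    'msg': 'OK',                               # HTTP reason phrase
    'headers': 'Content-Type: text/html\r\n',  # stored as a string
    'data': b'<html>...</html>',               # raw body bytes
    'timestamp': 1600000000.0,                 # time.time() at save
}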
|
||||
|
||||
elif code == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
|
||||
# use force_min=0 if you want to bypass this (needed for a proper refresh)
|
||||
pass
|
||||
|
||||
elif self.force_min is None and ('no-cache' in cc_list
|
||||
or 'no-store' in cc_list
|
||||
or ('private' in cc_list and not self.private)):
|
||||
# kindly follow web servers indications, refresh
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust it; if not, use force_min=0), use cache
|
||||
pass
|
||||
|
||||
elif self.force_min is not None and self.force_min > cache_age:
|
||||
# still recent enough for us, use cache
|
||||
pass
|
||||
|
||||
else:
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
def cached_response(self, req):
|
||||
# this does NOT check whether it's already cached, use with care
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
# return the cache as a response
|
||||
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
|
||||
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
|
||||
resp.msg = msg
|
||||
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
|
||||
resp.msg = data['msg']
|
||||
|
||||
return resp
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
|
||||
|
||||
if resp.code == 304:
|
||||
return resp
|
||||
|
||||
if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
|
||||
# kindly follow web servers indications
|
||||
return resp
|
||||
|
||||
if resp.headers.get('Morss') == 'from_cache':
|
||||
# it comes from cache, so no need to save it again
|
||||
return resp
|
||||
|
||||
# save to disk
|
||||
def save_response(self, req, resp):
|
||||
data = resp.read()
|
||||
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
|
||||
|
||||
self.save(req.get_full_url(), {
|
||||
'code': resp.code,
|
||||
'msg': resp.msg,
|
||||
'headers': resp.headers,
|
||||
'data': data,
|
||||
'timestamp': time.time()
|
||||
})
|
||||
|
||||
fp = BytesIO(data)
|
||||
old_resp = resp
|
||||
@@ -438,107 +503,125 @@ class CacheHandler(BaseHandler):
|
||||
|
||||
return resp
|
||||
|
||||
def http_error_304(self, req, fp, code, msg, headers):
|
||||
cache = list(self.load(req.get_full_url()))
|
||||
def http_request(self, req):
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if cache[0]:
|
||||
cache[-1] = time.time()
|
||||
self.save(req.get_full_url(), *cache)
|
||||
if data is not None:
|
||||
if 'etag' in data['headers']:
|
||||
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
|
||||
|
||||
new = Request(req.get_full_url(),
|
||||
headers=req.headers,
|
||||
unverifiable=True)
|
||||
if 'last-modified' in data['headers']:
|
||||
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
|
||||
|
||||
new.add_unredirected_header('Morss', 'from_304')
|
||||
return req
|
||||
|
||||
return self.parent.open(new, timeout=req.timeout)
|
||||
def http_open(self, req):
|
||||
# Reminder of how/when this function is called by urllib2:
|
||||
# If 'None' is returned, try your chance with the next-available handler
|
||||
# If a 'resp' is returned, stop there, and proceed with 'http_response'
|
||||
|
||||
return None
|
||||
data = self.load(req.get_full_url())
|
||||
|
||||
if data is None:
|
||||
# cache empty, refresh
|
||||
return None
|
||||
|
||||
# some info needed to process everything
|
||||
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
|
||||
cache_control += parse_http_list(data['headers'].get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
|
||||
|
||||
cache_age = time.time() - data['timestamp']
|
||||
|
||||
# list in a simple way what to do when
|
||||
if self.force_min == -2:
|
||||
if data['code'] is not None:
|
||||
# already in cache, perfect, use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
else:
|
||||
# raise an error, via urllib handlers
|
||||
resp = addinfourl(BytesIO(), data['headers'], req.get_full_url(), 409)
|
||||
resp.msg = 'Conflict'
|
||||
return resp
|
||||
|
||||
elif self.force_min == -1:
|
||||
# force use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min == 0:
|
||||
# force refresh
|
||||
return None
|
||||
|
||||
elif data['code'] == 301 and cache_age < 7*24*3600:
|
||||
# "301 Moved Permanently" has to be cached...as long as we want
|
||||
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
|
||||
# if you want to bypass this (needed for a proper refresh)
|
||||
return self.cached_response(req)
|
||||
|
||||
elif (self.force_min is None or self.force_min > 0) and ('no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache)):
|
||||
# kindly follow web servers indications, refresh
|
||||
# if the same settings are used all along, this section shouldn't be
|
||||
# of any use, since the page wouldn't be cached in the first place
|
||||
# the check is only performed "just in case"
|
||||
return None
|
||||
|
||||
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
|
||||
# server says it's still fine (and we trust it; if not, use force_min=0), use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif self.force_min is not None and self.force_min > cache_age:
|
||||
# still recent enough for us, use cache
|
||||
return self.cached_response(req)
|
||||
|
||||
else:
|
||||
# according to the www, we have to refresh when nothing is said
|
||||
return None
|
||||
|
||||
def http_response(self, req, resp):
|
||||
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
|
||||
# NB. It might re-save requests pulled from cache, which will reset the time() to the latest, i.e. lengthen its useful life
|
||||
|
||||
if resp.code == 304 and self.is_cached(resp.url):
|
||||
# we are hopefully the first after the HTTP handler, so no need
|
||||
# to re-run all the *_response
|
||||
# here: cached page, returning from cache
|
||||
return self.cached_response(req)
|
||||
|
||||
elif ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
|
||||
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
|
||||
cache_control += parse_http_list(resp.headers.get('pragma', ()))
|
||||
|
||||
cc_list = [x for x in cache_control if '=' not in x]
|
||||
|
||||
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
|
||||
# kindly follow web servers indications (do not save & return)
|
||||
return resp
|
||||
|
||||
else:
|
||||
# save
|
||||
return self.save_response(req, resp)
|
||||
|
||||
else:
|
||||
return self.save_response(req, resp)
|
||||
|
||||
https_request = http_request
|
||||
https_open = http_open
|
||||
https_response = http_response
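A minimal sketch of the finished CacheHandler in use (backing cache and URL are illustrative):
# Sketch only: repeated fetches within the validity window are answered
# straight from the cache by http_open().
try:
    from urllib.request import build_opener  # python 3
except ImportError:
    from urllib2 import build_opener  # python 2

cache = {}  # same shape as default_cache
opener = build_opener(CacheHandler(cache=cache, force_min=10 * 60))

first = opener.open('https://example.com/feed.xml').read()   # network fetch, saved to cache
second = opener.open('https://example.com/feed.xml').read()  # served from cache for 10 minutes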
|
||||
|
||||
|
||||
class BaseCache:
|
||||
def __contains__(self, url):
|
||||
try:
|
||||
self[url]
|
||||
|
||||
except KeyError:
|
||||
return False
|
||||
|
||||
else:
|
||||
return True
|
||||
if 'IGNORE_SSL' in os.environ:
|
||||
import ssl
|
||||
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
||||
|
||||
import sqlite3
|
||||
if __name__ == '__main__':
|
||||
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `req`')
|
||||
|
||||
class SQLiteCache(BaseCache):
|
||||
def __init__(self, filename=':memory:'):
|
||||
self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
|
||||
|
||||
with self.con:
|
||||
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
|
||||
self.con.execute('pragma journal_mode=WAL')
|
||||
|
||||
def __del__(self):
|
||||
self.con.close()
|
||||
|
||||
def __getitem__(self, url):
|
||||
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
|
||||
value = list(value)
|
||||
value[3] = sqlite3.Binary(value[3]) # data
|
||||
value = tuple(value)
|
||||
|
||||
if url in self:
|
||||
with self.con:
|
||||
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
|
||||
value + (url,))
|
||||
|
||||
else:
|
||||
with self.con:
|
||||
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
|
||||
|
||||
|
||||
import pymysql.cursors
|
||||
|
||||
|
||||
class MySQLCacheHandler(BaseCache):
|
||||
" NB. Requires mono-threading, as pymysql isn't thread-safe "
|
||||
def __init__(self, user, password, database, host='localhost'):
|
||||
self.con = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', autocommit=True)
|
||||
|
||||
with self.con.cursor() as cursor:
|
||||
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
|
||||
|
||||
def __del__(self):
|
||||
self.con.close()
|
||||
|
||||
def __getitem__(self, url):
|
||||
cursor = self.con.cursor()
|
||||
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
|
||||
row = cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
raise KeyError
|
||||
|
||||
return row[1:]
|
||||
|
||||
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
|
||||
if url in self:
|
||||
with self.con.cursor() as cursor:
|
||||
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
|
||||
value + (url,))
|
||||
|
||||
else:
|
||||
with self.con.cursor() as cursor:
|
||||
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
|
||||
else:
|
||||
print(req['data'].decode(req['encoding']))
|
||||
|
@@ -73,7 +73,7 @@ item_updated = atom03:updated
|
||||
mode = json
|
||||
|
||||
mimetype = application/json
|
||||
timeformat = %Y-%m-%dT%H:%M:%SZ
|
||||
timeformat = %Y-%m-%dT%H:%M:%S%z
|
||||
base = {}
|
||||
|
||||
title = title
|
||||
@@ -90,8 +90,11 @@ item_updated = updated
|
||||
[html]
|
||||
mode = html
|
||||
|
||||
path =
|
||||
http://localhost/
|
||||
|
||||
title = //div[@id='header']/h1
|
||||
desc = //div[@id='header']/h2
|
||||
desc = //div[@id='header']/p
|
||||
items = //div[@id='content']/div
|
||||
|
||||
item_title = ./a
|
||||
@@ -99,7 +102,7 @@ item_link = ./a/@href
|
||||
item_desc = ./div[class=desc]
|
||||
item_content = ./div[class=content]
|
||||
|
||||
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
|
||||
base = file:sheet.xsl
|
||||
|
||||
[twitter]
|
||||
mode = html
|
||||
|
@@ -1,28 +0,0 @@
|
||||
import re
|
||||
import json
|
||||
|
||||
from . import crawler
|
||||
|
||||
try:
|
||||
basestring
|
||||
except NameError:
|
||||
basestring = str
|
||||
|
||||
|
||||
def pre_worker(url):
|
||||
if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
|
||||
match = re.search('/id([0-9]+)(\?.*)?$', url)
|
||||
if match:
|
||||
iid = match.groups()[0]
|
||||
redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
|
||||
|
||||
try:
|
||||
con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
|
||||
data = con.read()
|
||||
|
||||
except (IOError, HTTPException):
|
||||
raise
|
||||
|
||||
return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
|
||||
|
||||
return None
|
298
morss/feeds.py
@@ -1,31 +1,47 @@
|
||||
import sys
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
import re
|
||||
import json
|
||||
import csv
|
||||
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatch
|
||||
|
||||
from lxml import etree
|
||||
from dateutil import tz
|
||||
import dateutil.parser
|
||||
from copy import deepcopy
|
||||
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
from lxml import etree
|
||||
|
||||
from .readabilite import parse as html_parse
|
||||
|
||||
json.encoder.c_make_encoder = None
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from StringIO import StringIO
|
||||
from ConfigParser import RawConfigParser
|
||||
from StringIO import StringIO
|
||||
except ImportError:
|
||||
# python 3
|
||||
from io import StringIO
|
||||
from configparser import RawConfigParser
|
||||
from io import StringIO
|
||||
|
||||
try:
|
||||
# python 2
|
||||
@@ -45,62 +61,80 @@ def parse_rules(filename=None):
|
||||
rules = dict([(x, dict(config.items(x))) for x in config.sections()])
|
||||
|
||||
for section in rules.keys():
|
||||
# for each ruleset
|
||||
|
||||
for arg in rules[section].keys():
|
||||
if '\n' in rules[section][arg]:
|
||||
# for each rule
|
||||
|
||||
if rules[section][arg].startswith('file:'):
|
||||
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
|
||||
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
file_raw = open(path).read()
|
||||
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
|
||||
rules[section][arg] = file_clean
|
||||
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
elif '\n' in rules[section][arg]:
|
||||
rules[section][arg] = rules[section][arg].split('\n')[1:]
|
||||
|
||||
return rules
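A hedged sketch of what parse_rules() now returns for the [html] ruleset shown further down (file: values expanded, multi-line values split into lists):
# Sketch only: reading the built-in rulesets.
rules = parse_rules()

rules['html']['base']   # contents of sheet.xsl as a str (file: syntax, xsl/xml wrapper tags stripped)
rules['html']['path']   # ['http://localhost/'] -- one entry per extra line
rules['html']['title']  # "//div[@id='header']/h1" -- single-line values stay strings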
|
||||
|
||||
|
||||
def parse(data, url=None, mimetype=None):
|
||||
def parse(data, url=None, encoding=None, ruleset=None):
|
||||
" Determine which ruleset to use "
|
||||
|
||||
rulesets = parse_rules()
|
||||
if ruleset is not None:
|
||||
rulesets = [ruleset]
|
||||
|
||||
else:
|
||||
rulesets = parse_rules().values()
|
||||
|
||||
parsers = [FeedXML, FeedHTML, FeedJSON]
|
||||
|
||||
# 1) Look for a ruleset based on path
|
||||
|
||||
if url is not None:
|
||||
for ruleset in rulesets.values():
|
||||
for ruleset in rulesets:
|
||||
if 'path' in ruleset:
|
||||
for path in ruleset['path']:
|
||||
if fnmatch(url, path):
|
||||
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
|
||||
return parser(data, ruleset)
|
||||
return parser(data, ruleset, encoding=encoding)
|
||||
|
||||
# 2) Look for a parser based on mimetype
|
||||
|
||||
if mimetype is not None:
|
||||
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
|
||||
|
||||
if mimetype is None or parser_candidates is None:
|
||||
parser_candidates = parsers
|
||||
# 2) Try each and every parser
|
||||
|
||||
# 3) Look for working ruleset for given parser
|
||||
# 3a) See if parsing works
|
||||
# 3b) See if .items matches anything
|
||||
|
||||
for parser in parser_candidates:
|
||||
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
|
||||
for parser in parsers:
|
||||
try:
|
||||
feed = parser(data)
|
||||
feed = parser(data, encoding=encoding)
|
||||
|
||||
except (ValueError):
|
||||
except (ValueError, SyntaxError):
|
||||
# parsing did not work
|
||||
pass
|
||||
|
||||
else:
|
||||
# parsing worked, now we try the rulesets
|
||||
|
||||
ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
|
||||
# 'path' as they should have been caught beforehand
|
||||
# try anyway if no 'mode' specified
|
||||
|
||||
for ruleset in ruleset_candidates:
|
||||
feed.rules = ruleset
|
||||
|
||||
try:
|
||||
feed.items[0]
|
||||
|
||||
except (AttributeError, IndexError):
|
||||
except (AttributeError, IndexError, TypeError):
|
||||
# parsing and or item picking did not work out
|
||||
pass
|
||||
|
||||
@@ -112,7 +146,7 @@ def parse(data, url=None, mimetype=None):
|
||||
|
||||
|
||||
class ParserBase(object):
|
||||
def __init__(self, data=None, rules=None, parent=None):
|
||||
def __init__(self, data=None, rules=None, parent=None, encoding=None):
|
||||
if rules is None:
|
||||
rules = parse_rules()[self.default_ruleset]
|
||||
|
||||
@@ -121,9 +155,10 @@ class ParserBase(object):
|
||||
if data is None:
|
||||
data = rules['base']
|
||||
|
||||
self.root = self.parse(data)
|
||||
self.parent = parent
|
||||
self.encoding = encoding
|
||||
|
||||
self.root = self.parse(data)
|
||||
|
||||
def parse(self, raw):
|
||||
pass
|
||||
@@ -148,15 +183,15 @@ class ParserBase(object):
|
||||
c = csv.writer(out, dialect=csv.excel)
|
||||
|
||||
for item in self.items:
|
||||
row = [getattr(item, x) for x in item.dic]
|
||||
|
||||
if encoding != 'unicode':
|
||||
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
|
||||
|
||||
c.writerow(row)
|
||||
c.writerow([getattr(item, x) for x in item.dic])
|
||||
|
||||
out.seek(0)
|
||||
return out.read()
|
||||
out = out.read()
|
||||
|
||||
if encoding != 'unicode':
|
||||
out = out.encode(encoding)
|
||||
|
||||
return out
|
||||
|
||||
def tohtml(self, **k):
|
||||
return self.convert(FeedHTML).tostring(**k)
|
||||
@@ -267,8 +302,15 @@ class ParserBase(object):
|
||||
|
||||
except AttributeError:
|
||||
# does not exist, have to create it
|
||||
self.rule_create(self.rules[rule_name])
|
||||
self.rule_set(self.rules[rule_name], value)
|
||||
try:
|
||||
self.rule_create(self.rules[rule_name])
|
||||
|
||||
except AttributeError:
|
||||
# no way to create it, give up
|
||||
pass
|
||||
|
||||
else:
|
||||
self.rule_set(self.rules[rule_name], value)
|
||||
|
||||
def rmv(self, rule_name):
|
||||
# easy deleter
|
||||
@@ -286,10 +328,7 @@ class ParserXML(ParserBase):
|
||||
|
||||
NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
|
||||
'atom03': 'http://purl.org/atom/ns#',
|
||||
'media': 'http://search.yahoo.com/mrss/',
|
||||
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
|
||||
'slash': 'http://purl.org/rss/1.0/modules/slash/',
|
||||
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||
'content': 'http://purl.org/rss/1.0/modules/content/',
|
||||
'rssfake': 'http://purl.org/rss/1.0/'}
|
||||
|
||||
@@ -301,7 +340,7 @@ class ParserXML(ParserBase):
|
||||
return self.root.getparent().remove(self.root)
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
return etree.tostring(self.root, encoding=encoding, **k)
|
||||
return etree.tostring(self.root, encoding=encoding, method='xml', **k)
|
||||
|
||||
def _rule_parse(self, rule):
|
||||
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
|
||||
@@ -383,7 +422,8 @@ class ParserXML(ParserBase):
|
||||
return
|
||||
|
||||
elif key is not None:
|
||||
del x.attrib[key]
|
||||
if key in match.attrib:
|
||||
del match.attrib[key]
|
||||
|
||||
else:
|
||||
match.getparent().remove(match)
|
||||
@@ -401,13 +441,14 @@ class ParserXML(ParserBase):
|
||||
|
||||
else:
|
||||
if html_rich:
|
||||
# atom stuff
|
||||
if 'atom' in rule:
|
||||
match.attrib['type'] = 'xhtml'
|
||||
|
||||
self._clean_node(match)
|
||||
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
|
||||
match.find('div').drop_tag()
|
||||
|
||||
if self.rules['mode'] == 'html':
|
||||
match.find('div').drop_tag() # not supported by lxml.etree
|
||||
|
||||
else: # i.e. if atom
|
||||
match.attrib['type'] = 'xhtml'
|
||||
|
||||
else:
|
||||
if match is not None and len(match):
|
||||
@@ -419,7 +460,7 @@ class ParserXML(ParserBase):
|
||||
def rule_str(self, rule):
|
||||
match = self.rule_search(rule)
|
||||
|
||||
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
|
||||
html_rich = ('atom' in rule or self.mode == 'html') \
|
||||
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
|
||||
|
||||
if isinstance(match, etree._Element):
|
||||
@@ -440,11 +481,10 @@ class ParserHTML(ParserXML):
|
||||
mimetype = ['text/html', 'application/xhtml+xml']
|
||||
|
||||
def parse(self, raw):
|
||||
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
|
||||
return etree.fromstring(raw, parser)
|
||||
return html_parse(raw, encoding=self.encoding)
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
return lxml.html.tostring(self.root, encoding=encoding, **k)
|
||||
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)
|
||||
|
||||
def rule_search_all(self, rule):
|
||||
try:
|
||||
@@ -467,26 +507,36 @@ class ParserHTML(ParserXML):
|
||||
element = deepcopy(match)
|
||||
match.getparent().append(element)
|
||||
|
||||
else:
|
||||
raise AttributeError('no way to create item')
|
||||
|
||||
|
||||
def parse_time(value):
|
||||
# parsing per se
|
||||
if value is None or value == 0:
|
||||
return None
|
||||
time = None
|
||||
|
||||
elif isinstance(value, basestring):
|
||||
if re.match(r'^[0-9]+$', value):
|
||||
return datetime.fromtimestamp(int(value), tz.UTC)
|
||||
time = datetime.fromtimestamp(int(value))
|
||||
|
||||
else:
|
||||
return dateutil.parser.parse(value)
|
||||
time = dateutil.parser.parse(value)
|
||||
|
||||
elif isinstance(value, int):
|
||||
return datetime.fromtimestamp(value, tz.UTC)
|
||||
time = datetime.fromtimestamp(value)
|
||||
|
||||
elif isinstance(value, datetime):
|
||||
return value
|
||||
time = value
|
||||
|
||||
else:
|
||||
return None
|
||||
time = None
|
||||
|
||||
# add default time zone if none set
|
||||
if time is not None and time.tzinfo is None:
|
||||
time = time.replace(tzinfo=tz.tzutc())
|
||||
|
||||
return time
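A few illustrative calls against parse_time(); every accepted input shape comes back as a timezone-aware datetime (UTC attached when no zone is given), or None:
# Sketch only: the input shapes parse_time() accepts.
parse_time('1600000000')                   # epoch as a digit string
parse_time(1600000000)                     # epoch as an int
parse_time('2020-09-13T12:26:40+02:00')    # ISO 8601 string, zone kept
parse_time(datetime(2020, 9, 13, 12, 26))  # naive datetime -> tzutc() attached
parse_time(None)                           # -> None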
|
||||
|
||||
|
||||
class ParserJSON(ParserBase):
|
||||
@@ -587,34 +637,41 @@ class ParserJSON(ParserBase):
|
||||
return out.replace('\n', '<br/>') if out else out
|
||||
|
||||
|
||||
class Uniq(object):
|
||||
_map = {}
|
||||
_id = None
|
||||
def wrap_uniq(wrapper_fn_name):
|
||||
" Wraps the output of the function with the specified function "
|
||||
# This is called when parsing "wrap_uniq('wrap_item')"
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
# check if a wrapper was already created for it
|
||||
# if so, reuse it
|
||||
# if not, create a new one
|
||||
# note that the item itself (the tree node) is created beforehand
|
||||
def decorator(func):
|
||||
# This is called when parsing "@wrap_uniq('wrap_item')"
|
||||
|
||||
tmp_id = cls._gen_id(*args, **kwargs)
|
||||
if tmp_id in cls._map:
|
||||
return cls._map[tmp_id]
|
||||
def wrapped_func(self, *args, **kwargs):
|
||||
# This is called when the wrapped function is called
|
||||
|
||||
else:
|
||||
obj = object.__new__(cls) #, *args, **kwargs)
|
||||
cls._map[tmp_id] = obj
|
||||
return obj
|
||||
output = func(self, *args, **kwargs)
|
||||
output_id = id(output)
|
||||
|
||||
try:
|
||||
return self._map[output_id]
|
||||
|
||||
except (KeyError, AttributeError):
|
||||
if not hasattr(self, '_map'):
|
||||
self._map = {}
|
||||
|
||||
wrapper_fn = getattr(self, wrapper_fn_name)
|
||||
obj = wrapper_fn(output)
|
||||
self._map[output_id] = obj
|
||||
|
||||
return obj
|
||||
|
||||
return wrapped_func
|
||||
|
||||
return decorator
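A small sketch of what wrap_uniq is for, using stand-in names (the real use is Feed.__getitem__ wrapping raw tree nodes into Item objects):
# Sketch only: the wrapper object built for a given raw node is cached,
# so indexing the same item twice returns the same instance.
class DemoFeed(object):
    def wrap_item(self, raw):
        return ('wrapped', raw)  # stand-in for Item(raw, self.rules, self)

    @wrap_uniq('wrap_item')
    def __getitem__(self, key):
        return self._raw_items[key]  # stand-in for self.get_raw('items')[key]

feed = DemoFeed()
feed._raw_items = ['a', 'b']
assert feed[0] is feed[0]  # same wrapper object both times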
|
||||
|
||||
|
||||
class Feed(object):
|
||||
itemsClass = 'Item'
|
||||
itemsClass = property(lambda x: Item) # because Item is defined below, i.e. afterwards
|
||||
dic = ('title', 'desc', 'items')
|
||||
|
||||
def wrap_items(self, items):
|
||||
itemsClass = globals()[self.itemsClass]
|
||||
return [itemsClass(x, self.rules, self) for x in items]
|
||||
|
||||
title = property(
|
||||
lambda f: f.get('title'),
|
||||
lambda f,x: f.set('title', x),
|
||||
@@ -630,10 +687,7 @@ class Feed(object):
|
||||
self.rule_create(self.rules['items'])
|
||||
item = self.items[-1]
|
||||
|
||||
if new is None:
|
||||
return
|
||||
|
||||
for attr in globals()[self.itemsClass].dic:
|
||||
for attr in self.itemsClass.dic:
|
||||
try:
|
||||
setattr(item, attr, getattr(new, attr))
|
||||
|
||||
@@ -644,8 +698,14 @@ class Feed(object):
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
return item
|
||||
|
||||
def wrap_item(self, item):
|
||||
return self.itemsClass(item, self.rules, self)
|
||||
|
||||
@wrap_uniq('wrap_item')
|
||||
def __getitem__(self, key):
|
||||
return self.wrap_items(self.get_raw('items'))[key]
|
||||
return self.get_raw('items')[key]
|
||||
|
||||
def __delitem__(self, key):
|
||||
self[key].remove()
|
||||
@@ -654,7 +714,7 @@ class Feed(object):
|
||||
return len(self.get_raw('items'))
|
||||
|
||||
|
||||
class Item(Uniq):
|
||||
class Item(object):
|
||||
dic = ('title', 'link', 'desc', 'content', 'time', 'updated')
|
||||
|
||||
def __init__(self, xml=None, rules=None, parent=None):
|
||||
@@ -693,32 +753,45 @@ class Item(Uniq):
|
||||
lambda f: f.rmv('item_updated') )
|
||||
|
||||
|
||||
class FeedXML(Feed, ParserXML):
|
||||
itemsClass = 'ItemXML'
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
# override needed due to "getroottree" inclusion
|
||||
|
||||
if self.root.getprevious() is None:
|
||||
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
|
||||
|
||||
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
|
||||
|
||||
|
||||
class ItemXML(Item, ParserXML):
|
||||
pass
|
||||
|
||||
|
||||
class FeedHTML(Feed, ParserHTML):
|
||||
itemsClass = 'ItemHTML'
|
||||
class FeedXML(Feed, ParserXML):
|
||||
itemsClass = ItemXML
|
||||
|
||||
def root_siblings(self):
|
||||
out = []
|
||||
current = self.root.getprevious()
|
||||
|
||||
while current is not None:
|
||||
out.append(current)
|
||||
current = current.getprevious()
|
||||
|
||||
return out
|
||||
|
||||
def tostring(self, encoding='unicode', **k):
|
||||
# override needed due to "getroottree" inclusion
|
||||
# and to add stylesheet
|
||||
|
||||
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
|
||||
|
||||
for stylesheet in stylesheets:
|
||||
# remove all stylesheets present (be that ours or others')
|
||||
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
|
||||
self.root.remove(stylesheet)
|
||||
|
||||
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
|
||||
|
||||
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
|
||||
|
||||
|
||||
class ItemHTML(Item, ParserHTML):
|
||||
pass
|
||||
|
||||
|
||||
class FeedJSON(Feed, ParserJSON):
|
||||
itemsClass = 'ItemJSON'
|
||||
class FeedHTML(Feed, ParserHTML):
|
||||
itemsClass = ItemHTML
|
||||
|
||||
|
||||
class ItemJSON(Item, ParserJSON):
|
||||
@@ -732,3 +805,20 @@ class ItemJSON(Item, ParserJSON):
|
||||
return
|
||||
|
||||
cur = cur[node]
|
||||
|
||||
class FeedJSON(Feed, ParserJSON):
|
||||
itemsClass = ItemJSON
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
|
||||
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `feed`')
|
||||
|
||||
else:
|
||||
for item in feed.items:
|
||||
print(item.title, item.link)
|
||||
|
496
morss/morss.py
@@ -1,78 +1,71 @@
|
||||
import sys
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import os.path
|
||||
import time
|
||||
|
||||
import threading
|
||||
|
||||
from fnmatch import fnmatch
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatch
|
||||
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
from dateutil import tz
|
||||
|
||||
from . import feeds
|
||||
from . import feedify
|
||||
from . import crawler
|
||||
from . import readabilite
|
||||
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.handlers
|
||||
|
||||
from . import caching, crawler, feeds, readabilite
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from Queue import Queue
|
||||
from httplib import HTTPException
|
||||
from urllib import quote_plus
|
||||
from urlparse import urlparse, urljoin, parse_qs
|
||||
from urlparse import parse_qs, urljoin, urlparse
|
||||
except ImportError:
|
||||
# python 3
|
||||
from queue import Queue
|
||||
from http.client import HTTPException
|
||||
from urllib.parse import quote_plus
|
||||
from urllib.parse import urlparse, urljoin, parse_qs
|
||||
|
||||
LIM_ITEM = 100 # deletes what's beyond
|
||||
LIM_TIME = 7 # deletes what's after
|
||||
MAX_ITEM = 50 # cache-only beyond
|
||||
MAX_TIME = 7 # cache-only after (in sec)
|
||||
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
|
||||
TIMEOUT = 4 # http timeout (in sec)
|
||||
THREADS = 10 # number of threads (1 for single-threaded)
|
||||
|
||||
DEBUG = False
|
||||
PORT = 8080
|
||||
|
||||
PROTOCOL = ['http', 'https', 'ftp']
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
|
||||
def filterOptions(options):
|
||||
return options
|
||||
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
|
||||
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
|
||||
|
||||
# example of filtering code below
|
||||
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
|
||||
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
|
||||
|
||||
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
|
||||
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
|
||||
|
||||
#return filtered
|
||||
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
|
||||
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
|
||||
|
||||
|
||||
class MorssException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def log(txt, force=False):
|
||||
if DEBUG or force:
|
||||
def log(txt):
|
||||
if 'DEBUG' in os.environ:
|
||||
if 'REQUEST_URI' in os.environ:
|
||||
# when running on Apache
|
||||
open('morss.log', 'a').write("%s\n" % repr(txt))
|
||||
|
||||
else:
|
||||
# when using internal server or cli
|
||||
print(repr(txt))
|
||||
|
||||
|
||||
def len_html(txt):
|
||||
if len(txt):
|
||||
return len(lxml.html.fromstring(txt).text_content())
|
||||
|
||||
else:
|
||||
return 0
|
||||
|
||||
@@ -80,6 +73,7 @@ def len_html(txt):
|
||||
def count_words(txt):
|
||||
if len(txt):
|
||||
return len(lxml.html.fromstring(txt).text_content().split())
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
@@ -88,14 +82,16 @@ class Options:
|
||||
if len(args):
|
||||
self.options = args
|
||||
self.options.update(options or {})
|
||||
|
||||
else:
|
||||
self.options = options or {}
|
||||
|
||||
def __getattr__(self, key):
|
||||
def __getattr__(self, key, default=None):
|
||||
if key in self.options:
|
||||
return self.options[key]
|
||||
|
||||
else:
|
||||
return False
|
||||
return default
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.options[key] = value
|
||||
@@ -103,29 +99,14 @@ class Options:
|
||||
def __contains__(self, key):
|
||||
return key in self.options
|
||||
|
||||
|
||||
def parseOptions(options):
|
||||
""" Turns ['md=True'] into {'md':True} """
|
||||
out = {}
|
||||
for option in options:
|
||||
split = option.split('=', 1)
|
||||
if len(split) > 1:
|
||||
if split[0].lower() == 'true':
|
||||
out[split[0]] = True
|
||||
elif split[0].lower() == 'false':
|
||||
out[split[0]] = False
|
||||
else:
|
||||
out[split[0]] = split[1]
|
||||
else:
|
||||
out[split[0]] = True
|
||||
return out
|
||||
get = __getitem__ = __getattr__
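A small sketch of the reworked Options accessors (option names illustrative); unknown keys now come back as None instead of False, and attribute, item and get() access are interchangeable:
# Sketch only
options = Options({'format': 'json', 'indent': True})

options.format          # 'json'
options['indent']       # True
options.get('missing')  # None, no KeyError/AttributeError
'format' in options     # True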
|
||||
|
||||
|
||||
def ItemFix(item, feedurl='/'):
|
||||
def ItemFix(item, options, feedurl='/'):
|
||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||
|
||||
# check unwanted uppercase title
|
||||
if len(item.title) > 20 and item.title.isupper():
|
||||
if item.title is not None and len(item.title) > 20 and item.title.isupper():
|
||||
item.title = item.title.title()
|
||||
|
||||
# check if it includes link
|
||||
@@ -140,6 +121,13 @@ def ItemFix(item, feedurl='/'):
|
||||
item.link = match[0]
|
||||
log(item.link)
|
||||
|
||||
# at user's election, use first <a>
|
||||
if options.firstlink and (item.desc or item.content):
|
||||
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
|
||||
if len(match):
|
||||
item.link = match[0]
|
||||
log(item.link)
|
||||
|
||||
# check relative urls
|
||||
item.link = urljoin(feedurl, item.link)
|
||||
|
||||
@@ -158,6 +146,11 @@ def ItemFix(item, feedurl='/'):
|
||||
item.link = parse_qs(urlparse(item.link).query)['url'][0]
|
||||
log(item.link)
|
||||
|
||||
# pocket
|
||||
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
|
||||
item.link = parse_qs(urlparse(item.link).query)['url'][0]
|
||||
log(item.link)
|
||||
|
||||
# facebook
|
||||
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
|
||||
item.link = parse_qs(urlparse(item.link).query)['u'][0]
|
||||
@@ -183,7 +176,7 @@ def ItemFix(item, feedurl='/'):
|
||||
|
||||
# reddit
|
||||
if urlparse(feedurl).netloc == 'www.reddit.com':
|
||||
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
|
||||
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
|
||||
if len(match):
|
||||
item.link = match[0]
|
||||
log(item.link)
|
||||
@@ -196,59 +189,47 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||
|
||||
if not item.link:
|
||||
log('no link')
|
||||
return item
|
||||
return True
|
||||
|
||||
log(item.link)
|
||||
|
||||
link = item.link
|
||||
|
||||
# twitter
|
||||
if urlparse(feedurl).netloc == 'twitter.com':
|
||||
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
|
||||
if len(match):
|
||||
link = match[0]
|
||||
log(link)
|
||||
else:
|
||||
link = None
|
||||
|
||||
# facebook
|
||||
if urlparse(feedurl).netloc == 'graph.facebook.com':
|
||||
match = lxml.html.fromstring(item.content).xpath('//a/@href')
|
||||
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
|
||||
link = match[0]
|
||||
log(link)
|
||||
else:
|
||||
link = None
|
||||
|
||||
if link is None:
|
||||
log('no used link')
|
||||
return True
|
||||
|
||||
# download
|
||||
delay = -1
|
||||
|
||||
if fast:
|
||||
# super-fast mode
|
||||
if fast or options.fast:
|
||||
# force cache, don't fetch
|
||||
delay = -2
|
||||
|
||||
elif options.force:
|
||||
# force refresh
|
||||
delay = 0
|
||||
|
||||
else:
|
||||
delay = 24*60*60 # 24h
|
||||
|
||||
try:
|
||||
con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
|
||||
data = con.read()
|
||||
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
|
||||
|
||||
except (IOError, HTTPException) as e:
|
||||
log('http error')
|
||||
return False # let's just delete errors stuff when in cache mode
|
||||
|
||||
contenttype = con.info().get('Content-Type', '').split(';')[0]
|
||||
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
|
||||
if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
|
||||
log('non-text page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
|
||||
if not req['data']:
|
||||
log('empty page')
|
||||
return True
|
||||
|
||||
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
|
||||
if options.resolve:
|
||||
item.link = req['url']
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -265,10 +246,7 @@ def ItemBefore(item, options):
|
||||
|
||||
def ItemAfter(item, options):
|
||||
if options.clip and item.desc and item.content:
|
||||
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
|
||||
del item.desc
|
||||
|
||||
if not options.keep and not options.proxy:
|
||||
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content
|
||||
del item.desc
|
||||
|
||||
if options.nolink and item.content:
|
||||
@@ -276,7 +254,7 @@ def ItemAfter(item, options):
|
||||
for link in content.xpath('//a'):
|
||||
log(link.text_content())
|
||||
link.drop_tag()
|
||||
item.content = lxml.etree.tostring(content)
|
||||
item.content = lxml.etree.tostring(content, method='html')
|
||||
|
||||
if options.noref:
|
||||
item.link = ''
|
||||
@@ -285,71 +263,51 @@ def ItemAfter(item, options):
|
||||
|
||||
|
||||
def FeedFetch(url, options):
|
||||
# basic url clean-up
|
||||
if url is None:
|
||||
raise MorssException('No url provided')
|
||||
|
||||
if urlparse(url).scheme not in PROTOCOL:
|
||||
url = 'http://' + url
|
||||
log(url)
|
||||
|
||||
url = url.replace(' ', '%20')
|
||||
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode()
|
||||
|
||||
# allow for code execution for feedify
|
||||
pre = feedify.pre_worker(url)
|
||||
if pre:
|
||||
url = pre
|
||||
log('url redirect')
|
||||
log(url)
|
||||
|
||||
# fetch feed
|
||||
delay = DELAY
|
||||
|
||||
if options.theforce:
|
||||
if options.force:
|
||||
delay = 0
|
||||
|
||||
try:
|
||||
con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
|
||||
encoding=options.encoding, basic=not options.items) \
|
||||
.open(url, timeout=TIMEOUT * 2)
|
||||
xml = con.read()
|
||||
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
|
||||
|
||||
except (IOError, HTTPException):
|
||||
raise MorssException('Error downloading feed')
|
||||
|
||||
contenttype = con.info().get('Content-Type', '').split(';')[0]
|
||||
|
||||
if options.items:
|
||||
# using custom rules
|
||||
rss = feeds.FeedHTML(xml, url, contenttype)
|
||||
|
||||
ruleset = {}
|
||||
|
||||
rss.rules['items'] = options.items
|
||||
ruleset['items'] = options.items
|
||||
|
||||
ruleset['title'] = options.get('title', '//head/title')
|
||||
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
|
||||
|
||||
ruleset['item_title'] = options.get('item_title', '.')
|
||||
ruleset['item_link'] = options.get('item_link', './@href|.//a/@href|ancestor::a/@href')
|
||||
|
||||
if options.item_title:
|
||||
rss.rules['item_title'] = options.item_title
|
||||
if options.item_link:
|
||||
rss.rules['item_link'] = options.item_link
|
||||
if options.item_content:
|
||||
rss.rules['item_content'] = options.item_content
|
||||
ruleset['item_content'] = options.item_content
|
||||
|
||||
if options.item_time:
|
||||
rss.rules['item_time'] = options.item_time
|
||||
ruleset['item_time'] = options.item_time
|
||||
|
||||
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
|
||||
rss = rss.convert(feeds.FeedXML)
|
||||
|
||||
else:
|
||||
try:
|
||||
rss = feeds.parse(xml, url, contenttype)
|
||||
rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
|
||||
rss = rss.convert(feeds.FeedXML)
|
||||
# contains all fields, otherwise much-needed data can be lost
|
||||
|
||||
except TypeError:
|
||||
log('random page')
|
||||
log(contenttype)
|
||||
log(req['contenttype'])
|
||||
raise MorssException('Link provided is not a valid feed')
|
||||
|
||||
return rss
|
||||
return req['url'], rss
|
||||
|
||||
|
||||
def FeedGather(rss, url, options):
|
||||
@@ -361,42 +319,37 @@ def FeedGather(rss, url, options):
|
||||
lim_time = LIM_TIME
|
||||
max_item = MAX_ITEM
|
||||
max_time = MAX_TIME
|
||||
threads = THREADS
|
||||
|
||||
if options.cache:
|
||||
max_time = 0
|
||||
|
||||
if options.mono:
|
||||
threads = 1
|
||||
if options.newest:
|
||||
# :newest take the newest items
|
||||
now = datetime.now(tz.tzutc())
|
||||
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
|
||||
|
||||
# set
|
||||
def runner(queue):
|
||||
while True:
|
||||
value = queue.get()
|
||||
try:
|
||||
worker(*value)
|
||||
except Exception as e:
|
||||
log('Thread Error: %s' % e.message)
|
||||
queue.task_done()
|
||||
else:
|
||||
# default behavior, take the first items (in appearing order)
|
||||
sorted_items = list(rss.items)
|
||||
|
||||
def worker(i, item):
|
||||
for i, item in enumerate(sorted_items):
|
||||
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
|
||||
log('dropped')
|
||||
item.remove()
|
||||
return
|
||||
continue
|
||||
|
||||
item = ItemBefore(item, options)
|
||||
|
||||
if item is None:
|
||||
return
|
||||
continue
|
||||
|
||||
item = ItemFix(item, url)
|
||||
item = ItemFix(item, options, url)
|
||||
|
||||
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
||||
if not options.proxy:
|
||||
if ItemFill(item, options, url, True) is False:
|
||||
item.remove()
|
||||
return
|
||||
continue
|
||||
|
||||
else:
|
||||
if not options.proxy:
|
||||
@@ -404,22 +357,6 @@ def FeedGather(rss, url, options):
|
||||
|
||||
item = ItemAfter(item, options)
|
||||
|
||||
queue = Queue()
|
||||
|
||||
for i in range(threads):
|
||||
t = threading.Thread(target=runner, args=(queue,))
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
for i, item in enumerate(list(rss.items)):
|
||||
if threads == 1:
|
||||
worker(*[i, item])
|
||||
else:
|
||||
queue.put([i, item])
|
||||
|
||||
if threads != 1:
|
||||
queue.join()
|
||||
|
||||
if options.ad:
|
||||
new = rss.items.append()
|
||||
new.title = "Are you hungry?"
|
||||
@@ -433,37 +370,38 @@ def FeedGather(rss, url, options):
|
||||
return rss
|
||||
|
||||
|
||||
def FeedFormat(rss, options):
|
||||
def FeedFormat(rss, options, encoding='utf-8'):
|
||||
if options.callback:
|
||||
if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
|
||||
return '%s(%s)' % (options.callback, rss.tojson())
|
||||
out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
|
||||
return out if encoding == 'unicode' else out.encode(encoding)
|
||||
|
||||
else:
|
||||
raise MorssException('Invalid callback var name')
|
||||
|
||||
elif options.json:
|
||||
elif options.format == 'json':
|
||||
if options.indent:
|
||||
return rss.tojson(encoding='UTF-8', indent=4)
|
||||
return rss.tojson(encoding=encoding, indent=4)
|
||||
|
||||
else:
|
||||
return rss.tojson(encoding='UTF-8')
|
||||
return rss.tojson(encoding=encoding)
|
||||
|
||||
elif options.csv:
|
||||
return rss.tocsv(encoding='UTF-8')
|
||||
elif options.format == 'csv':
|
||||
return rss.tocsv(encoding=encoding)
|
||||
|
||||
elif options.reader:
|
||||
elif options.format == 'html':
|
||||
if options.indent:
|
||||
return rss.tohtml(encoding='UTF-8', pretty_print=True)
|
||||
return rss.tohtml(encoding=encoding, pretty_print=True)
|
||||
|
||||
else:
|
||||
return rss.tohtml(encoding='UTF-8')
|
||||
return rss.tohtml(encoding=encoding)
|
||||
|
||||
else:
|
||||
else: # i.e. format == 'rss'
|
||||
if options.indent:
|
||||
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
|
||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
|
||||
|
||||
else:
|
||||
return rss.torss(xml_declaration=True, encoding='UTF-8')
|
||||
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
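An illustrative call into the reworked FeedFormat (the rss object is assumed to be a feeds.FeedXML instance obtained from FeedGather):
# Sketch only: format selection now goes through options.format, and the
# caller picks the encoding ('unicode' returns a str, anything else bytes).
options = Options({'format': 'html', 'indent': True})
out = FeedFormat(rss, options, encoding='unicode')
print(out[:80])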
|
||||
|
||||
|
||||
def process(url, cache=None, options=None):
|
||||
@@ -473,189 +411,9 @@ def process(url, cache=None, options=None):
|
||||
options = Options(options)
|
||||
|
||||
if cache:
|
||||
crawler.default_cache = crawler.SQLiteCache(cache)
|
||||
caching.default_cache = caching.SQLiteCache(cache)
|
||||
|
||||
rss = FeedFetch(url, options)
|
||||
url, rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
|
||||
return FeedFormat(rss, options)
|
||||
|
||||
|
||||
def cgi_app(environ, start_response):
|
||||
# get options
|
||||
if 'REQUEST_URI' in environ:
|
||||
url = environ['REQUEST_URI'][1:]
|
||||
else:
|
||||
url = environ['PATH_INFO'][1:]
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
|
||||
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
split = url.split('/', 1)
|
||||
|
||||
options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
|
||||
|
||||
if len(split) > 1:
|
||||
url = split[1]
|
||||
else:
|
||||
url = ''
|
||||
|
||||
else:
|
||||
options = []
|
||||
|
||||
# init
|
||||
options = Options(filterOptions(parseOptions(options)))
|
||||
headers = {}
|
||||
|
||||
global DEBUG
|
||||
DEBUG = options.debug
|
||||
|
||||
# headers
|
||||
headers['status'] = '200 OK'
|
||||
headers['cache-control'] = 'max-age=%s' % DELAY
|
||||
|
||||
if options.cors:
|
||||
headers['access-control-allow-origin'] = '*'
|
||||
|
||||
if options.html or options.reader:
|
||||
headers['content-type'] = 'text/html'
|
||||
elif options.txt or options.silent:
|
||||
headers['content-type'] = 'text/plain'
|
||||
elif options.json:
|
||||
headers['content-type'] = 'application/json'
|
||||
elif options.callback:
|
||||
headers['content-type'] = 'application/javascript'
|
||||
elif options.csv:
|
||||
headers['content-type'] = 'text/csv'
|
||||
headers['content-disposition'] = 'attachment; filename="feed.csv"'
|
||||
else:
|
||||
headers['content-type'] = 'text/xml'
|
||||
|
||||
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
|
||||
|
||||
# get the work done
|
||||
rss = FeedFetch(url, options)
|
||||
|
||||
if headers['content-type'] == 'text/xml':
|
||||
headers['content-type'] = rss.mimetype[0]
|
||||
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options)
|
||||
|
||||
if not options.silent:
|
||||
return out
|
||||
|
||||
|
||||
def cgi_wrapper(environ, start_response):
|
||||
# simple http server for html and css
|
||||
files = {
|
||||
'': 'text/html',
|
||||
'index.html': 'text/html'}
|
||||
|
||||
if 'REQUEST_URI' in environ:
|
||||
url = environ['REQUEST_URI'][1:]
|
||||
else:
|
||||
url = environ['PATH_INFO'][1:]
|
||||
|
||||
if url in files:
|
||||
headers = {}
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
if '--root' in sys.argv[1:]:
|
||||
path = os.path.join(sys.argv[-1], url)
|
||||
|
||||
else:
|
||||
path = url
|
||||
|
||||
try:
|
||||
body = open(path, 'rb').read()
|
||||
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = files[url]
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [body]
|
||||
|
||||
except IOError:
|
||||
headers['status'] = '404 Not found'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return ['Error %s' % headers['status']]
|
||||
|
||||
# actual morss use
|
||||
try:
|
||||
return [cgi_app(environ, start_response) or '(empty)']
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
except Exception as e:
|
||||
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR <%s>: %s' % (url, e.message), force=True)
|
||||
return ['An error happened:\n%s' % e.message]
|
||||
|
||||
|
||||
def cli_app():
|
||||
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
|
||||
url = sys.argv[-1]
|
||||
|
||||
global DEBUG
|
||||
DEBUG = options.debug
|
||||
|
||||
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
|
||||
|
||||
rss = FeedFetch(url, options)
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options)
|
||||
|
||||
if not options.silent:
|
||||
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
|
||||
|
||||
log('done')
|
||||
|
||||
|
||||
def isInt(string):
|
||||
try:
|
||||
int(string)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def main():
|
||||
if 'REQUEST_URI' in os.environ:
|
||||
# mod_cgi
|
||||
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
|
||||
|
||||
elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
|
||||
# start internal (basic) http server
|
||||
|
||||
if len(sys.argv) > 1 and isInt(sys.argv[1]):
|
||||
argPort = int(sys.argv[1])
|
||||
if argPort > 0:
|
||||
port = argPort
|
||||
else:
|
||||
raise MorssException('Port must be a positive integer')
|
||||
|
||||
else:
|
||||
port = PORT
|
||||
|
||||
print('Serving http://localhost:%s/'%port)
|
||||
httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
|
||||
httpd.serve_forever()
|
||||
|
||||
else:
|
||||
# as a CLI app
|
||||
try:
|
||||
cli_app()
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
except Exception as e:
|
||||
print('ERROR: %s' % e.message)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
return FeedFormat(rss, options, 'unicode')
|
||||
|
@@ -1,13 +1,35 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def parse(data, encoding=None):
|
||||
if encoding:
|
||||
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
|
||||
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
|
||||
|
||||
else:
|
||||
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
|
||||
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
||||
|
||||
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
|
||||
|
||||
return lxml.html.fromstring(data, parser=parser)
|
||||
|
||||
@@ -60,9 +82,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
|
||||
regex_good = re.compile('|'.join(class_good), re.I)
|
||||
|
||||
|
||||
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
|
||||
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
|
||||
'button', 'footer']
|
||||
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
|
||||
|
||||
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
|
||||
'form', 'input', 'textarea', 'button', 'footer']
|
||||
|
||||
tags_bad = tags_junk + ['a', 'aside']
|
||||
|
||||
@@ -90,13 +113,24 @@ def score_node(node):
|
||||
" Score individual node "
|
||||
|
||||
score = 0
|
||||
class_id = node.get('class', '') + node.get('id', '')
|
||||
class_id = (node.get('class') or '') + (node.get('id') or '')
|
||||
|
||||
if (isinstance(node, lxml.html.HtmlComment)
|
||||
or node.tag in tags_bad
|
||||
or regex_bad.search(class_id)):
|
||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||
return 0
|
||||
|
||||
if node.tag in tags_dangerous:
|
||||
return 0
|
||||
|
||||
if node.tag in tags_junk:
|
||||
score += -1 # actually -2, as tags_junk is included in tags_bad
|
||||
|
||||
if node.tag in tags_bad:
|
||||
score += -1
|
||||
|
||||
if regex_bad.search(class_id):
|
||||
score += -1
|
||||
|
||||
if node.tag in tags_good:
|
||||
score += 4
|
||||
|
||||
@@ -109,38 +143,47 @@ def score_node(node):
|
||||
|
||||
if wc != 0:
|
||||
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
|
||||
score = score * ( 1 - float(wca)/wc )
|
||||
score = score * ( 1 - 2 * float(wca)/wc )
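# link-density penalty: e.g. if half of the node's words sit inside <a> tags, the score drops to 0; link-heavier nodes go negative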
|
||||
|
||||
return score
|
||||
|
||||
|
||||
def score_all(node, grades=None):
|
||||
def score_all(node):
|
||||
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
|
||||
|
||||
if grades is None:
|
||||
grades = {}
|
||||
|
||||
for child in node:
|
||||
score = score_node(child)
|
||||
child.attrib['seen'] = 'yes, ' + str(int(score))
|
||||
child.attrib['morss_own_score'] = str(float(score))
|
||||
|
||||
if score > 0:
|
||||
spread_score(child, score, grades)
|
||||
score_all(child, grades)
|
||||
|
||||
return grades
|
||||
if score > 0 or len(list(child.iterancestors())) <= 2:
|
||||
spread_score(child, score)
|
||||
score_all(child)
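# note: children with at most 2 ancestors (i.e. close to <html/>) are always descended into, even when their own score is 0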
|
||||
|
||||
|
||||
def spread_score(node, score, grades):
|
||||
def set_score(node, value):
|
||||
node.attrib['morss_score'] = str(float(value))
|
||||
|
||||
|
||||
def get_score(node):
|
||||
return float(node.attrib.get('morss_score', 0))
|
||||
|
||||
|
||||
def incr_score(node, delta):
|
||||
set_score(node, get_score(node) + delta)
|
||||
|
||||
|
||||
def get_all_scores(node):
|
||||
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
|
||||
|
||||
|
||||
def spread_score(node, score):
|
||||
" Spread the node's score to its parents, in a linear way "
|
||||
|
||||
delta = score / 2
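# half of the score is shed at each step up the tree: e.g. a node scoring 6 adds 6 to itself, 3 to its parent, then stops once the remaining score falls below 1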
|
||||
|
||||
for ancestor in [node,] + list(node.iterancestors()):
|
||||
if score >= 1 or ancestor is node:
|
||||
try:
|
||||
grades[ancestor] += score
|
||||
except KeyError:
|
||||
grades[ancestor] = score
|
||||
incr_score(ancestor, score)
|
||||
|
||||
score -= delta
|
||||
|
||||
@@ -148,26 +191,29 @@ def spread_score(node, score, grades):
|
||||
break
|
||||
|
||||
|
||||
def write_score_all(root, grades):
|
||||
" Useful for debugging "
|
||||
|
||||
for node in root.iter():
|
||||
node.attrib['score'] = str(int(grades.get(node, 0)))
|
||||
|
||||
|
||||
def clean_root(root):
|
||||
def clean_root(root, keep_threshold=None):
|
||||
for node in list(root):
|
||||
clean_root(node)
|
||||
clean_node(node)
|
||||
# bottom-up approach, i.e. starting with children before cleaning current node
|
||||
clean_root(node, keep_threshold)
|
||||
clean_node(node, keep_threshold)
|
||||
|
||||
|
||||
def clean_node(node):
|
||||
def clean_node(node, keep_threshold=None):
|
||||
parent = node.getparent()
|
||||
|
||||
if parent is None:
|
||||
# this is <html/> (or a removed element waiting for GC)
|
||||
return
|
||||
|
||||
# remove dangerous tags, no matter what
|
||||
if node.tag in tags_dangerous:
|
||||
parent.remove(node)
|
||||
return
|
||||
|
||||
# high score, so keep
|
||||
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
|
||||
return
|
||||
|
||||
gdparent = parent.getparent()
|
||||
|
||||
# remove shitty tags
|
||||
@@ -248,59 +294,91 @@ def clean_node(node):
|
||||
gdparent.insert(gdparent.index(parent)+1, new_node)
|
||||
|
||||
|
||||
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
||||
ancestorsA = list(nodeA.iterancestors())
|
||||
ancestorsB = list(nodeB.iterancestors())
|
||||
def lowest_common_ancestor(node_a, node_b, max_depth=None):
|
||||
ancestors_a = list(node_a.iterancestors())
|
||||
ancestors_b = list(node_b.iterancestors())
|
||||
|
||||
if max_depth is not None:
|
||||
ancestorsA = ancestorsA[:max_depth]
|
||||
ancestorsB = ancestorsB[:max_depth]
|
||||
ancestors_a = ancestors_a[:max_depth]
|
||||
ancestors_b = ancestors_b[:max_depth]
|
||||
|
||||
ancestorsA.insert(0, nodeA)
|
||||
ancestorsB.insert(0, nodeB)
|
||||
ancestors_a.insert(0, node_a)
|
||||
ancestors_b.insert(0, node_b)
|
||||
|
||||
for ancestorA in ancestorsA:
|
||||
if ancestorA in ancestorsB:
|
||||
return ancestorA
|
||||
for ancestor_a in ancestors_a:
|
||||
if ancestor_a in ancestors_b:
|
||||
return ancestor_a
|
||||
|
||||
return nodeA # should always find one though, at least <html/>, but needed for max_depth
|
||||
return node_a # should always find one though, at least <html/>, but needed for max_depth
|
||||
|
||||
|
||||
def rank_nodes(grades):
|
||||
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
|
||||
def get_best_node(html, threshold=5):
|
||||
# score all nodes
|
||||
score_all(html)
|
||||
|
||||
# rank all nodes (largest to smallest)
|
||||
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
|
||||
|
||||
def get_best_node(grades):
|
||||
" To pick the best (raw) node. Another function will clean it "
|
||||
|
||||
if len(grades) == 1:
|
||||
return grades[0]
|
||||
|
||||
top = rank_nodes(grades)
|
||||
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
|
||||
|
||||
return lowest
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding=None):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding)
|
||||
scores = score_all(html)
|
||||
|
||||
if not len(scores):
|
||||
# minimum threshold
|
||||
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
|
||||
return None
|
||||
|
||||
best = get_best_node(scores)
|
||||
# take the common ancestor of the two highest rated nodes
|
||||
if len(ranked_nodes) > 1:
|
||||
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
|
||||
|
||||
else:
|
||||
best = ranked_nodes[0]
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
|
||||
" Input a raw html string, returns a raw html string of the article "
|
||||
|
||||
html = parse(data, encoding_in)
|
||||
|
||||
if xpath is not None:
|
||||
xpath_match = html.xpath(xpath)
|
||||
|
||||
if len(xpath_match):
|
||||
best = xpath_match[0]
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
else:
|
||||
best = get_best_node(html, threshold)
|
||||
|
||||
# clean up
|
||||
if not debug:
|
||||
keep_threshold = get_score(best) * 3/4
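# i.e. descendants scoring at least 3/4 of the best node's own score survive the clean-up below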
|
||||
clean_root(best, keep_threshold)
|
||||
|
||||
# check for spammy content (links only)
|
||||
wc = count_words(best.text_content())
|
||||
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
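# reject the extraction when fewer than 50 words remain outside links, or when links account for over 30% of the words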
|
||||
|
||||
if wc - wca < 50 or float(wca) / wc > 0.3:
|
||||
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
|
||||
return None
|
||||
|
||||
# fix urls
|
||||
if url:
|
||||
best.make_links_absolute(url)
|
||||
|
||||
clean_root(best)
|
||||
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)
|
||||
|
||||
return lxml.etree.tostring(best, pretty_print=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
from . import crawler
|
||||
|
||||
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
|
||||
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
|
||||
|
||||
if sys.flags.interactive:
|
||||
print('>>> Interactive shell: try using `article`')
|
||||
|
||||
else:
|
||||
print(article)
|
||||
|
@@ -1,210 +0,0 @@
|
||||
@require(feed)
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>@feed.title – via morss</title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="description" content="@feed.desc (via morss)" />
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||
|
||||
<style type="text/css">
|
||||
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
#content {
|
||||
width: 100%;
|
||||
max-width: 1140px;
|
||||
min-width: 755px;
|
||||
margin: 0 auto;
|
||||
overflow: hidden;
|
||||
|
||||
padding-top: 20px;
|
||||
padding-left: 20px; /* grid-space to left */
|
||||
padding-right: 0px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-20px=0 */
|
||||
}
|
||||
|
||||
.item {
|
||||
width: 33.33%;
|
||||
float: left;
|
||||
padding-right: 20px; /* column-space */
|
||||
}
|
||||
|
||||
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
|
||||
#content {
|
||||
width: 100%;
|
||||
min-width: 0;
|
||||
margin-left: 0px;
|
||||
margin-right: 0px;
|
||||
padding-left: 20px; /* grid-space to left */
|
||||
padding-right: 10px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-10px=10px */
|
||||
}
|
||||
|
||||
.item {
|
||||
width: auto;
|
||||
float: none;
|
||||
margin-left: 0px;
|
||||
margin-right: 0px;
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
padding-left: 0px;
|
||||
padding-right: 10px; /* column-space */
|
||||
}
|
||||
}
|
||||
|
||||
/* design */
|
||||
|
||||
#header h1, #header h2, #header p {
|
||||
font-family: sans;
|
||||
text-align: center;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
#header h1 {
|
||||
font-size: 2.5em;
|
||||
font-weight: bold;
|
||||
padding: 1em 0 0.25em;
|
||||
}
|
||||
|
||||
#header h2 {
|
||||
font-size: 1em;
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
#header p {
|
||||
color: gray;
|
||||
font-style: italic;
|
||||
font-size: 0.75em;
|
||||
}
|
||||
|
||||
#content {
|
||||
text-align: justify;
|
||||
}
|
||||
|
||||
.item .title {
|
||||
font-weight: bold;
|
||||
display: block;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.item .link {
|
||||
color: inherit;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.item:not(.active) {
|
||||
cursor: pointer;
|
||||
|
||||
height: 20em;
|
||||
margin-bottom: 20px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
|
||||
padding: 0.25em;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.item:not(.active) .title {
|
||||
padding-bottom: 0.1em;
|
||||
margin-bottom: 0.1em;
|
||||
border-bottom: 1px solid silver;
|
||||
}
|
||||
|
||||
.item:not(.active):before {
|
||||
content: " ";
|
||||
display: block;
|
||||
width: 100%;
|
||||
position: absolute;
|
||||
top: 18.5em;
|
||||
height: 1.5em;
|
||||
background: linear-gradient(to bottom, rgba(255,255,255,0) 0%, rgba(255,255,255,1) 100%);
|
||||
}
|
||||
|
||||
.item:not(.active) .article * {
|
||||
max-width: 100%;
|
||||
font-size: 1em !important;
|
||||
font-weight: normal;
|
||||
display: inline;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.item.active {
|
||||
background: white;
|
||||
position: fixed;
|
||||
overflow: auto;
|
||||
top: 0;
|
||||
left: 0;
|
||||
height: 100%;
|
||||
width: 100%;
|
||||
z-index: 1;
|
||||
}
|
||||
|
||||
body.noscroll {
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.item.active > * {
|
||||
max-width: 700px;
|
||||
margin: auto;
|
||||
}
|
||||
|
||||
.item.active .title {
|
||||
font-size: 2em;
|
||||
padding: 0.5em 0;
|
||||
}
|
||||
|
||||
.item.active .article object,
|
||||
.item.active .article video,
|
||||
.item.active .article audio {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.item.active .article img {
|
||||
max-height: 20em;
|
||||
max-width: 100%;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="header">
|
||||
<h1>@feed.title</h1>
|
||||
@if feed.desc:
|
||||
<h2>@feed.desc</h2>
|
||||
@end
|
||||
<p>- via morss</p>
|
||||
</div>
|
||||
|
||||
<div id="content">
|
||||
@for item in feed.items:
|
||||
<div class="item">
|
||||
@if item.link:
|
||||
<a class="title link" href="@item.link" target="_blank">@item.title</a>
|
||||
@else:
|
||||
<span class="title">@item.title</span>
|
||||
@end
|
||||
<div class="article">
|
||||
@if item.content:
|
||||
@item.content
|
||||
@else:
|
||||
@item.desc
|
||||
@end
|
||||
</div>
|
||||
</div>
|
||||
@end
|
||||
</div>
|
||||
|
||||
<script>
|
||||
var items = document.getElementsByClassName('item')
|
||||
for (var i in items)
|
||||
items[i].onclick = function()
|
||||
{
|
||||
this.classList.toggle('active')
|
||||
document.body.classList.toggle('noscroll')
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
298
morss/wsgi.py
Normal file
@@ -0,0 +1,298 @@
|
||||
# This file is part of morss
|
||||
#
|
||||
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU Affero General Public License as published by the Free
|
||||
# Software Foundation, either version 3 of the License, or (at your option) any
|
||||
# later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
||||
# details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License along
|
||||
# with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
import cgitb
|
||||
import mimetypes
|
||||
import os.path
|
||||
import re
|
||||
import sys
|
||||
import wsgiref.handlers
|
||||
import wsgiref.simple_server
|
||||
import wsgiref.util
|
||||
|
||||
import lxml.etree
|
||||
|
||||
try:
|
||||
# python 2
|
||||
from urllib import unquote
|
||||
except ImportError:
|
||||
# python 3
|
||||
from urllib.parse import unquote
|
||||
|
||||
from . import caching, crawler, readabilite
|
||||
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
|
||||
MorssException, Options, log)
|
||||
|
||||
PORT = int(os.getenv('PORT', 8080))
|
||||
|
||||
|
||||
def parse_options(options):
|
||||
""" Turns ['md=True'] into {'md':True} """
|
||||
out = {}
|
||||
|
||||
for option in options:
|
||||
split = option.split('=', 1)
|
||||
|
||||
if len(split) > 1:
|
||||
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
|
||||
|
||||
else:
|
||||
out[split[0]] = True
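# bare flags (no '=') simply map to True, e.g. 'cors' -> {'cors': True}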
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def request_uri(environ):
|
||||
if 'REQUEST_URI' in environ:
|
||||
# when running on Apache/uwsgi
|
||||
url = environ['REQUEST_URI']
|
||||
|
||||
elif 'RAW_URI' in environ:
|
||||
# gunicorn
|
||||
url = environ['RAW_URI']
|
||||
|
||||
else:
|
||||
# when using other servers
|
||||
url = environ['PATH_INFO']
|
||||
|
||||
if environ['QUERY_STRING']:
|
||||
url += '?' + environ['QUERY_STRING']
|
||||
|
||||
return url
|
||||
|
||||
|
||||
def cgi_parse_environ(environ):
|
||||
# get options
|
||||
|
||||
url = request_uri(environ)[1:]
|
||||
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
|
||||
|
||||
if url.startswith(':'):
|
||||
parts = url.split('/', 1)
|
||||
raw_options = parts[0].split(':')[1:]
|
||||
url = parts[1] if len(parts) > 1 else ''
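# e.g. ':csv:cors/https://example.com/feed.xml' yields raw_options=['csv', 'cors'] and url='https://example.com/feed.xml'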
|
||||
|
||||
else:
|
||||
raw_options = []
|
||||
|
||||
# init
|
||||
options = Options(parse_options(raw_options))
|
||||
|
||||
return (url, options)
|
||||
|
||||
|
||||
def cgi_app(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
headers = {}
|
||||
|
||||
# headers
|
||||
headers['status'] = '200 OK'
|
||||
headers['cache-control'] = 'max-age=%s' % DELAY
|
||||
headers['x-content-type-options'] = 'nosniff' # safari work around
|
||||
|
||||
if options.cors:
|
||||
headers['access-control-allow-origin'] = '*'
|
||||
|
||||
if options.format == 'html':
|
||||
headers['content-type'] = 'text/html'
|
||||
elif options.txt or options.silent:
|
||||
headers['content-type'] = 'text/plain'
|
||||
elif options.format == 'json':
|
||||
headers['content-type'] = 'application/json'
|
||||
elif options.callback:
|
||||
headers['content-type'] = 'application/javascript'
|
||||
elif options.format == 'csv':
|
||||
headers['content-type'] = 'text/csv'
|
||||
headers['content-disposition'] = 'attachment; filename="feed.csv"'
|
||||
else:
|
||||
headers['content-type'] = 'text/xml'
|
||||
|
||||
headers['content-type'] += '; charset=utf-8'
|
||||
|
||||
# get the work done
|
||||
url, rss = FeedFetch(url, options)
|
||||
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
|
||||
rss = FeedGather(rss, url, options)
|
||||
out = FeedFormat(rss, options)
|
||||
|
||||
if options.silent:
|
||||
return ['']
|
||||
|
||||
else:
|
||||
return [out]
|
||||
|
||||
|
||||
def middleware(func):
|
||||
" Decorator to turn a function into a wsgi middleware "
|
||||
# This is called when parsing the "@middleware" code
|
||||
|
||||
def app_builder(app):
|
||||
# This is called when doing app = cgi_wrapper(app)
|
||||
|
||||
def app_wrap(environ, start_response):
|
||||
# This is called when a http request is being processed
|
||||
|
||||
return func(environ, start_response, app)
|
||||
|
||||
return app_wrap
|
||||
|
||||
return app_builder
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_file_handler(environ, start_response, app):
|
||||
" Simple HTTP server to serve static files (.html, .css, etc.) "
|
||||
|
||||
url = request_uri(environ)[1:]
|
||||
|
||||
if url == '':
|
||||
url = 'index.html'
|
||||
|
||||
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
|
||||
# if it is a legitimate url (no funny relative paths)
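# e.g. 'index.html' or 'css/style.css' pass, while '../something' is rejected since path segments may not start with a dot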
|
||||
paths = [
|
||||
os.path.join(sys.prefix, 'share/morss/www', url),
|
||||
os.path.join(os.path.dirname(__file__), '../www', url)
|
||||
]
|
||||
|
||||
for path in paths:
|
||||
try:
|
||||
f = open(path, 'rb')
|
||||
|
||||
except IOError:
|
||||
# problem with file (cannot open or not found)
|
||||
continue
|
||||
|
||||
else:
|
||||
# file successfully opened
|
||||
headers = {}
|
||||
headers['status'] = '200 OK'
|
||||
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return wsgiref.util.FileWrapper(f)
|
||||
|
||||
# regex didn't validate or no file found
|
||||
return app(environ, start_response)
|
||||
|
||||
|
||||
def cgi_get(environ, start_response):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
# get page
|
||||
req = crawler.adv_get(url=url, timeout=TIMEOUT)
|
||||
|
||||
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
|
||||
if options['get'] == 'page':
|
||||
html = readabilite.parse(req['data'], encoding=req['encoding'])
|
||||
html.make_links_absolute(req['url'])
|
||||
|
||||
kill_tags = ['script', 'iframe', 'noscript']
|
||||
|
||||
for tag in kill_tags:
|
||||
for elem in html.xpath('//'+tag):
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
|
||||
|
||||
elif options['get'] == 'article':
|
||||
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
|
||||
|
||||
else:
|
||||
raise MorssException('no :get option passed')
|
||||
|
||||
else:
|
||||
output = req['data']
|
||||
|
||||
# return html page
|
||||
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
|
||||
start_response(headers['status'], list(headers.items()))
|
||||
return [output]
|
||||
|
||||
|
||||
dispatch_table = {
|
||||
'get': cgi_get,
|
||||
}
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_dispatcher(environ, start_response, app):
|
||||
url, options = cgi_parse_environ(environ)
|
||||
|
||||
for key in dispatch_table.keys():
|
||||
if key in options:
|
||||
return dispatch_table[key](environ, start_response)
|
||||
|
||||
return app(environ, start_response)
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_error_handler(environ, start_response, app):
|
||||
try:
|
||||
return app(environ, start_response)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
headers = {'status': '500 Oops', 'content-type': 'text/html'}
|
||||
start_response(headers['status'], list(headers.items()), sys.exc_info())
|
||||
log('ERROR: %s' % repr(e))
|
||||
return [cgitb.html(sys.exc_info())]
|
||||
|
||||
|
||||
@middleware
|
||||
def cgi_encode(environ, start_response, app):
|
||||
out = app(environ, start_response)
|
||||
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
|
||||
|
||||
|
||||
application = cgi_app
|
||||
application = cgi_file_handler(application)
|
||||
application = cgi_dispatcher(application)
|
||||
application = cgi_error_handler(application)
|
||||
application = cgi_encode(application)
|
||||
|
||||
|
||||
def cgi_handle_request():
|
||||
app = cgi_app
|
||||
app = cgi_dispatcher(app)
|
||||
app = cgi_error_handler(app)
|
||||
app = cgi_encode(app)
|
||||
|
||||
wsgiref.handlers.CGIHandler().run(app)
|
||||
|
||||
|
||||
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
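# wsgiref's bundled handler does not expose REQUEST_URI; forward the raw request path so request_uri() sees the full url (options included)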
|
||||
def get_environ(self):
|
||||
env = super().get_environ()
|
||||
env['REQUEST_URI'] = self.path
|
||||
return env
|
||||
|
||||
|
||||
def cgi_start_server():
|
||||
caching.default_cache.autotrim()
|
||||
|
||||
print('Serving http://localhost:%s/' % PORT)
|
||||
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
|
||||
httpd.serve_forever()
|
||||
|
||||
|
||||
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
|
||||
caching.default_cache.autotrim()
|
@@ -1,4 +0,0 @@
|
||||
lxml
|
||||
python-dateutil <= 1.5
|
||||
chardet
|
||||
pymysql
|
34
setup.py
@@ -1,14 +1,26 @@
|
||||
from setuptools import setup, find_packages
|
||||
from glob import glob
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
package_name = 'morss'
|
||||
|
||||
setup(
|
||||
name=package_name,
|
||||
description='Get full-text RSS feeds',
|
||||
author='pictuga, Samuel Marks',
|
||||
author_email='contact at pictuga dot com',
|
||||
url='http://morss.it/',
|
||||
license='AGPL v3',
|
||||
package_dir={package_name: package_name},
|
||||
packages=find_packages(),
|
||||
package_data={package_name: ['feedify.ini', 'reader.html.template']},
|
||||
test_suite=package_name + '.tests')
|
||||
name = package_name,
|
||||
description = 'Get full-text RSS feeds',
|
||||
author = 'pictuga, Samuel Marks',
|
||||
author_email = 'contact at pictuga dot com',
|
||||
url = 'http://morss.it/',
|
||||
download_url = 'https://git.pictuga.com/pictuga/morss',
|
||||
license = 'AGPL v3',
|
||||
packages = [package_name],
|
||||
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
|
||||
extras_require = {'full': ['pymysql', 'redis']},
|
||||
package_data = {package_name: ['feedify.ini']},
|
||||
data_files = [
|
||||
('share/' + package_name, ['README.md', 'LICENSE']),
|
||||
('share/' + package_name + '/www', glob('www/*.*')),
|
||||
('share/' + package_name + '/www/cgi', [])
|
||||
],
|
||||
entry_points = {
|
||||
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
|
||||
})
|
||||
|
@@ -4,6 +4,12 @@ ErrorDocument 403 "Access forbidden"
|
||||
ErrorDocument 404 /cgi/main.py
|
||||
ErrorDocument 500 "A very nasty bug found his way onto this very server"
|
||||
|
||||
# Uncomment below line to turn debug on for all requests
|
||||
#SetEnv DEBUG 1
|
||||
|
||||
# Uncomment below line to turn debug on for requests with :debug in the url
|
||||
#SetEnvIf Request_URI :debug DEBUG=1
|
||||
|
||||
<Files ~ "\.(py|pyc|db|log)$">
|
||||
deny from all
|
||||
</Files>
|
||||
|
@@ -4,6 +4,7 @@
|
||||
<title>morss</title>
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
|
||||
<style type="text/css">
|
||||
body
|
||||
{
|
||||
@@ -35,8 +36,8 @@
|
||||
<input type="text" id="url" name="url" placeholder="Feed url (http://example.com/feed.xml)" />
|
||||
</form>
|
||||
|
||||
<code>Copyright: pictuga 2013-2014<br/>
|
||||
Source code: https://github.com/pictuga/morss</code>
|
||||
<code>Copyright: pictuga 2013-2020<br/>
|
||||
Source code: https://git.pictuga.com/pictuga/morss</code>
|
||||
|
||||
<script>
|
||||
form = document.forms[0]
|
||||
|
17
www/logo.svg
Normal file
@@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg width="16" height="16" viewBox="0 0 16 16" shape-rendering="crispEdges" fill="black" version="1.1" xmlns="http://www.w3.org/2000/svg">
|
||||
<rect x="2" y="4" width="2" height="2" />
|
||||
<rect x="5" y="4" width="6" height="2" />
|
||||
<rect x="12" y="4" width="2" height="2" />
|
||||
|
||||
<rect x="2" y="7" width="2" height="2" />
|
||||
<rect x="7" y="7" width="2" height="2" />
|
||||
<rect x="12" y="7" width="2" height="2" />
|
||||
|
||||
<rect x="2" y="10" width="2" height="2" />
|
||||
<rect x="7" y="10" width="2" height="2" />
|
||||
<rect x="12" y="10" width="2" height="2" />
|
||||
</svg>
|
||||
|
||||
<!-- This work by pictuga is licensed under CC BY-NC-SA 4.0. To view a copy of
|
||||
this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0 -->
|
After Width: | Height: | Size: 735 B |
353
www/sheet.xsl
@@ -1,5 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<xsl:stylesheet version="1.1" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
<xsl:stylesheet version="1.1"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:atom03="http://purl.org/atom/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:rssfake="http://purl.org/rss/1.0/"
|
||||
>
|
||||
|
||||
<xsl:output method="html"/>
|
||||
|
||||
@@ -7,116 +14,288 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>RSS feed by morss</title>
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
|
||||
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
|
||||
<meta name="robots" content="noindex" />
|
||||
|
||||
<style type="text/css">
|
||||
body * {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
overflow-wrap: anywhere;
|
||||
word-wrap: anywhere;
|
||||
word-break: break-word;
|
||||
|
||||
font-family: sans-serif;
|
||||
|
||||
-webkit-tap-highlight-color: transparent; /* safari work around */
|
||||
}
|
||||
|
||||
#url {
|
||||
background-color: rgba(255, 165, 0, 0.25);
|
||||
padding: 1% 5%;
|
||||
display: inline-block;
|
||||
input, select {
|
||||
font-family: inherit;
|
||||
font-size: inherit;
|
||||
text-align: inherit;
|
||||
}
|
||||
|
||||
header {
|
||||
text-align: justify;
|
||||
text-align-last: center;
|
||||
border-bottom: 1px solid silver;
|
||||
}
|
||||
|
||||
.input-combo {
|
||||
display: flex;
|
||||
flex-flow: row;
|
||||
align-items: stretch;
|
||||
|
||||
width: 800px;
|
||||
max-width: 100%;
|
||||
}
|
||||
margin: auto;
|
||||
|
||||
body > ul {
|
||||
border: 1px solid grey;
|
||||
|
||||
padding: .5em .5em;
|
||||
background-color: #FFFAF4;
|
||||
}
|
||||
|
||||
.input-combo * {
|
||||
display: inline-block;
|
||||
line-height: 2em;
|
||||
border: 0;
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
.input-combo > :not(.button) {
|
||||
max-width: 100%;
|
||||
flex-grow: 1;
|
||||
flex-shrink: 0;
|
||||
|
||||
white-space: nowrap;
|
||||
text-overflow: ellipsis;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.input-combo .button {
|
||||
flex-grow: 0;
|
||||
flex-shrink: 1;
|
||||
|
||||
cursor: pointer;
|
||||
min-width: 2em;
|
||||
text-align: center;
|
||||
border-left: 1px solid silver;
|
||||
color: #06f;
|
||||
}
|
||||
|
||||
[onclick_title] {
|
||||
cursor: pointer;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
[onclick_title]::before {
|
||||
opacity: 0;
|
||||
|
||||
content: attr(onclick_title);
|
||||
font-weight: normal;
|
||||
|
||||
position: absolute;
|
||||
left: -300%;
|
||||
|
||||
z-index: 1;
|
||||
|
||||
background: grey;
|
||||
color: white;
|
||||
|
||||
border-radius: 0.5em;
|
||||
padding: 0 1em;
|
||||
}
|
||||
|
||||
[onclick_title]:not(:active)::before {
|
||||
transition: opacity 1s ease-in-out;
|
||||
}
|
||||
|
||||
[onclick_title]:active::before {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
header > form {
|
||||
margin: 1%;
|
||||
}
|
||||
|
||||
header a {
|
||||
text-decoration: inherit;
|
||||
color: #FF7B0A;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.item {
|
||||
background-color: #FFFAF4;
|
||||
border: 1px solid silver;
|
||||
margin: 1%;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.item > * {
|
||||
padding: 1%;
|
||||
}
|
||||
|
||||
.item > *:empty {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.item > :not(:last-child) {
|
||||
border-bottom: 1px solid silver;
|
||||
}
|
||||
|
||||
.item > a {
|
||||
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
font-size: 1.5em;
|
||||
}
|
||||
|
||||
.desc, .content {
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.desc *, .content * {
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
ul {
|
||||
list-style-type: none;
|
||||
}
|
||||
|
||||
.tag {
|
||||
color: darkred;
|
||||
}
|
||||
|
||||
.attr {
|
||||
color: darksalmon;
|
||||
}
|
||||
|
||||
.value {
|
||||
color: darkblue;
|
||||
}
|
||||
|
||||
.comment {
|
||||
color: lightgrey;
|
||||
}
|
||||
|
||||
pre {
|
||||
margin: 0;
|
||||
max-width: 100%;
|
||||
white-space: normal;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h1>RSS feed by morss</h1>
|
||||
<header>
|
||||
<h1>RSS feed by morss</h1>
|
||||
|
||||
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
|
||||
can enter the following url in your newsreader:</p>
|
||||
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
|
||||
can enter the following url in your newsreader:</p>
|
||||
|
||||
<div id="url"></div>
|
||||
<div class="input-combo">
|
||||
<input id="url" readonly="readonly"/>
|
||||
<span class="button" onclick="copy_link()" title="Copy" onclick_title="Copied">
|
||||
<svg width="16px" height="16px" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
|
||||
<path fill-rule="evenodd" d="M4 1.5H3a2 2 0 00-2 2V14a2 2 0 002 2h10a2 2 0 002-2V3.5a2 2 0 00-2-2h-1v1h1a1 1 0 011 1V14a1 1 0 01-1 1H3a1 1 0 01-1-1V3.5a1 1 0 011-1h1v-1z" clip-rule="evenodd"/>
|
||||
<path fill-rule="evenodd" d="M9.5 1h-3a.5.5 0 00-.5.5v1a.5.5 0 00.5.5h3a.5.5 0 00.5-.5v-1a.5.5 0 00-.5-.5zm-3-1A1.5 1.5 0 005 1.5v1A1.5 1.5 0 006.5 4h3A1.5 1.5 0 0011 2.5v-1A1.5 1.5 0 009.5 0h-3z" clip-rule="evenodd"/>
|
||||
</svg>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<ul>
|
||||
<xsl:apply-templates/>
|
||||
</ul>
|
||||
<form onchange="open_feed()">
|
||||
More options: Output the
|
||||
<select>
|
||||
<option value="">full-text</option>
|
||||
<option value=":proxy">original</option>
|
||||
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
|
||||
</select>
|
||||
feed as
|
||||
<select>
|
||||
<option value="">RSS</option>
|
||||
<option value=":format=json:cors">JSON</option>
|
||||
<option value=":format=html">HTML</option>
|
||||
<option value=":format=csv">CSV</option>
|
||||
</select>
|
||||
using the
|
||||
<select>
|
||||
<option value="">standard</option>
|
||||
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
|
||||
</select>
|
||||
link of the
|
||||
<select>
|
||||
<option value="">first</option>
|
||||
<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
|
||||
</select>
|
||||
items and
|
||||
<select>
|
||||
<option value="">keep</option>
|
||||
<option value=":nolink:noref">remove</option>
|
||||
</select>
|
||||
links
|
||||
<input type="hidden" value="" name="extra_options"/>
|
||||
</form>
|
||||
|
||||
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p>
|
||||
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
|
||||
</header>
|
||||
|
||||
<div id="header" dir="auto">
|
||||
<h1>
|
||||
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
|
||||
</h1>
|
||||
|
||||
<p>
|
||||
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:description|rss/channel/description|atom:feed/atom:subtitle|atom03:feed/atom03:subtitle"/>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div id="content">
|
||||
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
|
||||
<div class="item" dir="auto">
|
||||
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
|
||||
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
|
||||
</a>
|
||||
|
||||
<div class="desc">
|
||||
<xsl:copy-of select="rssfake:description|description|atom:summary|atom03:summary"/>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
<xsl:copy-of select="content:encoded|atom:content|atom03:content"/>
|
||||
</div>
|
||||
</div>
|
||||
</xsl:for-each>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
document.getElementById("url").innerHTML = window.location.href;
|
||||
//<![CDATA[
|
||||
document.getElementById("url").value = window.location.href
|
||||
|
||||
if (!/:html/.test(window.location.href))
|
||||
for (var content of document.querySelectorAll(".desc,.content"))
|
||||
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
|
||||
|
||||
var options = parse_location()[0]
|
||||
|
||||
if (options) {
|
||||
for (var select of document.forms[0].elements)
|
||||
if (select.tagName == 'SELECT')
|
||||
for (var option of select)
|
||||
if (option.value && options.match(option.value)) {
|
||||
select.value = option.value
|
||||
options = options.replace(option.value, '')
|
||||
break
|
||||
}
|
||||
|
||||
document.forms[0]['extra_options'].value = options
|
||||
}
|
||||
|
||||
function copy_content(input) {
|
||||
input.focus()
|
||||
input.select()
|
||||
document.execCommand('copy')
|
||||
input.blur()
|
||||
}
|
||||
|
||||
function copy_link() {
|
||||
copy_content(document.getElementById("url"))
|
||||
}
|
||||
|
||||
function parse_location() {
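// splits the current url into [options, path], e.g. "/:csv/https://example.com/feed.xml" -> [":csv", "https://example.com/feed.xml"]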
|
||||
return (window.location.pathname + window.location.search).match(/^\/(?:(:[^\/]+)\/)?(.*$)$/).slice(1)
|
||||
}
|
||||
|
||||
function open_feed() {
|
||||
var url = parse_location()[1]
|
||||
var options = Array.from(document.forms[0].elements).map(x=>x.value).join('')
|
||||
|
||||
var target = '/' + (options ? options + '/' : '') + url
|
||||
|
||||
if (target != window.location.pathname)
|
||||
window.location.href = target
|
||||
}
|
||||
//]]>
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*">
|
||||
<li>
|
||||
<span class="element">
|
||||
<
|
||||
<span class="tag"><xsl:value-of select="name()"/></span>
|
||||
|
||||
<xsl:for-each select="@*">
|
||||
<span class="attr"> <xsl:value-of select="name()"/></span>
|
||||
=
|
||||
"<span class="value"><xsl:value-of select="."/></span>"
|
||||
</xsl:for-each>
|
||||
>
|
||||
</span>
|
||||
|
||||
<xsl:if test="node()">
|
||||
<ul>
|
||||
<xsl:apply-templates/>
|
||||
</ul>
|
||||
</xsl:if>
|
||||
|
||||
<span class="element">
|
||||
</
|
||||
<span class="tag"><xsl:value-of select="name()"/></span>
|
||||
>
|
||||
</span>
|
||||
</li>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="comment()">
|
||||
<li>
|
||||
<pre class="comment"><![CDATA[<!--]]><xsl:value-of select="."/><![CDATA[-->]]></pre>
|
||||
</li>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()">
|
||||
<li>
|
||||
<pre>
|
||||
<xsl:value-of select="normalize-space(.)"/>
|
||||
</pre>
|
||||
</li>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="text()[not(normalize-space())]"/>
|
||||
|
||||
</xsl:stylesheet>
|
||||
|