Compare commits

...

195 Commits

SHA1 Message Date
2514fabd38 Replace memory-leak-prone Uniq with @uniq_wrapper 2020-10-03 19:43:55 +02:00
8cb7002fe6 feeds: make it possible to append empty items
And return the newly appended items, to make it easy to edit them
2020-10-03 16:56:07 +02:00
6966e03bef Clean up itemClass code
To avoid globals()
2020-10-03 16:25:29 +02:00
03a122c41f Dockerfile: add --no-cache to save some space 2020-10-01 22:33:29 +02:00
5cd6c22d73 Reorganise the README file 2020-10-01 22:25:53 +02:00
e1b41b5f64 Typo in README 2020-10-01 00:18:48 +02:00
9ce6acba20 Fix gunicorn related typo 2020-10-01 00:07:41 +02:00
6192ff4081 gunicorn with --preload
To only load the code once (and start autotrim once)
2020-10-01 00:05:39 +02:00
056a1b143f crawler: autotrim: make ctrl+c work 2020-10-01 00:04:36 +02:00
eed949736a crawler: add ability to limit cache size 2020-09-30 23:59:55 +02:00
2fc7cd391c Shift __main__'s wsgi code where it belongs 2020-09-30 23:24:51 +02:00
d9f46b23a6 crawler: default value for MYSQL_HOST (localhost) 2020-09-30 13:17:02 +02:00
bbada0436a Quick guide to ignore SSL certs 2020-09-27 16:48:22 +02:00
039a672f4e wsgi: clean up url reconstruction 2020-09-27 16:28:26 +02:00
b290568e14 README: decent line length
Obtained from the output of:
python -m morss --help | cat
2020-09-15 23:01:42 +02:00
9ecf856f10 Add :resolve to remove (some?) tracking links 2020-09-15 22:57:52 +02:00
504ede624d Logo CC BY-NC-SA 4.0 2020-09-03 13:17:58 +02:00
0d89f0e6f2 Add a logo
B&W edition of the logo at https://morss.it/
2020-08-28 21:28:07 +02:00
56e0c2391d Missing import for served files 2020-08-28 20:53:03 +02:00
679f406a12 Default mimetype for served files 2020-08-28 20:52:43 +02:00
f6d641eeef Serve any file in www/
Also fixes #41
2020-08-28 20:45:39 +02:00
2456dd9bbc Fix broken pieces
Including #43
2020-08-28 19:38:48 +02:00
0f33db248a Add license info in each file 2020-08-26 20:08:22 +02:00
d57f543c7b README: remove todo 2020-08-24 21:17:31 +02:00
fba112147c README: make it clear that the internal server is _very_ basic 2020-08-24 21:14:48 +02:00
8697c3f0df Remove remaining --debug from README 2020-08-24 19:39:27 +02:00
75935114e4 Remove leftover code 2020-08-23 19:07:12 +02:00
5bd2557619 Fix typo in provided .htaccess 2020-08-23 19:01:34 +02:00
598a2591f1 Dockerfile: remove confusing one-liner code 2020-08-23 18:59:16 +02:00
e76ab2b631 Update gunicorn instructions 2020-08-23 18:59:02 +02:00
aa9143302b Remove now-unused isInt code 2020-08-23 18:51:09 +02:00
0d62a7625b Define http port via env vars as well 2020-08-23 18:50:18 +02:00
bd0efb1529 crawler: missing os import 2020-08-23 18:45:44 +02:00
47a17614ef Rename morss/cgi.py into morss/wsgi.py
To avoid name collision with the built-in cgi lib
2020-08-23 18:44:49 +02:00
4dfebe78f7 Pick caching backend via env vars 2020-08-23 18:43:18 +02:00
dcd3e4a675 cgi.py: add missing imports 2020-08-23 18:31:05 +02:00
e968b2ea7f Remove leftover :debug code 2020-08-23 16:59:34 +02:00
0ac590c798 Set MAX_/LIM_* settings via env var 2020-08-23 16:09:58 +02:00
fa1b5aef09 Instructions for DEBUG= use 2020-08-23 15:31:11 +02:00
7f6309f618 README: :silent was explained twice 2020-08-23 14:34:04 +02:00
f65fb45030 :debug completely deprecated in favour of DEBUG= 2020-08-23 14:33:32 +02:00
6dd40e5cc4 cli.py: fix Options code 2020-08-23 14:25:09 +02:00
0acfce5a22 cli.py: remove log 2020-08-23 14:24:57 +02:00
97ccc15db0 cgi.py: rename parseOptions to parse_options 2020-08-23 14:24:23 +02:00
7a560181f7 Use env var for DEBUG 2020-08-23 14:23:45 +02:00
baccd3b22b Move parseOptions to cgi.py
As it is no longer used in cli.py
2020-08-22 00:37:34 +02:00
f79938ab11 Add :silent to readme & argparse 2020-08-22 00:02:08 +02:00
5b8bd47829 cli.py: remove draft code 2020-08-21 23:59:12 +02:00
b5b355aa6e readabilite: increase penalty for high link density 2020-08-21 23:55:04 +02:00
94097f481a sheet.xsl: better handle some corner cases 2020-08-21 23:54:35 +02:00
8161baa7ae sheet.xsl: improve css 2020-08-21 23:54:12 +02:00
bd182bcb85 Move cli code to argParse
Related code changes (incl. :format=xyz)
2020-08-21 23:52:56 +02:00
c7c2c5d749 Removed unused filterOptions code 2020-08-21 23:23:33 +02:00
c6b52e625f split morss.py into __main__/cgi/cli.py
Should hopefully allow cleaner code in the future
2020-08-21 22:17:55 +02:00
c6d3a0eb53 readabilite: clean up code 2020-07-15 00:49:34 +02:00
c628ee802c README: add docker-compose instructions 2020-07-13 20:50:39 +02:00
6021b912ff morss: fix item removal
Usual issue when editing a list while looping over it
2020-07-06 19:25:48 +02:00
f18a128ee6 Change :first for :newest
i.e. toggle default for the more-obvious option
2020-07-06 19:25:17 +02:00
64af86c11e crawler: catch html parsing errors 2020-07-06 12:25:38 +02:00
15951d228c Add :first to NOT sort items by date 2020-07-06 11:39:08 +02:00
c1b1f5f58a morss: restrict iframe use from :get to avoid abuse 2020-06-09 12:33:37 +02:00
985185f47f morss: more flexible feed creator auto-detection 2020-06-08 13:03:24 +02:00
3190d1ec5a feeds: remove useless if(len) before loop 2020-06-02 13:57:45 +02:00
9815794a97 sheet.xsl: make text more self explanatory 2020-05-27 21:42:00 +02:00
758b6861b9 sheet.xsl: fix text alignment 2020-05-27 21:36:11 +02:00
ce4cf01aa6 crawler: clean up encoding detection code 2020-05-27 21:35:24 +02:00
dcfdb75a15 crawler: fix chinese encoding support 2020-05-27 21:34:43 +02:00
4ccc0dafcd Basic help for sub-lib interactive use 2020-05-26 19:34:20 +02:00
2fe3e0b8ee feeds: clean up other stylesheets before putting ours 2020-05-26 19:26:36 +02:00
ad3ba9de1a sheet.xsl: add <select/> to use :firstlink 2020-05-13 12:33:12 +02:00
68c46a1823 morss: remove deprecated twitter/fb link handling 2020-05-13 12:31:09 +02:00
91be2d229e morss: ability to use first link from desc instead of default link 2020-05-13 12:29:53 +02:00
038f267ea2 Rename :theforce into :force 2020-05-13 11:49:15 +02:00
22005065e8 Use etree.tostring 'method' arg
Gives appropriately formatted html code.
Some pages might otherwise be rendered as blank.
2020-05-13 11:44:34 +02:00
7d0d416610 morss: cache articles for 24hrs
Also make it possible to refetch articles, regardless of cache
2020-05-12 21:10:31 +02:00
5dac4c69a1 crawler: more code comments 2020-05-12 20:44:25 +02:00
36e2a1c3fd crawler: increase size limit from 100KiB to 500
I'm looking at you, worldbankgroup.csod.com/ats/careersite/search.aspx
2020-05-12 19:34:16 +02:00
83dd2925d3 readabilite: better parsing
Keeping blank_text keeps the tree more as-is, making the final output closer to expectations
2020-05-12 14:15:53 +02:00
e09d0abf54 morss: remove deprecated piece of code 2020-05-07 16:05:30 +02:00
ff26a560cb Shift safari workaround to morss.py 2020-05-07 16:04:54 +02:00
74d7a1eca2 sheet.xsl: fix word wrap 2020-05-06 16:58:28 +02:00
eba295cba8 sheet.xsl: fixes for safari 2020-05-06 12:01:27 +02:00
f27631954e .htaccess: bypass Safari RSS detection 2020-05-06 11:47:24 +02:00
c74abfa2f4 sheet.xsl: use CDATA for js code 2020-05-06 11:46:38 +02:00
1d5272c299 sheet.xsl: allow zooming on mobile 2020-05-04 14:44:43 +02:00
f685139137 crawler: use UPSERT statements
Avoid potential race conditions
2020-05-03 21:27:45 +02:00
73b477665e morss: separate :clip with <hr> instead of stars 2020-05-02 19:19:54 +02:00
b425992783 morss: don't follow alt=rss with custom feeds
To have the same page as with :get=page and to avoid shitty feeds
2020-05-02 19:18:58 +02:00
271ac8f80f crawler: comment code a bit 2020-05-02 19:18:01 +02:00
64e41b807d crawler: handle http:/ (single slash)
Fixing one more corner case! malayalam.oneindia.com
2020-05-02 19:17:15 +02:00
a2c4691090 sheet.xsl: dir=auto for rtl languages (arabic, etc.) 2020-04-29 15:01:33 +02:00
b6000923bc README: clean up deprecated code 2020-04-28 22:31:11 +02:00
27a42c47aa morss: use final request url
Code is not very elegant...
2020-04-28 22:30:21 +02:00
c27c38f7c7 crawler: return dict instead of tuple 2020-04-28 22:29:07 +02:00
a1dc96cb50 feeds: remove mimetype from function call as no longer used 2020-04-28 22:07:25 +02:00
749acc87fc Centralize url clean up in crawler.py 2020-04-28 22:03:49 +02:00
c186188557 README: warning about lxml installation 2020-04-28 21:58:26 +02:00
cb69e3167f crawler: accept non-ascii urls
Covering one more corner case!
2020-04-28 14:47:23 +02:00
c3f06da947 morss: process(): specify encoding for clarity 2020-04-28 14:45:00 +02:00
44a3e0edc4 readabilite: specify in- and out-going encoding 2020-04-28 14:44:35 +02:00
4a9b505499 README: update python lib instructions 2020-04-27 18:12:14 +02:00
818cdaaa9b Make it possible to call sub-libs in non interactive mode
Run `python -m morss.feeds http://lemonde.fr` and so on
2020-04-27 18:00:14 +02:00
2806c64326 Make it possible to directly run sub-libs (feeds, crawler, readabilite)
Run `python -im morss.feeds http://website.sample/rss.xml` and so on
2020-04-27 17:19:31 +02:00
d39d7bb19d sheet.xsl: limit overflow 2020-04-25 15:27:49 +02:00
e5e3746fc6 sheet.xsl: show plain url 2020-04-25 15:27:13 +02:00
960c9d10d6 sheet.xsl: customize output feed form 2020-04-25 15:26:47 +02:00
0e7a5b9780 sheet.xsl: wrap header in <header> 2020-04-25 15:24:57 +02:00
186bedcf62 sheet.xsl: smarter html reparser 2020-04-25 15:22:25 +02:00
5847e18e42 sheet: improved feed address output (w/ c/c) 2020-04-25 15:21:47 +02:00
f6bc23927f readabilite: drop dangerous tags (script, style) 2020-04-25 12:25:02 +02:00
c86572374e readabilite: minimum score requirement 2020-04-25 12:24:36 +02:00
59ef5af9e2 feeds: fix bug when deleting attr in html 2020-04-24 22:12:05 +02:00
6a0531ca03 crawler: randomize user agent 2020-04-24 11:28:39 +02:00
8187876a06 crawler: stop at first alternative link
Should save a few ms and the first one is usually (?) the most relevant/generic
2020-04-23 11:23:45 +02:00
325a373e3e feeds: add SyntaxError catch 2020-04-20 16:15:15 +02:00
2719bd6776 crawler: fix chinese encoding 2020-04-20 16:14:55 +02:00
285e1e5f42 docker: pip install local 2020-04-19 13:25:53 +02:00
41a63900c2 README: improve docker instructions 2020-04-19 13:01:08 +02:00
ec8edb02f1 Various small bug fixes 2020-04-19 12:54:02 +02:00
d01b943597 Remove leftover threading var 2020-04-19 12:51:11 +02:00
b361aa2867 Add timeout to :get 2020-04-19 12:50:26 +02:00
4ce3c7cb32 Small code clean ups 2020-04-19 12:50:05 +02:00
7e45b2611d Disable multi-threading
Impact was mostly negative due to locks
2020-04-19 12:29:52 +02:00
036e5190f1 crawler: remove unused code 2020-04-18 21:40:02 +02:00
e99c5b3b71 morss: more sensible default MAX/LIM values 2020-04-18 17:21:45 +02:00
4f44df8d63 Make all ports default to 8080 2020-04-18 17:15:59 +02:00
497c14db81 Add dockerfile & how to in README 2020-04-18 17:04:44 +02:00
a4e1dba8b7 sheet.xsl: improve url display 2020-04-16 10:33:36 +02:00
7375adce33 sheet.xsl: fix & improve 2020-04-15 23:34:28 +02:00
663212de0a sheet.xsl: various cosmetic improvements 2020-04-15 23:22:45 +02:00
4a2ea1bce9 README: add gunicorn instructions 2020-04-15 22:31:21 +02:00
fe82b19c91 Merge .xsl & html template
Turns out they somehow serve a similar purpose
2020-04-15 22:30:45 +02:00
0b31e97492 morss: remove debug code in http file handler 2020-04-14 23:20:03 +02:00
b0ad7c259d Add README & LICENSE to data_files 2020-04-14 19:34:12 +02:00
bffb23f884 README: how to use cli 2020-04-14 18:21:32 +02:00
59139272fd Auto-detect the location of www/
Either ../www or /usr/share/morss
Adapted README accordingly
2020-04-14 18:07:19 +02:00
39b0a1d7cc setup.py: fix deps & files 2020-04-14 17:36:42 +02:00
65803b328d New git url and updated date in provided index.html 2020-04-13 15:30:32 +02:00
e6b7c0eb33 Fix app definition for uwsgi 2020-04-13 15:30:09 +02:00
67c096ad5b feeds: add fake path to default html parser
Without it, some websites were accidentally matching it (false positives)
2020-04-12 13:00:56 +02:00
f018437544 crawler: make mysql backend thread safe 2020-04-12 12:53:05 +02:00
8e5e8d24a4 Timezone fixes 2020-04-10 20:33:59 +02:00
ee78a7875a morss: focus on the most recent feed items 2020-04-10 16:08:13 +02:00
9e7b9d95ee feeds: properly use html template 2020-04-09 20:00:51 +02:00
987a719c4e feeds: try all parsers regardless of contenttype
Turns out some websites send the wrong contenttype (json for html, html for xml, etc.)
2020-04-09 19:17:51 +02:00
47b33f4baa morss: specify server output encoding 2020-04-09 19:10:45 +02:00
3c7f512583 feeds: handle several errors 2020-04-09 19:09:10 +02:00
a32f5a8536 readabilite: add debug option (also used by :get) 2020-04-09 19:08:13 +02:00
63a06524b7 morss: various encoding fixes 2020-04-09 19:06:51 +02:00
b0f80c6d3c morss: fix csv output encoding 2020-04-09 19:05:50 +02:00
78cea10ead morss: replace :getpage with :get
Also provides readabilite debugging
2020-04-09 18:43:20 +02:00
e5a82ff1f4 crawler: drop auto-referer
Was solving some issues. But creating even more issues.
2020-04-07 10:39:21 +02:00
f3d1f92b39 Detect encoding every time 2020-04-07 10:38:36 +02:00
7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
f1d0431e68 morss: drop :html, replaced with :reader
README updated accordingly
2020-04-07 09:23:29 +02:00
a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
bfad6b7a4a readabilite: clean before counting
To remove links which are not kept anyway
2020-04-06 16:55:39 +02:00
6b8c3e51e7 readabilite: fix threshold feature
Awkward typo...
2020-04-06 16:52:06 +02:00
dc9e425247 readabilite: don't clean-out the top 10% nodes
Loosen up the code once again to limit over-kill
2020-04-06 14:26:28 +02:00
2f48e18bb1 readabilite: put scores directly in html node
Probably slower but makes code somewhat cleaner...
2020-04-06 14:21:41 +02:00
31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
a82ec96eb7 Delete feedify.py leftover code
iTunes integration untested, unreliable and not working...
2020-04-05 22:16:52 +02:00
aad2398e69 feeds: turns out lxml.etree doesn't have drop_tag 2020-04-05 21:50:38 +02:00
eeac630855 crawler: add more "realistic" headers 2020-04-05 21:11:57 +02:00
e136b0feb2 readabilite: loosen the slayer
Previous impl. led to too many empty results
2020-04-05 20:47:30 +02:00
6cf32af6c0 readabilite: also use BS 2020-04-05 20:46:42 +02:00
568e7d7dd2 feeds: make BS's output bytes for lxml's sake 2020-04-05 20:46:04 +02:00
3617f86e9d morss: make cgi_encore more robust 2020-04-05 16:43:11 +02:00
d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
bf86c1e962 crawler: make AutoUA match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic. Sends the same headers no matter what. Make requests more "replicable".
Also, drop "text/xml" from RSS contenttype, too broad, matches garbage
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to find how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping off ftp support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return sth
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer who-is-what
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to file handled by http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
20 changed files with 1680 additions and 1044 deletions

Dockerfile (new file, 8 lines)

@@ -0,0 +1,8 @@
FROM alpine:latest
RUN apk add --no-cache python3 py3-lxml py3-gunicorn py3-pip git
ADD . /app
RUN pip3 install /app
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss

README.md (346 changed lines)

@@ -1,6 +1,7 @@
# Morss - Get full-text RSS feeds
_GNU AGPLv3 code_
_GNU AGPLv3 code_
_Provided logo is CC BY-NC-SA 4.0_
Upstream source code: https://git.pictuga.com/pictuga/morss
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
@@ -18,21 +19,20 @@ Morss also provides additional features, such as: .csv and json export, extended
control over output. A strength of morss is its ability to deal with broken
feeds, and to replace tracking links with direct links to the actual content.
Morss can also generate feeds from html and json files (see `feedify.py`), which
Morss can also generate feeds from html and json files (see `feeds.py`), which
for instance makes it possible to get feeds for Facebook or Twitter, using
hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
html structure changes on the target website.
Additionally morss can grab the source xml feed of iTunes podcast, and detect
rss feeds in html pages' `<meta>`.
Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Convert iTunes podcast links into xml links
- Export feeds as RSS/JSON/CSV/HTML
- Fetch full-text content of feed items
- Follow 301/meta redirects
@@ -42,76 +42,125 @@ Some features of morss:
- Works as server/cli tool
- Deobfuscate various tracking links
## Dependencies
## Install
You do need:
### Python package
```shell
pip install git+https://git.pictuga.com/pictuga/morss.git
```
The dependency `lxml` can take quite a while to install (especially on Raspberry Pi, as
C code needs to be compiled). If possible on your distribution, try installing
it with the system package manager.
Dependencies:
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
- pymysql
Simplest way to get these:
```shell
pip install -r requirements.txt
```
You may also need:
- Apache, with python-cgi support, to run on a server
- a fast internet connection
## Arguments
### Docker
morss accepts some arguments, to lightly alter the output of morss. Arguments
may need to have a value (usually a string or a number). In the different "Use
cases" below is detailed how to pass those arguments to morss.
Build & run
The arguments are:
```shell
docker build --tag morss https://git.pictuga.com/pictuga/morss.git
docker run -p 8080:8080 morss
```
- Change what morss does
- `json`: output as JSON
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doesn't work), but with this argument, the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
- `csv`: export to csv
- `indent`: returns indented XML or JSON, takes more place, but human-readable
- `nolink`: drop links, but keeps links' inner text
- `noref`: drop items' link
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
- `debug`: to have some feedback from the script execution. Useful for debugging
- `mono`: disable multithreading while fetching, makes debugging easier
- `theforce`: force download the rss feed and ignore cached http errors
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
- `item_link`: xpath rule relative to `items` to point to the entry's link
- `item_title`: entry's title
- `item_content`: entry's description
- `item_time`: entry's date & time (accepts a wide range of time formats)
With docker-compose:
## Use cases
```yml
services:
app:
build: https://git.pictuga.com/pictuga/morss.git
image: morss
ports:
- '8080:8080'
```
Then execute
```shell
docker-compose build
docker-compose up
```
To update:
- To get the latest code from the git repository, add `--no-cache` to the build
commands
- To update the base image (`alpine:latest`), add `--pull` to the build commands
## Run
morss will auto-detect what "mode" to use.
### Running on a server
### Running on/as a server
Set up the server as indicated below, then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache
redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny
RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other
clients.
#### Via Docker
See above (in Install)
#### Using Gunicorn
```shell
gunicorn --preload morss
```
#### Using uWSGI
Running this command should do:
```shell
uwsgi --http :8080 --plugin python --wsgi-file main.py
```
#### Using morss' internal HTTP server
Morss can run its own, **very basic**, HTTP server, meant mostly for debugging.
It starts on port 8080 when you run morss without any argument.
I'd highly recommend using gunicorn or something similar for better
performance.
```shell
morss
```
You can change the port using environment variables like this `PORT=9000 morss`.
#### Via mod_cgi/FastCGI with Apache/nginx
For this, you'll want to change the file layout a bit, for example into
something like this.
```
/
├── cgi
@@ -138,47 +187,15 @@ method uses HTTP calls to fetch the RSS feeds, which will be handled through
Please pay attention to `main.py` permissions for it to be executable. Also
ensure that the provided `/www/.htaccess` works well with your server.
#### Using uWSGI
Running this command should do:
```shell
uwsgi --http :9090 --plugin python --wsgi-file main.py
```
However, one problem might be how to serve the provided `index.html` file if it
isn't in the same directory. Therefore you can add this at the end of the
command to point to another directory `--pyargv '--root ../../www/'`.
#### Using morss' internal HTTP server
Morss can run its own HTTP server. The later should start when you run morss
without any argument, on port 8080.
You can change the port and the location of the `www/` folder like this `python -m morss 9000 --root ../../www`.
#### Passing arguments
Then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other clients.
### As a CLI application
Run:
```
python[2.7] -m morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
```
For example: `python -m morss debug http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
@@ -189,17 +206,21 @@ To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
scripts can be run on top of the RSS feed, using its
[output](http://lzone.de/liferea/scraping.htm) as an RSS feed.
To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
To use this script, you have to enable "(Unix) command" in liferea feed
settings, and use the command:
```
[python[2.7]] PATH/TO/MORSS/main.py [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
```
For example: `python2.7 PATH/TO/MORSS/main.py http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
### As a python library
Quickly get a full-text feed:
```python
>>> import morss
>>> xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
@@ -208,6 +229,7 @@ Quickly get a full-text feed:
```
Using cache and passing arguments:
```python
>>> import morss
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
@@ -223,6 +245,7 @@ possible to call the simpler functions, to have more control on what's happening
under the hood.
Doing it step-by-step:
```python
import morss, morss.crawler
@@ -230,46 +253,140 @@ url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True) # arguments
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
output = morss.Format(rss, options) # formats final feed
output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
```
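For reference, the three calls above can be wrapped into a small helper. The sketch below is only an illustration based on the signatures shown in this diff (`FeedFetch`, `FeedGather`, `FeedFormat`), reusing the BBC feed url from the earlier examples:
```python
import morss

def fetch_fulltext(url, **args):
    # sketch only: Options is fed keyword arguments, as in the examples above
    options = morss.Options(**args)

    url, rss = morss.FeedFetch(url, options)          # grab the RSS feed
    rss = morss.FeedGather(rss, url, options)         # fill the feed and clean it up
    return morss.FeedFormat(rss, options, 'unicode')  # format the final feed

print(fetch_fulltext('http://feeds.bbci.co.uk/news/rss.xml', indent=True))
```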
## Cache information
## Arguments and settings
morss uses caching to make loading faster. There are 2 possible cache backends
(visible in `morss/crawler.py`):
### Arguments
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
be cleared every time the program is run
- `MySQLCacheHandler`: /!\ Does NOT support multi-threading
morss accepts some arguments to lightly alter its output. Arguments may need
to have a value (usually a string or a number). How to pass those arguments to
morss is detailed in the different "Use cases" below.
## Configuration
### Length limitation
The list of arguments can be obtained by running `morss --help`
```
usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip]
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
[--resolve] [--items XPATH] [--item_link XPATH]
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
[--nolink] [--noref] [--silent]
url
Get full-text RSS feeds
positional arguments:
url feed url
optional arguments:
-h, --help show this help message and exit
output:
--format {rss,json,html,csv}
output format
--search STRING does a basic case-sensitive search in the feed
--clip stick the full article content under the original feed
content (useful for twitter)
--indent returns indented XML or JSON, takes more place, but
human-readable
action:
--cache only take articles from the cache (ie. don't grab new
articles' content), so as to save time
--force force refetch the rss feed and articles
--proxy doesn't fill the articles
--newest return the feed items in chronological order (morss
otherwise shows the items by appearing order)
--firstlink pull the first article mentioned in the description
instead of the default link
--resolve replace tracking links with direct links to articles
(not compatible with --proxy)
custom feeds:
--items XPATH (mandatory to activate the custom feeds function)
xpath rule to match all the RSS entries
--item_link XPATH xpath rule relative to items to point to the entry's
link
--item_title XPATH entry's title
--item_content XPATH entry's content
--item_time XPATH entry's date & time (accepts a wide range of time
formats)
misc:
--nolink drop links, but keeps links' inner text
--noref drop items' link
--silent don't output the final RSS (useless on its own, but
can be nice when debugging)
GNU AGPLv3 code
```
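To illustrate the custom-feed flags from the help output above when morss is used as a library, here is a hedged sketch; the url and the xpath rules target an imaginary blog layout and are purely hypothetical:
```python
import morss

# hypothetical xpath rules for an imaginary blog layout; the option names
# mirror the --items/--item_* flags listed above
options = morss.Options(
    items='//article',                         # one match per feed entry (mandatory)
    item_title='.//h2',
    item_link='.//h2/a/@href',
    item_content='.//div[@class="post-body"]',
)

# the options are then fed through the same pipeline as in the library
# examples earlier in this README
url, rss = morss.FeedFetch('http://blog.example/archive', options)
rss = morss.FeedGather(rss, url, options)
print(morss.FeedFormat(rss, options, 'unicode'))
```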
Further HTTP-only options:
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other
servers)
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
### Environment variables
To pass environment variables:
- Docker-cli: `docker run -p 8080:8080 --env KEY=value morss`
- docker-compose: add an `environment:` section in the .yml file
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
Generic:
- `DEBUG=1`: to have some feedback from the script
execution. Useful for debugging.
- `DELAY` sets the browser cache delay, only for HTTP clients
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
When parsing long feeds, with a lot of items (100+), morss might take a lot of
time to parse it, or might even run into a memory overflow on some shared
hosting plans (limits around 10Mb), in which case you might want to adjust the
different values at the top of the script.
below settings via environment variables.
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the nexts settings.
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether or not it's already cached). Articles beyond that limit will be dropped from the feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of article checked, limiting both the number of articles fetched and taken from cache. Articles beyond that limit will be dropped from the feed, even if they're cached. `-1` for unlimited.
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more
time might be spent taking older articles from cache. `-1` for unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
More articles will be taken from cache following the next settings.
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether
or not it's already cached). Articles beyond that limit will be dropped from the
feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the
number of articles fetched and taken from cache. Articles beyond that limit will
be dropped from the feed, even if they're cached. `-1` for unlimited.
### Other settings
morss uses caching to make loading faster. There are 3 possible cache backends:
- `DELAY` sets the browser cache delay, only for HTTP clients
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
- `THREADS` sets the number of threads to use. `1` makes no use of multithreading.
- `(nothing/default)`: a simple python in-memory dict-like object.
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
will be cleared every time the program is run). Path can be defined with
`SQLITE_PATH`.
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 10k
entries.
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
the number of items set in `CACHE_SIZE`). Defaults to 1hr.
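These variables are read in `morss/crawler.py` (shown further down in this diff). When using morss as a python library, a rough programmatic equivalent of `CACHE=sqlite` plus `SQLITE_PATH` might look like the sketch below; the path is illustrative and this assumes the cache is swapped before any request is made:
```python
import morss.crawler

# sketch only: CacheHandler falls back to this module-level default_cache
# whenever no cache object is passed explicitly
morss.crawler.default_cache = morss.crawler.SQLiteCache('/tmp/morss-cache.db')
```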
### Content matching
The content of articles is grabbed with our own readability fork. This means
that most of the time the right content is matched. However sometimes it fails,
therefore some tweaking is required. Most of the time, what has to be done is to
add some "rules" in the main script file in *readability* (not in morss).
add some "rules" in the main script file in `readabilite.py` (not in morss).
Most of the time, when hardly anything is matched, it means that the main content
of the article is made of images, videos, pictures, etc., which readability
@@ -280,14 +397,3 @@ morss will also try to figure out whether the full content is already in place
(for those websites which understood the whole point of RSS feeds). However this
detection is very simple, and only works if the actual content is put in the
"content" section in the feed and not in the "summary" section.
***
## Todo
You can contribute to this project. If you're not sure what to do, you can pick
from this list:
- Add ability to run morss.py as an update daemon
- Add ability to use custom xpath rule instead of readability
- More ideas here <https://github.com/pictuga/morss/issues/15>

main.py (20 changed lines)

@@ -1,6 +1,24 @@
#!/usr/bin/env python
from morss import main, cgi_wrapper as application
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
from morss.__main__ import main
from morss.wsgi import application
if __name__ == '__main__':
main()

morss/__init__.py

@@ -1,2 +1,20 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `import morss`
from .morss import *
from .wsgi import application

morss/__main__.py

@@ -1,5 +1,50 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `python -m morss`
from .morss import main
import os
import sys
from . import wsgi
from . import cli
from .morss import MorssException
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi (w/o file handler)
wsgi.cgi_handle_request()
elif len(sys.argv) <= 1:
# start internal (basic) http server (w/ file handler)
wsgi.cgi_start_server()
else:
# as a CLI app
try:
cli.cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
main()

morss/cli.py (new file, 69 lines)

@@ -0,0 +1,69 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
import argparse
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options
def cli_app():
parser = argparse.ArgumentParser(
prog='morss',
description='Get full-text RSS feeds',
epilog='GNU AGPLv3 code'
)
parser.add_argument('url', help='feed url')
group = parser.add_argument_group('output')
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more place, but human-readable')
group = parser.add_argument_group('action')
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss otherwise shows the items by appearing order)')
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
group = parser.add_argument_group('custom feeds')
group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
group = parser.add_argument_group('misc')
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
group.add_argument('--noref', action='store_true', help='drop items\' link')
group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')
options = Options(vars(parser.parse_args()))
url = options.url
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options, 'unicode')
if not options.silent:
print(out)

morss/crawler.py

@@ -1,3 +1,21 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import sys
import zlib
@@ -7,14 +25,21 @@ import chardet
from cgi import parse_header
import lxml.html
import time
import threading
import random
from collections import OrderedDict
try:
# python 2
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib import quote
from urlparse import urlparse, urlunparse
import mimetools
except ImportError:
# python 3
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib.parse import quote
from urllib.parse import urlparse, urlunparse
import email
try:
@@ -25,15 +50,67 @@ except NameError:
basestring = unicode = str
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 10000)) # max number of items in cache (default: 10k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60*60)) # how often to auto-clear the cache (default: 1hr)
# uncomment the lines below to ignore SSL certs
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
DEFAULT_UAS = [
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
PROTOCOL = ['http', 'https']
def get(*args, **kwargs):
return adv_get(*args, **kwargs)['data']
def adv_get(url, timeout=None, *args, **kwargs):
url = sanitize_url(url)
if timeout is None:
con = custom_handler(*args, **kwargs).open(url)
else:
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding= detect_encoding(data, con)
return {
'data':data,
'url': con.geturl(),
'con': con,
'contenttype': contenttype,
'encoding': encoding
}
def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
# as per urllib2 source code, these Handlers are added first
@@ -45,26 +122,65 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
# & HTTPSHandler
#handlers.append(DebugHandler())
handlers.append(SizeLimitHandler(100*1024)) # 100KiB
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
handlers.append(HTTPCookieProcessor())
handlers.append(GZIPHandler())
handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA))
if not basic:
handlers.append(AutoRefererHandler())
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding))
if accept:
handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
if follow:
handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(force_min=delay))
return build_opener(*handlers)
def is_ascii(string):
# there's a native function in py3, but home-made fix for backward compatibility
try:
string.encode('ascii')
except UnicodeError:
return False
else:
return True
def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes):
url = url.decode()
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
# https://stackoverflow.com/a/4391299
parts = list(urlparse(url))
for i in range(len(parts)):
if not is_ascii(parts[i]):
if i == 1:
parts[i] = parts[i].encode('idna').decode('ascii')
else:
parts[i] = quote(parts[i].encode('utf-8'))
return urlunparse(parts)
class DebugHandler(BaseHandler):
handler_order = 2000
@@ -132,6 +248,15 @@ class GZIPHandler(BaseHandler):
def detect_encoding(data, resp=None):
enc = detect_raw_encoding(data, resp)
if enc.lower() == 'gb2312':
enc = 'gbk'
return enc
def detect_raw_encoding(data, resp=None):
if resp is not None:
enc = resp.headers.get('charset')
if enc is not None:
@@ -165,14 +290,10 @@ class EncodingFixHandler(BaseHandler):
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
if not self.encoding:
enc = detect_encoding(data, resp)
else:
enc = self.encoding
enc = self.encoding or detect_encoding(data, resp)
if enc:
data = data.decode(enc, 'replace')
data = data.encode(enc)
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
@@ -196,48 +317,43 @@ class UAHandler(BaseHandler):
https_request = http_request
class AutoRefererHandler(BaseHandler):
class BrowserlyHeaderHandler(BaseHandler):
""" Add more headers to look less suspicious """
def http_request(self, req):
req.add_unredirected_header('Referer', 'http://%s' % req.host)
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
return req
https_request = http_request
class ContentNegociationHandler(BaseHandler):
" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
class AlternateHandler(BaseHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, accept=None, strict=False):
self.accept = accept
self.strict = strict
def http_request(self, req):
if self.accept is not None:
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
string = ','.join(self.accept)
if self.strict:
string += ',*/*;q=0.9'
req.add_unredirected_header('Accept', string)
return req
def __init__(self, follow=None):
self.follow = follow or []
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# oops, not what we were looking for, let's see if the html page suggests an alternative page of the right types
data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.accept:
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
try:
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.follow:
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
break
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
@@ -246,7 +362,6 @@ class ContentNegociationHandler(BaseHandler):
return resp
https_request = http_request
https_response = http_response
@@ -260,10 +375,15 @@ class HTTPEquivHandler(BaseHandler):
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
try:
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
@@ -297,18 +417,31 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response
default_cache = {}
class CacheHandler(BaseHandler):
" Cache based on etags/last-modified "
private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
private_cache = False # Websites can indicate whether the page should be
# cached by CDNs (e.g. this shouldn't be the case for
# private/confidential/user-specific pages).
# With this setting, decide whether (False) you want
# the cache to behave like a CDN (i.e. don't cache
# private pages), or (True) like an end-user cache
# (i.e. do cache private pages). If unsure, False is
# the safest bet.
handler_order = 499
def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache
self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
self.force_min = force_min
# Servers indicate how long they think their content is "valid".
# With this parameter (force_min, expressed in seconds), we can
# override the validity period (i.e. bypassing http headers)
# Special values:
# -1: valid forever, i.e. use the cache no matter what (and fetch
# the page online if not present in cache)
# 0: valid zero second, i.e. force refresh
# -2: same as -1, i.e. use the cache no matter what, but do NOT
# fetch the page online if not present in cache, throw an
# error instead
def load(self, url):
try:
@@ -338,6 +471,10 @@ class CacheHandler(BaseHandler):
return req
def http_open(self, req):
# Reminder of how/when this function is called by urllib2:
# If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response'
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# some info needed to process everything
@@ -360,6 +497,7 @@ class CacheHandler(BaseHandler):
pass
else:
# raise an error, via urllib handlers
headers['Morss'] = 'from_cache'
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
@@ -378,14 +516,18 @@ class CacheHandler(BaseHandler):
return None
elif code == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
# use force_min=0 if you want to bypass this (needed for a proper refresh)
# "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh)
pass
elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list
or ('private' in cc_list and not self.private)):
or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh
# if the same settings are used all along, this section shouldn't be
# of any use, since the page wouldn't be cached in the first place
# the check is only performed "just in case"
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
@@ -400,7 +542,7 @@ class CacheHandler(BaseHandler):
# according to the www, we have to refresh when nothing is said
return None
# return the cache as a response
# return the cache as a response. This code is reached with 'pass' above
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
@@ -419,7 +561,7 @@ class CacheHandler(BaseHandler):
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications
return resp
@@ -431,6 +573,8 @@ class CacheHandler(BaseHandler):
data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
# the below is only needed because of 'resp.read()' above, as we can't
# seek(0) on arbitrary file-like objects (e.g. sockets)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
@@ -450,10 +594,14 @@ class CacheHandler(BaseHandler):
unverifiable=True)
new.add_unredirected_header('Morss', 'from_304')
# create a "fake" new request to just re-run through the various
# handlers
return self.parent.open(new, timeout=req.timeout)
return None
return None # when returning 'None', the next-available handler is used
# the 'HTTPRedirectHandler' has no 'handler_order', i.e.
# uses the default of 500, therefore executed after this
https_request = http_request
https_open = http_open
@@ -461,6 +609,20 @@ class CacheHandler(BaseHandler):
class BaseCache:
""" Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url):
try:
self[url]
@@ -477,15 +639,21 @@ import sqlite3
class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
self.trim()
def __del__(self):
self.con.close()
def trim(self):
with self.con:
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
@@ -499,32 +667,34 @@ class SQLiteCache(BaseCache):
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
if url in self:
with self.con:
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
value + (url,))
else:
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
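The single INSERT ... ON CONFLICT statement replaces the earlier check-then-UPDATE/INSERT pair. A standalone sketch of the same upsert (table layout copied from above, row values made up; note ON CONFLICT requires SQLite 3.24 or newer):
import sqlite3
con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
row = ('http://example.com/', 200, 'OK', '{}', b'<html/>', 1600000000)
with con:
    # inserts the row the first time, refreshes it in place on every later call
    con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', row + row[1:])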
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
" NB. Requires mono-threading, as pymysql isn't thread-safe "
def __init__(self, user, password, database, host='localhost'):
self.con = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', autocommit=True)
self.user = user
self.password = password
self.database = database
self.host = host
with self.con.cursor() as cursor:
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
def __del__(self):
self.con.close()
self.trim()
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def trim(self):
with self.cursor() as cursor:
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
cursor = self.con.cursor()
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
@@ -534,11 +704,51 @@ class MySQLCacheHandler(BaseCache):
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
if url in self:
with self.con.cursor() as cursor:
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
value + (url,))
with self.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
(url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
def __setitem__(self, key, value):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, value)
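A standalone sketch of the capped-dict idea; the class name and the cap of 2 are made up for illustration, and the trim() here mirrors the real one above (which uses the module-level CACHE_SIZE setting):
from collections import OrderedDict

class TinyCappedDict(OrderedDict):      # hypothetical name, illustration only
    cap = 2

    def __setitem__(self, key, value):
        if key in self:
            del self[key]               # re-setting a key moves it to the end
        OrderedDict.__setitem__(self, key, value)

    def trim(self):
        for _ in range(max(len(self) - self.cap, 0)):
            self.popitem(last=False)    # evict the oldest entries first

d = TinyCappedDict()
d['a'] = 1; d['b'] = 2; d['c'] = 3
d.trim()
assert list(d) == ['b', 'c']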
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST', 'localhost')
)
elif os.environ['CACHE'] == 'sqlite':
if 'SQLITE_PATH' in os.environ:
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
else:
with self.con.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
path = ':memory:'
default_cache = SQLiteCache(path)
else:
default_cache = CappedDict()
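A sketch of picking the sqlite backend through environment variables; since the selection above runs at import time, the variables have to be set before the module is imported (the path is just an example):
import os
os.environ['CACHE'] = 'sqlite'
os.environ['SQLITE_PATH'] = '/tmp'             # cache will live in /tmp/morss-cache.db

from morss import crawler
print(type(crawler.default_cache).__name__)    # SQLiteCache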
if __name__ == '__main__':
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive:
print('>>> Interactive shell: try using `req`')
else:
print(req['data'].decode(req['encoding']))


@@ -90,8 +90,11 @@ item_updated = updated
[html]
mode = html
path =
http://localhost/
title = //div[@id='header']/h1
desc = //div[@id='header']/h2
desc = //div[@id='header']/p
items = //div[@id='content']/div
item_title = ./a
@@ -99,7 +102,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc]
item_content = ./div[class=content]
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
base = file:sheet.xsl
[twitter]
mode = html


@@ -1,28 +0,0 @@
import re
import json
from . import crawler
try:
basestring
except NameError:
basestring = str
def pre_worker(url):
if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
match = re.search('/id([0-9]+)(\?.*)?$', url)
if match:
iid = match.groups()[0]
redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
try:
con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
data = con.read()
except (IOError, HTTPException):
raise
return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
return None


@@ -1,3 +1,20 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
@@ -15,6 +32,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from .readabilite import parse as html_parse
json.encoder.c_make_encoder = None
@@ -45,14 +63,32 @@ def parse_rules(filename=None):
rules = dict([(x, dict(config.items(x))) for x in config.sections()])
for section in rules.keys():
# for each ruleset
for arg in rules[section].keys():
if '\n' in rules[section][arg]:
# for each rule
if rules[section][arg].startswith('file:'):
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
for path in paths:
try:
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
except IOError:
pass
elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:]
return rules
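A standalone sketch of the 'file:' mechanism added above: the referenced file's content replaces the rule value, with the xsl/xml wrapper tags stripped out (the sample string is made up):
import re
file_raw = '<?xml version="1.0"?><xsl:stylesheet version="1.1"><p>hi</p></xsl:stylesheet>'
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
assert file_clean == '<p>hi</p>'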
def parse(data, url=None, mimetype=None):
def parse(data, url=None, encoding=None):
" Determine which ruleset to use "
rulesets = parse_rules()
@@ -66,28 +102,22 @@ def parse(data, url=None, mimetype=None):
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset)
return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or parser_candidates is None:
parser_candidates = parsers
# 2) Try each and every parser
# 3) Look for working ruleset for given parser
# 3a) See if parsing works
# 3b) See if .items matches anything
for parser in parser_candidates:
for parser in parsers:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# rulesets with 'path' are skipped, as they should have been caught beforehand
try:
feed = parser(data)
feed = parser(data, encoding=encoding)
except (ValueError):
except (ValueError, SyntaxError):
# parsing did not work
pass
@@ -112,7 +142,7 @@ def parse(data, url=None, mimetype=None):
class ParserBase(object):
def __init__(self, data=None, rules=None, parent=None):
def __init__(self, data=None, rules=None, parent=None, encoding=None):
if rules is None:
rules = parse_rules()[self.default_ruleset]
@@ -121,9 +151,10 @@ class ParserBase(object):
if data is None:
data = rules['base']
self.root = self.parse(data)
self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw):
pass
@@ -148,15 +179,15 @@ class ParserBase(object):
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
row = [getattr(item, x) for x in item.dic]
if encoding != 'unicode':
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
c.writerow(row)
c.writerow([getattr(item, x) for x in item.dic])
out.seek(0)
return out.read()
out = out.read()
if encoding != 'unicode':
out = out.encode(encoding)
return out
def tohtml(self, **k):
return self.convert(FeedHTML).tostring(**k)
@@ -267,8 +298,15 @@ class ParserBase(object):
except AttributeError:
# does not exist, have to create it
self.rule_create(self.rules[rule_name])
self.rule_set(self.rules[rule_name], value)
try:
self.rule_create(self.rules[rule_name])
except AttributeError:
# no way to create it, give up
pass
else:
self.rule_set(self.rules[rule_name], value)
def rmv(self, rule_name):
# easy deleter
@@ -286,10 +324,7 @@ class ParserXML(ParserBase):
NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
'atom03': 'http://purl.org/atom/ns#',
'media': 'http://search.yahoo.com/mrss/',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'slash': 'http://purl.org/rss/1.0/modules/slash/',
'dc': 'http://purl.org/dc/elements/1.1/',
'content': 'http://purl.org/rss/1.0/modules/content/',
'rssfake': 'http://purl.org/rss/1.0/'}
@@ -301,7 +336,7 @@ class ParserXML(ParserBase):
return self.root.getparent().remove(self.root)
def tostring(self, encoding='unicode', **k):
return etree.tostring(self.root, encoding=encoding, **k)
return etree.tostring(self.root, encoding=encoding, method='xml', **k)
def _rule_parse(self, rule):
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -383,7 +418,8 @@ class ParserXML(ParserBase):
return
elif key is not None:
del x.attrib[key]
if key in match.attrib:
del match.attrib[key]
else:
match.getparent().remove(match)
@@ -401,13 +437,14 @@ class ParserXML(ParserBase):
else:
if html_rich:
# atom stuff
if 'atom' in rule:
match.attrib['type'] = 'xhtml'
self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
match.find('div').drop_tag()
if self.rules['mode'] == 'html':
match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom
match.attrib['type'] = 'xhtml'
else:
if match is not None and len(match):
@@ -440,11 +477,10 @@ class ParserHTML(ParserXML):
mimetype = ['text/html', 'application/xhtml+xml']
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(raw, parser)
return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)
def rule_search_all(self, rule):
try:
@@ -467,6 +503,9 @@ class ParserHTML(ParserXML):
element = deepcopy(match)
match.getparent().append(element)
else:
raise AttributeError('no way to create item')
def parse_time(value):
if value is None or value == 0:
@@ -474,13 +513,13 @@ def parse_time(value):
elif isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.UTC)
return datetime.fromtimestamp(int(value), tz.tzutc())
else:
return dateutil.parser.parse(value)
return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
elif isinstance(value, int):
return datetime.fromtimestamp(value, tz.UTC)
return datetime.fromtimestamp(value, tz.tzutc())
elif isinstance(value, datetime):
return value
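A quick illustration of the inputs parse_time accepts after this change, assuming the package is importable as morss; results are timezone-aware datetimes pinned to UTC via tz.tzutc():
from morss.feeds import parse_time
print(parse_time(1600000000))             # unix timestamp as an int
print(parse_time('1600000000'))           # unix timestamp as a digit-only string
print(parse_time('2020-09-13 12:26:40'))  # any other string goes through dateutil.parser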
@@ -587,34 +626,41 @@ class ParserJSON(ParserBase):
return out.replace('\n', '<br/>') if out else out
class Uniq(object):
_map = {}
_id = None
def wrap_uniq(wrapper_fn_name):
" Wraps the output of the function with the specified function "
# This is called when parsing "wrap_uniq('wrap_item')"
def __new__(cls, *args, **kwargs):
# check if a wrapper was already created for it
# if so, reuse it
# if not, create a new one
# note that the item itself (the tree node) is created beforehand
def decorator(func):
# This is called when parsing "@wrap_uniq('wrap_item')"
tmp_id = cls._gen_id(*args, **kwargs)
if tmp_id in cls._map:
return cls._map[tmp_id]
def wrapped_func(self, *args, **kwargs):
# This is called when the wrapped function is called
else:
obj = object.__new__(cls) #, *args, **kwargs)
cls._map[tmp_id] = obj
return obj
output = func(self, *args, **kwargs)
output_id = id(output)
try:
return self._map[output_id]
except (KeyError, AttributeError):
if not hasattr(self, '_map'):
self._map = {}
wrapper_fn = getattr(self, wrapper_fn_name)
obj = wrapper_fn(output)
self._map[output_id] = obj
return obj
return wrapped_func
return decorator
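A standalone sketch of wrap_uniq in use; Container and raw_nodes are made up, but the shape mirrors how Feed.__getitem__ and wrap_item use it further down:
from morss.feeds import wrap_uniq

raw_nodes = ['<node0>', '<node1>']       # stand-ins for lxml elements

class Container(object):
    def wrap_item(self, raw):
        return ['wrapped', raw]          # stand-in for Item(raw, rules, parent)

    @wrap_uniq('wrap_item')
    def __getitem__(self, key):
        return raw_nodes[key]

c = Container()
assert c[0] is c[0]                      # the same wrapper object is returned both times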
class Feed(object):
itemsClass = 'Item'
itemsClass = property(lambda x: Item) # because Item is defined below, i.e. afterwards
dic = ('title', 'desc', 'items')
def wrap_items(self, items):
itemsClass = globals()[self.itemsClass]
return [itemsClass(x, self.rules, self) for x in items]
title = property(
lambda f: f.get('title'),
lambda f,x: f.set('title', x),
@@ -630,10 +676,7 @@ class Feed(object):
self.rule_create(self.rules['items'])
item = self.items[-1]
if new is None:
return
for attr in globals()[self.itemsClass].dic:
for attr in self.itemsClass.dic:
try:
setattr(item, attr, getattr(new, attr))
@@ -644,8 +687,14 @@ class Feed(object):
except (IndexError, TypeError):
pass
return item
def wrap_item(self, item):
return self.itemsClass(item, self.rules, self)
@wrap_uniq('wrap_item')
def __getitem__(self, key):
return self.wrap_items(self.get_raw('items'))[key]
return self.get_raw('items')[key]
def __delitem__(self, key):
self[key].remove()
@@ -654,7 +703,7 @@ class Feed(object):
return len(self.get_raw('items'))
class Item(Uniq):
class Item(object):
dic = ('title', 'link', 'desc', 'content', 'time', 'updated')
def __init__(self, xml=None, rules=None, parent=None):
@@ -693,32 +742,45 @@ class Item(Uniq):
lambda f: f.rmv('item_updated') )
class FeedXML(Feed, ParserXML):
itemsClass = 'ItemXML'
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
if self.root.getprevious() is None:
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
class ItemXML(Item, ParserXML):
pass
class FeedHTML(Feed, ParserHTML):
itemsClass = 'ItemHTML'
class FeedXML(Feed, ParserXML):
itemsClass = ItemXML
def root_siblings(self):
out = []
current = self.root.getprevious()
while current is not None:
out.append(current)
current = current.getprevious()
return out
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
# and to add stylesheet
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
for stylesheet in stylesheets:
# remove all stylesheets present (be that ours or others')
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
self.root.remove(stylesheet)
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
class ItemHTML(Item, ParserHTML):
pass
class FeedJSON(Feed, ParserJSON):
itemsClass = 'ItemJSON'
class FeedHTML(Feed, ParserHTML):
itemsClass = ItemHTML
class ItemJSON(Item, ParserJSON):
@@ -732,3 +794,20 @@ class ItemJSON(Item, ParserJSON):
return
cur = cur[node]
class FeedJSON(Feed, ParserJSON):
itemsClass = ItemJSON
if __name__ == '__main__':
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
if sys.flags.interactive:
print('>>> Interactive shell: try using `feed`')
else:
for item in feed.items:
print(item.title, item.link)


@@ -1,9 +1,25 @@
import sys
import os
import os.path
import time
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import threading
import os
import time
from datetime import datetime
from dateutil import tz
from fnmatch import fnmatch
import re
@@ -12,67 +28,49 @@ import lxml.etree
import lxml.html
from . import feeds
from . import feedify
from . import crawler
from . import readabilite
import wsgiref.simple_server
import wsgiref.handlers
try:
# python 2
from Queue import Queue
from httplib import HTTPException
from urllib import quote_plus
from urlparse import urlparse, urljoin, parse_qs
except ImportError:
# python 3
from queue import Queue
from http.client import HTTPException
from urllib.parse import quote_plus
from urllib.parse import urlparse, urljoin, parse_qs
LIM_ITEM = 100 # deletes what's beyond
LIM_TIME = 7 # deletes what's after
MAX_ITEM = 50 # cache-only beyond
MAX_TIME = 7 # cache-only after (in sec)
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
PROTOCOL = ['http', 'https', 'ftp']
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
def filterOptions(options):
return options
# example of filtering code below
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
class MorssException(Exception):
pass
def log(txt, force=False):
if DEBUG or force:
def log(txt):
if 'DEBUG' in os.environ:
if 'REQUEST_URI' in os.environ:
# when running on Apache
open('morss.log', 'a').write("%s\n" % repr(txt))
else:
# when using internal server or cli
print(repr(txt))
def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
@@ -80,6 +78,7 @@ def len_html(txt):
def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
return 0
@@ -88,12 +87,14 @@ class Options:
if len(args):
self.options = args
self.options.update(options or {})
else:
self.options = options or {}
def __getattr__(self, key):
if key in self.options:
return self.options[key]
else:
return False
@@ -104,28 +105,11 @@ class Options:
return key in self.options
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
if split[0].lower() == 'true':
out[split[0]] = True
elif split[0].lower() == 'false':
out[split[0]] = False
else:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
def ItemFix(item, feedurl='/'):
def ItemFix(item, options, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper():
if item.title is not None and len(item.title) > 20 and item.title.isupper():
item.title = item.title.title()
# check if it includes link
@@ -140,6 +124,13 @@ def ItemFix(item, feedurl='/'):
item.link = match[0]
log(item.link)
# if requested by the user, use the first <a> link instead
if options.firstlink and (item.desc or item.content):
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
if len(match):
item.link = match[0]
log(item.link)
# check relative urls
item.link = urljoin(feedurl, item.link)
@@ -158,6 +149,11 @@ def ItemFix(item, feedurl='/'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# pocket
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -183,7 +179,7 @@ def ItemFix(item, feedurl='/'):
# reddit
if urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
if len(match):
item.link = match[0]
log(item.link)
@@ -196,59 +192,43 @@ def ItemFill(item, options, feedurl='/', fast=False):
if not item.link:
log('no link')
return item
return True
log(item.link)
link = item.link
# twitter
if urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# download
delay = -1
if fast:
# super-fast mode
if fast or options.fast:
# force cache, don't fetch
delay = -2
elif options.force:
# force refresh
delay = 0
else:
delay = 24*60*60 # 24h
try:
con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
return False # let's just delete errors stuff when in cache mode
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
log('non-text page')
return True
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if out is not None:
item.content = out
if options.resolve:
item.link = req['url']
return True
@@ -265,10 +245,7 @@ def ItemBefore(item, options):
def ItemAfter(item, options):
if options.clip and item.desc and item.content:
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc
if not options.keep and not options.proxy:
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content
del item.desc
if options.nolink and item.content:
@@ -276,7 +253,7 @@ def ItemAfter(item, options):
for link in content.xpath('//a'):
log(link.text_content())
link.drop_tag()
item.content = lxml.etree.tostring(content)
item.content = lxml.etree.tostring(content, method='html')
if options.noref:
item.link = ''
@@ -285,71 +262,50 @@ def ItemAfter(item, options):
def FeedFetch(url, options):
# basic url clean-up
if url is None:
raise MorssException('No url provided')
if urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
if isinstance(url, bytes):
url = url.decode()
# allow for code execution for feedify
pre = feedify.pre_worker(url)
if pre:
url = pre
log('url redirect')
log(url)
# fetch feed
delay = DELAY
if options.theforce:
if options.force:
delay = 0
try:
con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
encoding=options.encoding, basic=not options.items) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
req = crawler.adv_get(url=url, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
contenttype = con.info().get('Content-Type', '').split(';')[0]
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml, url, contenttype)
feed.rule
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
rss.rules['items'] = options.items
if options.item_title:
rss.rules['item_title'] = options.item_title
if options.item_link:
rss.rules['item_link'] = options.item_link
rss.rules['item_title'] = options.item_title if options.item_title else '.'
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
if options.item_content:
rss.rules['item_content'] = options.item_content
if options.item_time:
rss.rules['item_time'] = options.item_time
rss = rss.convert(feeds.FeedXML)
else:
try:
rss = feeds.parse(xml, url, contenttype)
rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost
except TypeError:
log('random page')
log(contenttype)
log(req['contenttype'])
raise MorssException('Link provided is not a valid feed')
return rss
return req['url'], rss
def FeedGather(rss, url, options):
@@ -361,42 +317,37 @@ def FeedGather(rss, url, options):
lim_time = LIM_TIME
max_item = MAX_ITEM
max_time = MAX_TIME
threads = THREADS
if options.cache:
max_time = 0
if options.mono:
threads = 1
if options.newest:
# :newest take the newest items
now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
# set
def runner(queue):
while True:
value = queue.get()
try:
worker(*value)
except Exception as e:
log('Thread Error: %s' % e.message)
queue.task_done()
else:
# default behavior, take the first items (in appearing order)
sorted_items = list(rss.items)
def worker(i, item):
for i, item in enumerate(sorted_items):
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
return
continue
item = ItemBefore(item, options)
if item is None:
return
continue
item = ItemFix(item, url)
item = ItemFix(item, options, url)
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if ItemFill(item, options, url, True) is False:
item.remove()
return
continue
else:
if not options.proxy:
@@ -404,22 +355,6 @@ def FeedGather(rss, url, options):
item = ItemAfter(item, options)
queue = Queue()
for i in range(threads):
t = threading.Thread(target=runner, args=(queue,))
t.daemon = True
t.start()
for i, item in enumerate(list(rss.items)):
if threads == 1:
worker(*[i, item])
else:
queue.put([i, item])
if threads != 1:
queue.join()
if options.ad:
new = rss.items.append()
new.title = "Are you hungry?"
@@ -433,37 +368,38 @@ def FeedGather(rss, url, options):
return rss
def FeedFormat(rss, options):
def FeedFormat(rss, options, encoding='utf-8'):
if options.callback:
if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
return '%s(%s)' % (options.callback, rss.tojson())
out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
return out if encoding == 'unicode' else out.encode(encoding)
else:
raise MorssException('Invalid callback var name')
elif options.json:
elif options.format == 'json':
if options.indent:
return rss.tojson(encoding='UTF-8', indent=4)
return rss.tojson(encoding=encoding, indent=4)
else:
return rss.tojson(encoding='UTF-8')
return rss.tojson(encoding=encoding)
elif options.csv:
return rss.tocsv(encoding='UTF-8')
elif options.format == 'csv':
return rss.tocsv(encoding=encoding)
elif options.reader:
elif options.format == 'html':
if options.indent:
return rss.tohtml(encoding='UTF-8', pretty_print=True)
return rss.tohtml(encoding=encoding, pretty_print=True)
else:
return rss.tohtml(encoding='UTF-8')
return rss.tohtml(encoding=encoding)
else:
else: # i.e. format == 'rss'
if options.indent:
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
else:
return rss.torss(xml_declaration=True, encoding='UTF-8')
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
def process(url, cache=None, options=None):
@@ -475,187 +411,7 @@ def process(url, cache=None, options=None):
if cache:
crawler.default_cache = crawler.SQLiteCache(cache)
rss = FeedFetch(url, options)
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
return FeedFormat(rss, options)
def cgi_app(environ, start_response):
# get options
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
options = []
# init
options = Options(filterOptions(parseOptions(options)))
headers = {}
global DEBUG
DEBUG = options.debug
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html or options.reader:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.json:
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.csv:
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done
rss = FeedFetch(url, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = rss.mimetype[0]
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
return out
def cgi_wrapper(environ, start_response):
# simple http server for html and css
files = {
'': 'text/html',
'index.html': 'text/html'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if url in files:
headers = {}
if url == '':
url = 'index.html'
if '--root' in sys.argv[1:]:
path = os.path.join(sys.argv[-1], url)
else:
path = url
try:
body = open(path, 'rb').read()
headers['status'] = '200 OK'
headers['content-type'] = files[url]
start_response(headers['status'], list(headers.items()))
return [body]
except IOError:
headers['status'] = '404 Not found'
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
# actual morss use
try:
return [cgi_app(environ, start_response) or '(empty)']
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR <%s>: %s' % (url, e.message), force=True)
return ['An error happened:\n%s' % e.message]
def cli_app():
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
url = sys.argv[-1]
global DEBUG
DEBUG = options.debug
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
log('done')
def isInt(string):
try:
int(string)
return True
except ValueError:
return False
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
# start internal (basic) http server
if len(sys.argv) > 1 and isInt(sys.argv[1]):
argPort = int(sys.argv[1])
if argPort > 0:
port = argPort
else:
raise MorssException('Port must be positive integer')
else:
port = PORT
print('Serving http://localhost:%s/'%port)
httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
httpd.serve_forever()
else:
# as a CLI app
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
main()
return FeedFormat(rss, options, 'unicode')


@@ -1,13 +1,34 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
import re
def parse(data, encoding=None):
if encoding:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
@@ -60,9 +81,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
regex_good = re.compile('|'.join(class_good), re.I)
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
'button', 'footer']
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
'form', 'input', 'textarea', 'button', 'footer']
tags_bad = tags_junk + ['a', 'aside']
@@ -90,13 +112,24 @@ def score_node(node):
" Score individual node "
score = 0
class_id = node.get('class', '') + node.get('id', '')
class_id = (node.get('class') or '') + (node.get('id') or '')
if (isinstance(node, lxml.html.HtmlComment)
or node.tag in tags_bad
or regex_bad.search(class_id)):
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
return 0
if node.tag in tags_dangerous:
return 0
if node.tag in tags_junk:
score += -1 # effectively -2, as tags_junk is included in tags_bad
if node.tag in tags_bad:
score += -1
if regex_bad.search(class_id):
score += -1
if node.tag in tags_good:
score += 4
@@ -109,38 +142,47 @@ def score_node(node):
if wc != 0:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
score = score * ( 1 - float(wca)/wc )
score = score * ( 1 - 2 * float(wca)/wc )
return score
def score_all(node, grades=None):
def score_all(node):
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
if grades is None:
grades = {}
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
child.attrib['morss_own_score'] = str(float(score))
if score > 0:
spread_score(child, score, grades)
score_all(child, grades)
return grades
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def spread_score(node, score, grades):
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def get_score(node):
return float(node.attrib.get('morss_score', 0))
def incr_score(node, delta):
set_score(node, get_score(node) + delta)
def get_all_scores(node):
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
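A quick illustration of the new attribute-based scoring helpers, assuming they are importable from morss.readabilite:
import lxml.html
from morss.readabilite import set_score, incr_score, get_score

node = lxml.html.fromstring('<div><p>hello</p></div>')
set_score(node, 2.5)
incr_score(node, 1)
print(get_score(node))              # 3.5
print(node.attrib['morss_score'])   # '3.5', carried on the element itself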
def spread_score(node, score):
" Spread the node's score to its parents, on a linear way "
delta = score / 2
for ancestor in [node,] + list(node.iterancestors()):
if score >= 1 or ancestor is node:
try:
grades[ancestor] += score
except KeyError:
grades[ancestor] = score
incr_score(ancestor, score)
score -= delta
@@ -148,26 +190,29 @@ def spread_score(node, score, grades):
break
def write_score_all(root, grades):
" Useful for debugging "
for node in root.iter():
node.attrib['score'] = str(int(grades.get(node, 0)))
def clean_root(root):
def clean_root(root, keep_threshold=None):
for node in list(root):
clean_root(node)
clean_node(node)
# bottom-up approach, i.e. starting with children before cleaning current node
clean_root(node, keep_threshold)
clean_node(node, keep_threshold)
def clean_node(node):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
# remove dangerous tags, no matter what
if node.tag in tags_dangerous:
parent.remove(node)
return
# high score, so keep
if keep_threshold is not None and get_score(node) >= keep_threshold:
return
gdparent = parent.getparent()
# remove shitty tags
@@ -266,41 +311,54 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
return nodeA # should always find one though, at least <html/>, but needed for max_depth
def rank_nodes(grades):
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
def get_best_node(grades):
" To pick the best (raw) node. Another function will clean it "
if len(grades) == 1:
return grades[0]
top = rank_nodes(grades)
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
return lowest
def get_article(data, url=None, encoding=None):
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
scores = score_all(html)
html = parse(data, encoding_in)
score_all(html)
if not len(scores):
# rank all nodes (largest to smallest)
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
# minimum threshold
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
return None
best = get_best_node(scores)
# take the common ancestor of the two highest rated nodes
if len(ranked_nodes) > 1:
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
else:
best = ranked_nodes[0]
# clean up
if not debug:
keep_threshold = get_score(ranked_nodes[0]) * 3/4
clean_root(best, keep_threshold)
# check for spammy content (links only)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3:
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None
# fix urls
if url:
best.make_links_absolute(url)
clean_root(best)
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)
return lxml.etree.tostring(best, pretty_print=True)
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if sys.flags.interactive:
print('>>> Interactive shell: try using `article`')
else:
print(article)


@@ -1,210 +0,0 @@
@require(feed)
<!DOCTYPE html>
<html>
<head>
<title>@feed.title &#8211; via morss</title>
<meta charset="UTF-8" />
<meta name="description" content="@feed.desc (via morss)" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<style type="text/css">
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
* {
box-sizing: border-box;
}
#content {
width: 100%;
max-width: 1140px;
min-width: 755px;
margin: 0 auto;
overflow: hidden;
padding-top: 20px;
padding-left: 20px; /* grid-space to left */
padding-right: 0px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-20px=0 */
}
.item {
width: 33.33%;
float: left;
padding-right: 20px; /* column-space */
}
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
#content {
width: 100%;
min-width: 0;
margin-left: 0px;
margin-right: 0px;
padding-left: 20px; /* grid-space to left */
padding-right: 10px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-10px=10px */
}
.item {
width: auto;
float: none;
margin-left: 0px;
margin-right: 0px;
margin-top: 10px;
margin-bottom: 10px;
padding-left: 0px;
padding-right: 10px; /* column-space */
}
}
/* design */
#header h1, #header h2, #header p {
font-family: sans;
text-align: center;
margin: 0;
padding: 0;
}
#header h1 {
font-size: 2.5em;
font-weight: bold;
padding: 1em 0 0.25em;
}
#header h2 {
font-size: 1em;
font-weight: normal;
}
#header p {
color: gray;
font-style: italic;
font-size: 0.75em;
}
#content {
text-align: justify;
}
.item .title {
font-weight: bold;
display: block;
text-align: center;
}
.item .link {
color: inherit;
text-decoration: none;
}
.item:not(.active) {
cursor: pointer;
height: 20em;
margin-bottom: 20px;
overflow: hidden;
text-overflow: ellpisps;
padding: 0.25em;
position: relative;
}
.item:not(.active) .title {
padding-bottom: 0.1em;
margin-bottom: 0.1em;
border-bottom: 1px solid silver;
}
.item:not(.active):before {
content: " ";
display: block;
width: 100%;
position: absolute;
top: 18.5em;
height: 1.5em;
background: linear-gradient(to bottom, rgba(255,255,255,0) 0%, rgba(255,255,255,1) 100%);
}
.item:not(.active) .article * {
max-width: 100%;
font-size: 1em !important;
font-weight: normal;
display: inline;
margin: 0;
}
.item.active {
background: white;
position: fixed;
overflow: auto;
top: 0;
left: 0;
height: 100%;
width: 100%;
z-index: 1;
}
body.noscroll {
overflow: hidden;
}
.item.active > * {
max-width: 700px;
margin: auto;
}
.item.active .title {
font-size: 2em;
padding: 0.5em 0;
}
.item.active .article object,
.item.active .article video,
.item.active .article audio {
display: none;
}
.item.active .article img {
max-height: 20em;
max-width: 100%;
}
</style>
</head>
<body>
<div id="header">
<h1>@feed.title</h1>
@if feed.desc:
<h2>@feed.desc</h2>
@end
<p>- via morss</p>
</div>
<div id="content">
@for item in feed.items:
<div class="item">
@if item.link:
<a class="title link" href="@item.link" target="_blank">@item.title</a>
@else:
<span class="title">@item.title</span>
@end
<div class="article">
@if item.content:
@item.content
@else:
@item.desc
@end
</div>
</div>
@end
</div>
<script>
var items = document.getElementsByClassName('item')
for (var i in items)
items[i].onclick = function()
{
this.classList.toggle('active')
document.body.classList.toggle('noscroll')
}
</script>
</body>
</html>

morss/wsgi.py (new file, 295 lines)

@@ -0,0 +1,295 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
import re
import lxml.etree
import cgitb
import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
import mimetypes
try:
# python 2
from urllib import unquote
except ImportError:
# python 3
from urllib.parse import unquote
from . import crawler
from . import readabilite
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, log, TIMEOUT, DELAY, MorssException
PORT = int(os.getenv('PORT', 8080))
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
def get_path(environ):
if 'REQUEST_URI' in environ:
# when running on Apache
url = unquote(environ['REQUEST_URI'][1:])
else:
# when using internal server
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
return url
def cgi_parse_environ(environ):
# get options
url = get_path(environ)
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
raw_options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
raw_options = []
# init
options = Options(parse_options(raw_options))
return (url, options)
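A sketch of how an option-carrying url is broken down by cgi_parse_environ and parse_options; the url below is an example:
from morss.wsgi import parse_options
# '/:format=json:indent/https://morss.it/' is split into
#   raw_options == ['format=json', 'indent'] and url == 'https://morss.it/'
print(parse_options(['format=json', 'indent']))   # {'format': 'json', 'indent': True}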
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
headers['x-content-type-options'] = 'nosniff' # safari work around
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.format == 'html':
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.format == 'json':
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.format == 'csv':
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
# get the work done
url, rss = FeedFetch(url, options)
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
else:
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
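A standalone sketch of the middleware decorator in use, with a made-up middleware and app:
from morss.wsgi import middleware

@middleware
def lowercase_headers(environ, start_response, app):
    # example middleware: lower-cases response header names on the way out
    def my_start_response(status, headers, exc_info=None):
        return start_response(status, [(k.lower(), v) for k, v in headers], exc_info)
    return app(environ, my_start_response)

def hello_app(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/plain')])
    return [b'hello']

app = lowercase_headers(hello_app)   # same chaining pattern as at the bottom of this file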
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
url = get_path(environ)
if url == '':
url = 'index.html'
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths)
paths = [
os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)
]
for path in paths:
try:
f = open(path, 'rb')
except IOError:
# problem with file (cannot open or not found)
continue
else:
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
# regex didn't validate or no file found
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
elif options.get == 'article':
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
else:
raise MorssException('no :get option passed')
else:
output = req['data']
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
def cgi_start_server():
crawler.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application)
httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
crawler.default_cache.autotrim()


@@ -1,4 +0,0 @@
lxml
python-dateutil <= 1.5
chardet
pymysql


@@ -1,14 +1,24 @@
from setuptools import setup, find_packages
from setuptools import setup
from glob import glob
package_name = 'morss'
setup(
name=package_name,
description='Get full-text RSS feeds',
author='pictuga, Samuel Marks',
author_email='contact at pictuga dot com',
url='http://morss.it/',
license='AGPL v3',
package_dir={package_name: package_name},
packages=find_packages(),
package_data={package_name: ['feedify.ini', 'reader.html.template']},
test_suite=package_name + '.tests')
name = package_name,
description = 'Get full-text RSS feeds',
author = 'pictuga, Samuel Marks',
author_email = 'contact at pictuga dot com',
url = 'http://morss.it/',
download_url = 'https://git.pictuga.com/pictuga/morss',
license = 'AGPL v3',
packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
package_data = {package_name: ['feedify.ini']},
data_files = [
('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
],
entry_points = {
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
})


@@ -4,6 +4,12 @@ ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
# Uncomment below line to turn debug on for all requests
#SetEnv DEBUG 1
# Uncomment below line to turn debug on for requests with :debug in the url
#SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>


@@ -4,6 +4,7 @@
<title>morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta charset="UTF-8" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css">
body
{
@@ -35,8 +36,8 @@
<input type="text" id="url" name="url" placeholder="Feed url (http://example.com/feed.xml)" />
</form>
<code>Copyright: pictuga 2013-2014<br/>
Source code: https://github.com/pictuga/morss</code>
<code>Copyright: pictuga 2013-2020<br/>
Source code: https://git.pictuga.com/pictuga/morss</code>
<script>
form = document.forms[0]

www/logo.svg (new file, 17 lines)

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="16" height="16" viewBox="0 0 16 16" shape-rendering="crispEdges" fill="black" version="1.1" xmlns="http://www.w3.org/2000/svg">
<rect x="2" y="4" width="2" height="2" />
<rect x="5" y="4" width="6" height="2" />
<rect x="12" y="4" width="2" height="2" />
<rect x="2" y="7" width="2" height="2" />
<rect x="7" y="7" width="2" height="2" />
<rect x="12" y="7" width="2" height="2" />
<rect x="2" y="10" width="2" height="2" />
<rect x="7" y="10" width="2" height="2" />
<rect x="12" y="10" width="2" height="2" />
</svg>
<!-- This work by pictuga is licensed under CC BY-NC-SA 4.0. To view a copy of
this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0 -->


@@ -1,5 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.1" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:stylesheet version="1.1"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:atom03="http://purl.org/atom/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rssfake="http://purl.org/rss/1.0/"
>
<xsl:output method="html"/>
@@ -7,116 +14,288 @@
<html>
<head>
<title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
<meta name="robots" content="noindex" />
<style type="text/css">
body * {
box-sizing: border-box;
}
body {
overflow-wrap: anywhere;
word-wrap: anywhere;
word-break: break-word;
font-family: sans-serif;
-webkit-tap-highlight-color: transparent; /* safari work around */
}
#url {
background-color: rgba(255, 165, 0, 0.25);
padding: 1% 5%;
display: inline-block;
input, select {
font-family: inherit;
font-size: inherit;
text-align: inherit;
}
header {
text-align: justify;
text-align-last: center;
border-bottom: 1px solid silver;
}
.input-combo {
display: flex;
flex-flow: row;
align-items: stretch;
width: 800px;
max-width: 100%;
}
margin: auto;
body > ul {
border: 1px solid grey;
padding: .5em .5em;
background-color: #FFFAF4;
}
.input-combo * {
display: inline-block;
line-height: 2em;
border: 0;
background: transparent;
}
.input-combo > :not(.button) {
max-width: 100%;
flex-grow: 1;
flex-shrink: 0;
white-space: nowrap;
text-overflow: ellipsis;
overflow: hidden;
}
.input-combo .button {
flex-grow: 0;
flex-shrink: 1;
cursor: pointer;
min-width: 2em;
text-align: center;
border-left: 1px solid silver;
color: #06f;
}
[onclick_title] {
cursor: pointer;
position: relative;
}
[onclick_title]::before {
opacity: 0;
content: attr(onclick_title);
font-weight: normal;
position: absolute;
left: -300%;
z-index: 1;
background: grey;
color: white;
border-radius: 0.5em;
padding: 0 1em;
}
[onclick_title]:not(:active)::before {
transition: opacity 1s ease-in-out;
}
[onclick_title]:active::before {
opacity: 1;
}
header > form {
margin: 1%;
}
header a {
text-decoration: inherit;
color: #FF7B0A;
font-weight: bold;
}
.item {
background-color: #FFFAF4;
border: 1px solid silver;
margin: 1%;
max-width: 100%;
}
.item > * {
padding: 1%;
}
.item > *:empty {
display: none;
}
.item > :not(:last-child) {
border-bottom: 1px solid silver;
}
.item > a {
display: block;
font-weight: bold;
font-size: 1.5em;
}
.desc, .content {
overflow: hidden;
}
.desc *, .content * {
max-width: 100%;
}
ul {
list-style-type: none;
}
.tag {
color: darkred;
}
.attr {
color: darksalmon;
}
.value {
color: darkblue;
}
.comment {
color: lightgrey;
}
pre {
margin: 0;
max-width: 100%;
white-space: normal;
}
</style>
</head>
<body>
<h1>RSS feed by morss</h1>
<header>
<h1>RSS feed by morss</h1>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p>
<div id="url"></div>
<div class="input-combo">
<input id="url" readonly="readonly"/>
<span class="button" onclick="copy_link()" title="Copy" onclick_title="Copied">
<svg width="16px" height="16px" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" d="M4 1.5H3a2 2 0 00-2 2V14a2 2 0 002 2h10a2 2 0 002-2V3.5a2 2 0 00-2-2h-1v1h1a1 1 0 011 1V14a1 1 0 01-1 1H3a1 1 0 01-1-1V3.5a1 1 0 011-1h1v-1z" clip-rule="evenodd"/>
<path fill-rule="evenodd" d="M9.5 1h-3a.5.5 0 00-.5.5v1a.5.5 0 00.5.5h3a.5.5 0 00.5-.5v-1a.5.5 0 00-.5-.5zm-3-1A1.5 1.5 0 005 1.5v1A1.5 1.5 0 006.5 4h3A1.5 1.5 0 0011 2.5v-1A1.5 1.5 0 009.5 0h-3z" clip-rule="evenodd"/>
</svg>
</span>
</div>
<ul>
<xsl:apply-templates/>
</ul>
<form onchange="open_feed()">
More options: Output the
<select>
<option value="">full-text</option>
<option value=":proxy">original</option>
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
</select>
feed as
<select>
<option value="">RSS</option>
<option value=":json:cors">JSON</option>
<option value=":html">HTML</option>
<option value=":csv">CSV</option>
</select>
using the
<select>
<option value="">standard</option>
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
</select>
link of the
<select>
<option value="">first</option>
<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
</select>
items and
<select>
<option value="">keep</option>
<option value=":nolink:noref">remove</option>
</select>
links
<input type="hidden" value="" name="extra_options"/>
</form>
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p>
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
</header>
<div id="header" dir="auto">
<h1>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
</h1>
<p>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:description|rss/channel/description|atom:feed/atom:subtitle|atom03:feed/atom03:subtitle"/>
</p>
</div>
<div id="content">
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
<div class="item" dir="auto">
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
</a>
<div class="desc">
<xsl:copy-of select="rssfake:description|description|atom:summary|atom03:summary"/>
</div>
<div class="content">
<xsl:copy-of select="content:encoded|atom:content|atom03:content"/>
</div>
</div>
</xsl:for-each>
</div>
<script>
document.getElementById("url").innerHTML = window.location.href;
//<![CDATA[
document.getElementById("url").value = window.location.href
if (!/:html/.test(window.location.href))
for (var content of document.querySelectorAll(".desc,.content"))
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
var options = parse_location()[0]
if (options) {
for (var select of document.forms[0].elements)
if (select.tagName == 'SELECT')
for (var option of select)
if (option.value && options.match(option.value)) {
select.value = option.value
options = options.replace(option.value, '')
break
}
document.forms[0]['extra_options'].value = options
}
function copy_content(input) {
input.focus()
input.select()
document.execCommand('copy')
input.blur()
}
function copy_link() {
copy_content(document.getElementById("url"))
}
function parse_location() {
return (window.location.pathname + window.location.search).match(/^\/(?:(:[^\/]+)\/)?(.*$)$/).slice(1)
}
function open_feed() {
var url = parse_location()[1]
var options = Array.from(document.forms[0].elements).map(x=>x.value).join('')
var target = '/' + (options ? options + '/' : '') + url
if (target != window.location.pathname)
window.location.href = target
}
//]]>
</script>
</body>
</html>
</xsl:template>
<xsl:template match="*">
<li>
<span class="element">
&lt;
<span class="tag"><xsl:value-of select="name()"/></span>
<xsl:for-each select="@*">
<span class="attr"> <xsl:value-of select="name()"/></span>
=
"<span class="value"><xsl:value-of select="."/></span>"
</xsl:for-each>
&gt;
</span>
<xsl:if test="node()">
<ul>
<xsl:apply-templates/>
</ul>
</xsl:if>
<span class="element">
&lt;/
<span class="tag"><xsl:value-of select="name()"/></span>
&gt;
</span>
</li>
</xsl:template>
<xsl:template match="comment()">
<li>
<pre class="comment"><![CDATA[<!--]]><xsl:value-of select="."/><![CDATA[-->]]></pre>
</li>
</xsl:template>
<xsl:template match="text()">
<li>
<pre>
<xsl:value-of select="normalize-space(.)"/>
</pre>
</li>
</xsl:template>
<xsl:template match="text()[not(normalize-space())]"/>
</xsl:stylesheet>
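
The form and script in the stylesheet above encode the selected options as a single ":opt1:opt2" path segment placed in front of the feed URL (see open_feed() and parse_location()). The following is a minimal Python sketch of that URL scheme, for illustration only; it is not taken from the repository, and the helper names build_morss_path/parse_morss_path are hypothetical.

import re

def build_morss_path(feed_url, options=()):
    # Hypothetical helper: join options into one path segment,
    # e.g. (':proxy', ':html') -> '/:proxy:html/https://...'
    opts = ''.join(options)
    return '/' + (opts + '/' if opts else '') + feed_url

def parse_morss_path(path):
    # Same regular expression as the JavaScript parse_location(), in Python syntax
    return re.match(r'^/(?:(:[^/]+)/)?(.*$)$', path).groups()

print(build_morss_path('https://example.com/feed.xml', (':proxy', ':html')))
# -> /:proxy:html/https://example.com/feed.xml
print(parse_morss_path('/:proxy:html/https://example.com/feed.xml'))
# -> (':proxy:html', 'https://example.com/feed.xml')
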