Compare commits


169 Commits

Author SHA1 Message Date
75935114e4 Remove leftover code 2020-08-23 19:07:12 +02:00
5bd2557619 Fix typo in provided .htaccess 2020-08-23 19:01:34 +02:00
598a2591f1 Dockerfile: remove confusing one-liner code 2020-08-23 18:59:16 +02:00
e76ab2b631 Update gunicorn instructions 2020-08-23 18:59:02 +02:00
aa9143302b Remove now-unused isInt code 2020-08-23 18:51:09 +02:00
0d62a7625b Define http port via env vars as well 2020-08-23 18:50:18 +02:00
bd0efb1529 crawler: missing os import 2020-08-23 18:45:44 +02:00
47a17614ef Rename morss/cgi.py into morss/wsgi.py
To avoid name collision with the built-in cgi lib
2020-08-23 18:44:49 +02:00
4dfebe78f7 Pick caching backend via env vars 2020-08-23 18:43:18 +02:00
dcd3e4a675 cgi.py: add missing impots 2020-08-23 18:31:05 +02:00
e968b2ea7f Remove leftover :debug code 2020-08-23 16:59:34 +02:00
0ac590c798 Set MAX_/LIM_* settings via env var 2020-08-23 16:09:58 +02:00
fa1b5aef09 Instructions for DEBUG= use 2020-08-23 15:31:11 +02:00
7f6309f618 README: :silent was explained twice 2020-08-23 14:34:04 +02:00
f65fb45030 :debug completely deprecated in favour of DEBUG= 2020-08-23 14:33:32 +02:00
6dd40e5cc4 cli.py: fix Options code 2020-08-23 14:25:09 +02:00
0acfce5a22 cli.py: remove log 2020-08-23 14:24:57 +02:00
97ccc15db0 cgi.py: rename parseOptions to parse_options 2020-08-23 14:24:23 +02:00
7a560181f7 Use env var for DEBUG 2020-08-23 14:23:45 +02:00
baccd3b22b Move parseOptions to cgi.py
As it is no longer used in cli.py
2020-08-22 00:37:34 +02:00
f79938ab11 Add :silent to readme & argparse 2020-08-22 00:02:08 +02:00
5b8bd47829 cli.py: remove draft code 2020-08-21 23:59:12 +02:00
b5b355aa6e readabilite: increase penalty for high link density 2020-08-21 23:55:04 +02:00
94097f481a sheet.xsl: better handle some corner cases 2020-08-21 23:54:35 +02:00
8161baa7ae sheet.xsl: improve css 2020-08-21 23:54:12 +02:00
bd182bcb85 Move cli code to argParse
Related code changes (incl. :format=xyz)
2020-08-21 23:52:56 +02:00
c7c2c5d749 Removed unused filterOptions code 2020-08-21 23:23:33 +02:00
c6b52e625f split morss.py into __main__/cgi/cli.py
Should hopefully allow cleaner code in the future
2020-08-21 22:17:55 +02:00
c6d3a0eb53 readabilite: clean up code 2020-07-15 00:49:34 +02:00
c628ee802c README: add docker-compose instructions 2020-07-13 20:50:39 +02:00
6021b912ff morss: fix item removal
Usual issue when editing a list while looping over it
2020-07-06 19:25:48 +02:00
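A standalone sketch of the pitfall this commit message refers to (not the project's code): removing entries from a list while iterating over it makes the iterator skip the element that follows each removal, so the usual fix is to loop over a copy.

```python
items = [1, 2, 2, 3]

for x in items:        # buggy: each removal shifts the remaining elements,
    if x == 2:         # so the element right after it gets skipped
        items.remove(x)

print(items)           # [1, 2, 3]  (one of the 2s survived)

items = [1, 2, 2, 3]

for x in list(items):  # safe: iterate over a copy, mutate the original
    if x == 2:
        items.remove(x)

print(items)           # [1, 3]
```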
f18a128ee6 Change :first for :newest
i.e. toggle default for the more-obvious option
2020-07-06 19:25:17 +02:00
64af86c11e crawler: catch html parsing errors 2020-07-06 12:25:38 +02:00
15951d228c Add :first to NOT sort items by date 2020-07-06 11:39:08 +02:00
c1b1f5f58a morss: restrict iframe use from :get to avoid abuse 2020-06-09 12:33:37 +02:00
985185f47f morss: more flexible feed creator auto-detection 2020-06-08 13:03:24 +02:00
3190d1ec5a feeds: remove useless if(len) before loop 2020-06-02 13:57:45 +02:00
9815794a97 sheet.xsl: make text more self explanatory 2020-05-27 21:42:00 +02:00
758b6861b9 sheet.xsl: fix text alignment 2020-05-27 21:36:11 +02:00
ce4cf01aa6 crawler: clean up encoding detection code 2020-05-27 21:35:24 +02:00
dcfdb75a15 crawler: fix chinese encoding support 2020-05-27 21:34:43 +02:00
4ccc0dafcd Basic help for sub-lib interactive use 2020-05-26 19:34:20 +02:00
2fe3e0b8ee feeds: clean up other stylesheets before putting ours 2020-05-26 19:26:36 +02:00
ad3ba9de1a sheet.xsl: add <select/> to use :firstlink 2020-05-13 12:33:12 +02:00
68c46a1823 morss: remove deprecated twitter/fb link handling 2020-05-13 12:31:09 +02:00
91be2d229e morss: ability to use first link from desc instead of default link 2020-05-13 12:29:53 +02:00
038f267ea2 Rename :theforce into :force 2020-05-13 11:49:15 +02:00
22005065e8 Use etree.tostring 'method' arg
Gives appropriately formatted html code.
Some pages might otherwise be rendered as blank.
2020-05-13 11:44:34 +02:00
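A standalone illustration of the lxml behaviour this commit message alludes to (not the project's code): with the default XML serializer, empty elements come out self-closed (e.g. `<script/>`), which some browsers treat as an unterminated tag, leaving the rest of the page blank; `method='html'` emits proper closing tags.

```python
import lxml.html
from lxml import etree

root = lxml.html.fromstring('<div><script src="a.js"></script><p>hi</p></div>')

print(etree.tostring(root))                  # b'<div><script src="a.js"/><p>hi</p></div>'
print(etree.tostring(root, method='html'))   # b'<div><script src="a.js"></script><p>hi</p></div>'
```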
7d0d416610 morss: cache articles for 24hrs
Also make it possible to refetch articles, regardless of cache
2020-05-12 21:10:31 +02:00
5dac4c69a1 crawler: more code comments 2020-05-12 20:44:25 +02:00
36e2a1c3fd crawler: increase size limit from 100KiB to 500
I'm looking at you, worldbankgroup.csod.com/ats/careersite/search.aspx
2020-05-12 19:34:16 +02:00
83dd2925d3 readabilite: better parsing
Keeping blank_text keeps the tree more as-it, making the final output closer to expectations
2020-05-12 14:15:53 +02:00
e09d0abf54 morss: remove deprecated peace of code 2020-05-07 16:05:30 +02:00
ff26a560cb Shift safari work around to morss.py 2020-05-07 16:04:54 +02:00
74d7a1eca2 sheet.xsl: fix word wrap 2020-05-06 16:58:28 +02:00
eba295cba8 sheet.xsl: fixes for safari 2020-05-06 12:01:27 +02:00
f27631954e .htaccess: bypass Safari RSS detection 2020-05-06 11:47:24 +02:00
c74abfa2f4 sheet.xsl: use CDATA for js code 2020-05-06 11:46:38 +02:00
1d5272c299 sheet.xsl: allow zooming on mobile 2020-05-04 14:44:43 +02:00
f685139137 crawler: use UPSERT statements
Avoid potential race conditions
2020-05-03 21:27:45 +02:00
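The corresponding SQL appears further down in this diff; as a standalone sketch with a simplified table (the `ON CONFLICT` form needs SQLite 3.24+), the idea is to replace the racy "check if the row exists, then INSERT or UPDATE" sequence with a single atomic statement, so two threads writing the same url cannot collide between the check and the write.

```python
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, timestamp INT)')

with con:
    # one atomic UPSERT instead of "if url in cache: UPDATE else: INSERT"
    con.execute(
        'INSERT INTO data VALUES (?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, timestamp=?',
        ('https://morss.it', 200, 0, 200, 0))
```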
73b477665e morss: separate :clip with <hr> instead of stars 2020-05-02 19:19:54 +02:00
b425992783 morss: don't follow alt=rss with custom feeds
To have the same page as with :get=page and to avoid shitty feeds
2020-05-02 19:18:58 +02:00
271ac8f80f crawler: comment code a bit 2020-05-02 19:18:01 +02:00
64e41b807d crawler: handle http:/ (single slash)
Fixing one more corner case! malayalam.oneindia.com
2020-05-02 19:17:15 +02:00
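The fix itself shows up later in this diff as a one-line regex inside `sanitize_url()`; in isolation it behaves like this (standalone sketch using the same pattern):

```python
import re

def fix_single_slash(url):
    # 'http:/badurl' -> 'http://badurl'; well-formed urls are left untouched
    return re.sub('^(https?):/([^/])', r'\1://\2', url)

print(fix_single_slash('http:/malayalam.oneindia.com/rss.xml'))  # http://malayalam.oneindia.com/rss.xml
print(fix_single_slash('https://morss.it/'))                     # https://morss.it/ (unchanged)
```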
a2c4691090 sheet.xsl: dir=auto for rtl languages (arabic, etc.) 2020-04-29 15:01:33 +02:00
b6000923bc README: clean up deprecated code 2020-04-28 22:31:11 +02:00
27a42c47aa morss: use final request url
Code is not very elegant...
2020-04-28 22:30:21 +02:00
c27c38f7c7 crawler: return dict instead of tuple 2020-04-28 22:29:07 +02:00
a1dc96cb50 feeds: remove mimetype from function call as no longer used 2020-04-28 22:07:25 +02:00
749acc87fc Centralize url clean up in crawler.py 2020-04-28 22:03:49 +02:00
c186188557 README: warning about lxml installation 2020-04-28 21:58:26 +02:00
cb69e3167f crawler: accept non-ascii urls
Covering one more corner case!
2020-04-28 14:47:23 +02:00
c3f06da947 morss: process(): specify encoding for clarity 2020-04-28 14:45:00 +02:00
44a3e0edc4 readabilite: specify in- and out-going encoding 2020-04-28 14:44:35 +02:00
4a9b505499 README: update python lib instructions 2020-04-27 18:12:14 +02:00
818cdaaa9b Make it possible to call sub-libs in non interactive mode
Run `python -m morss.feeds http://lemonde.fr` and so on
2020-04-27 18:00:14 +02:00
2806c64326 Make it possible to directly run sub-libs (feeds, crawler, readabilite)
Run `python -im morss.feeds http://website.sample/rss.xml` and so on
2020-04-27 17:19:31 +02:00
d39d7bb19d sheet.xsl: limit overflow 2020-04-25 15:27:49 +02:00
e5e3746fc6 sheet.xsl: show plain url 2020-04-25 15:27:13 +02:00
960c9d10d6 sheet.xsl: customize output feed form 2020-04-25 15:26:47 +02:00
0e7a5b9780 sheet.xsl: wrap header in <header> 2020-04-25 15:24:57 +02:00
186bedcf62 sheet.xsl: smarter html reparser 2020-04-25 15:22:25 +02:00
5847e18e42 sheet: improved feed address output (w/ c/c) 2020-04-25 15:21:47 +02:00
f6bc23927f readabilite: drop dangerous tags (script, style) 2020-04-25 12:25:02 +02:00
c86572374e readabilite: minimum score requirement 2020-04-25 12:24:36 +02:00
59ef5af9e2 feeds: fix bug when deleting attr in html 2020-04-24 22:12:05 +02:00
6a0531ca03 crawler: randomize user agent 2020-04-24 11:28:39 +02:00
8187876a06 crawler: stop at first alternative link
Should save a few ms and the first one is usually (?) the most relevant/generic
2020-04-23 11:23:45 +02:00
325a373e3e feeds: add SyntaxError catch 2020-04-20 16:15:15 +02:00
2719bd6776 crawler: fix chinese encoding 2020-04-20 16:14:55 +02:00
285e1e5f42 docker: pip install local 2020-04-19 13:25:53 +02:00
41a63900c2 README: improve docker instructions 2020-04-19 13:01:08 +02:00
ec8edb02f1 Various small bug fixes 2020-04-19 12:54:02 +02:00
d01b943597 Remove leftover threading var 2020-04-19 12:51:11 +02:00
b361aa2867 Add timeout to :get 2020-04-19 12:50:26 +02:00
4ce3c7cb32 Small code clean ups 2020-04-19 12:50:05 +02:00
7e45b2611d Disable multi-threading
Impact was mostly negative due to locks
2020-04-19 12:29:52 +02:00
036e5190f1 crawler: remove unused code 2020-04-18 21:40:02 +02:00
e99c5b3b71 morss: more sensible default MAX/LIM values 2020-04-18 17:21:45 +02:00
4f44df8d63 Make all ports default to 8080 2020-04-18 17:15:59 +02:00
497c14db81 Add dockerfile & how to in README 2020-04-18 17:04:44 +02:00
a4e1dba8b7 sheet.xsl: improve url display 2020-04-16 10:33:36 +02:00
7375adce33 sheet.xsl: fix & improve 2020-04-15 23:34:28 +02:00
663212de0a sheet.xsl: various cosmetic improvements 2020-04-15 23:22:45 +02:00
4a2ea1bce9 README: add gunicorn instructions 2020-04-15 22:31:21 +02:00
fe82b19c91 Merge .xsl & html template
Turns out they somehow serve a similar purpose
2020-04-15 22:30:45 +02:00
0b31e97492 morss: remove debug code in http file handler 2020-04-14 23:20:03 +02:00
b0ad7c259d Add README & LICENSE to data_files 2020-04-14 19:34:12 +02:00
bffb23f884 README: how to use cli 2020-04-14 18:21:32 +02:00
59139272fd Auto-detect the location of www/
Either ../www or /usr/share/morss
Adapted README accordingly
2020-04-14 18:07:19 +02:00
39b0a1d7cc setup.py: fix deps & files 2020-04-14 17:36:42 +02:00
65803b328d New git url and updated date in provided index.html 2020-04-13 15:30:32 +02:00
e6b7c0eb33 Fix app definition for uwsgi 2020-04-13 15:30:09 +02:00
67c096ad5b feeds: add fake path to default html parser
Without it, some websites were accidentally matching it (false positives)
2020-04-12 13:00:56 +02:00
f018437544 crawler: make mysql backend thread safe 2020-04-12 12:53:05 +02:00
8e5e8d24a4 Timezone fixes 2020-04-10 20:33:59 +02:00
ee78a7875a morss: focus on the most recent feed items 2020-04-10 16:08:13 +02:00
9e7b9d95ee feeds: properly use html template 2020-04-09 20:00:51 +02:00
987a719c4e feeds: try all parsers regardless of contenttype
Turns out some websites send the wrong contenttype (json for html, html for xml, etc.)
2020-04-09 19:17:51 +02:00
47b33f4baa morss: specify server output encoding 2020-04-09 19:10:45 +02:00
3c7f512583 feeds: handle several errors 2020-04-09 19:09:10 +02:00
a32f5a8536 readabilite: add debug option (also used by :get) 2020-04-09 19:08:13 +02:00
63a06524b7 morss: various encoding fixes 2020-04-09 19:06:51 +02:00
b0f80c6d3c morss: fix csv output encoding 2020-04-09 19:05:50 +02:00
78cea10ead morss: replace :getpage with :get
Also provides readabilite debugging
2020-04-09 18:43:20 +02:00
e5a82ff1f4 crawler: drop auto-referer
Was solving some issues. But creating even more issues.
2020-04-07 10:39:21 +02:00
f3d1f92b39 Detect encoding everytime 2020-04-07 10:38:36 +02:00
7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
f1d0431e68 morss: drop :html, replaced with :reader
README updated accordingly
2020-04-07 09:23:29 +02:00
a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
bfad6b7a4a readabilite: clean before counting
To remove links which are not kept anyway
2020-04-06 16:55:39 +02:00
6b8c3e51e7 readabilite: fix threshold feature
Awkward typo...
2020-04-06 16:52:06 +02:00
dc9e425247 readabilite: don't clean-out the top 10% nodes
Loosen up the code once again to limit over-kill
2020-04-06 14:26:28 +02:00
2f48e18bb1 readabilite: put scores directly in html node
Probably slower but makes code somewhat cleaner...
2020-04-06 14:21:41 +02:00
31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
a82ec96eb7 Delete feedify.py leftover code
iTunes integration untested, unreliable and not working...
2020-04-05 22:16:52 +02:00
aad2398e69 feeds: turns out lxml.etree doesn't have drop_tag 2020-04-05 21:50:38 +02:00
eeac630855 crawler: add more "realistic" headers 2020-04-05 21:11:57 +02:00
e136b0feb2 readabilite: loosen the slayer
Previous impl. lead to too many empty results
2020-04-05 20:47:30 +02:00
6cf32af6c0 readabilite: also use BS 2020-04-05 20:46:42 +02:00
568e7d7dd2 feeds: make BS's output bytes for lxml's sake 2020-04-05 20:46:04 +02:00
3617f86e9d morss: make cgi_encore more robust 2020-04-05 16:43:11 +02:00
d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
bf86c1e962 crawler: make AutoUA match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic. Sends the same headers no matter what. Make requests more "replicable".
Also, drop "text/xml" from RSS contenttype, too broad, matches garbage
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to find how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping off ftp support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return sth
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer who-is-what
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to file handled by http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
19 changed files with 1280 additions and 936 deletions

Dockerfile (new file, 8 additions)

@@ -0,0 +1,8 @@
FROM alpine:latest
RUN apk add python3 py3-lxml py3-gunicorn py3-pip git
ADD . /app
RUN pip3 install /app
CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss

README.md (156 lines changed)

@@ -24,15 +24,13 @@ hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
html structure changes on the target website.
Additionally morss can grab the source xml feed of iTunes podcast, and detect
rss feeds in html pages' `<meta>`.
Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Convert iTunes podcast links into xml links
- Export feeds as RSS/JSON/CSV/HTML
- Fetch full-text content of feed items
- Follow 301/meta redirects
@@ -48,6 +46,7 @@ You do need:
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [bs4](https://pypi.org/project/bs4/) for badly-formatted html pages
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
@@ -56,9 +55,13 @@ You do need:
Simplest way to get these:
```shell
pip install -r requirements.txt
pip install git+https://git.pictuga.com/pictuga/morss.git@master
```
The dependency `lxml` takes fairly long to install (especially on Raspberry Pi, as
C code needs to be compiled). If possible on your distribution, try installing
it with the system package manager.
You may also need:
- Apache, with python-cgi support, to run on a server
@@ -70,36 +73,56 @@ morss accepts some arguments, to lightly alter the output of morss. Arguments
may need to have a value (usually a string or a number). In the different "Use
cases" below is detailed how to pass those arguments to morss.
The arguments are:
The list of arguments can be obtained by running `morss --help`
```
usage: morss [-h] [--format {rss,json,html,csv}] [--search STRING] [--clip] [--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink] [--items XPATH] [--item_link XPATH]
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH] [--nolink] [--noref] [--debug]
url
Get full-text RSS feeds
positional arguments:
url feed url
optional arguments:
-h, --help show this help message and exit
output:
--format {rss,json,html,csv}
output format
--search STRING does a basic case-sensitive search in the feed
--clip stick the full article content under the original feed content (useful for twitter)
--indent returns indented XML or JSON, takes more space, but human-readable
action:
--cache only take articles from the cache (ie. don't grab new articles' content), so as to save time
--force force refetch the rss feed and articles
--proxy doesn't fill the articles
--newest return the feed items in chronological order (morss otherwise shows the items by appearing order)
--firstlink pull the first article mentioned in the description instead of the default link
custom feeds:
--items XPATH (mandatory to activate the custom feeds function) xpath rule to match all the RSS entries
--item_link XPATH xpath rule relative to items to point to the entry's link
--item_title XPATH entry's title
--item_content XPATH entry's content
--item_time XPATH entry's date & time (accepts a wide range of time formats)
misc:
--nolink drop links, but keeps links' inner text
--noref drop items' link
--silent don't output the final RSS (useless on its own, but can be nice when debugging)
GNU AGPLv3 code
```
Further options:
- Change what morss does
- `json`: output as JSON
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doesn't work), but with this argument, the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
- `csv`: export to csv
- `indent`: returns indented XML or JSON, takes more space, but human-readable
- `nolink`: drop links, but keeps links' inner text
- `noref`: drop items' link
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
- `debug`: to have some feedback from the script execution. Useful for debugging
- `mono`: disable multithreading while fetching, makes debugging easier
- `theforce`: force download the rss feed and ignore cached http errors
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `html`: changes the http content-type to html, so that python cgi errors (written in html) are readable in a web browser
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
- `item_link`: xpath rule relative to `items` to point to the entry's link
- `item_title`: entry's title
- `item_content`: entry's description
- `item_time`: entry's date & time (accepts a wide range of time formats)
- Environment variable `DEBUG=`: to have some feedback from the script execution. Useful for debugging. On Apache, can be set via the `SetEnv` instruction (see sample `.htaccess` provided).
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
## Use cases
@@ -111,7 +134,6 @@ morss will auto-detect what "mode" to use.
For this, you'll want to rearrange the files a bit, for example
into something like this.
```
/
├── cgi
@@ -143,20 +165,50 @@ ensure that the provided `/www/.htaccess` works well with your server.
Running this command should do:
```shell
uwsgi --http :9090 --plugin python --wsgi-file main.py
uwsgi --http :8080 --plugin python --wsgi-file main.py
```
However, one problem might be how to serve the provided `index.html` file if it
isn't in the same directory. Therefore you can add this at the end of the
command to point to another directory `--pyargv '--root ../../www/'`.
#### Using Gunicorn
```shell
gunicorn morss
```
#### Using docker
Build & run
```shell
docker build https://git.pictuga.com/pictuga/morss.git -t morss
docker run -p 8080:8080 morss
```
With docker-compose:
```yml
services:
app:
build: https://git.pictuga.com/pictuga/morss.git
ports:
- '8080:8080'
```
Then run
```shell
docker-compose up --build
```
#### Using morss' internal HTTP server
Morss can run its own HTTP server. The latter should start when you run morss
without any argument, on port 8080.
You can change the port and the location of the `www/` folder like this `python -m morss 9000 --root ../../www`.
```shell
morss
```
You can change the port using environment variables like this `PORT=9000 morss`.
#### Passing arguments
@@ -176,9 +228,9 @@ Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rs
Run:
```
python[2.7] -m morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
```
For example: `python -m morss debug http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss --debug http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
@@ -191,9 +243,9 @@ scripts can be run on top of the RSS feed, using its
To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
```
[python[2.7]] PATH/TO/MORSS/main.py [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
```
For example: `python2.7 PATH/TO/MORSS/main.py http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
@@ -230,20 +282,23 @@ url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True) # arguments
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
output = morss.Format(rss, options) # formats final feed
output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
```
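Assembled from the updated lines of this hunk, the whole sequence reads roughly as follows (a sketch based on the names shown above; the sqlite cache location is now set through environment variables, as described in the next section):

```python
import morss

url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True)                    # arguments

url, rss = morss.FeedFetch(url, options)             # grabs the RSS feed (and returns the effective url)
rss = morss.FeedGather(rss, url, options)            # fills the feed and cleans it up
output = morss.FeedFormat(rss, options, 'unicode')   # formats the final feed

print(output)
```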
## Cache information
morss uses caching to make loading faster. There are 2 possible cache backends
(visible in `morss/crawler.py`):
morss uses caching to make loading faster. There are 3 possible cache backends,
which can be picked via environment variables:
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
be cleared every time the program is run
- `MySQLCacheHandler`: /!\ Does NOT support multi-threading
- `(nothing/default)`: a simple python in-memory dict() object.
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
will be cleared every time the program is run). Path can be defined with
`SQLITE_PATH`.
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
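These variables are read once, at import time, in `morss/crawler.py` (see the `if 'CACHE' in os.environ:` block further down in this diff), so they must be set before morss is imported, or exported in the environment of the server/CLI process. A minimal sketch from Python:

```python
import os

# must happen before `import morss`: crawler.py picks the backend when it is first imported
os.environ['CACHE'] = 'sqlite'
os.environ['SQLITE_PATH'] = '/tmp/morss-cache.db'   # omit to keep the default in-memory db

# or, for the MySQL backend:
# os.environ['CACHE'] = 'mysql'
# os.environ.update({'MYSQL_USER': '...', 'MYSQL_PWD': '...', 'MYSQL_DB': '...', 'MYSQL_HOST': 'localhost'})

import morss
```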
## Configuration
### Length limitation
@@ -251,7 +306,7 @@ be cleared every time the program is run
When parsing long feeds, with a lot of items (100+), morss might take a lot of
time to parse it, or might even run into a memory overflow on some shared
hosting plans (limits around 10Mb), in which case you might want to adjust the
different values at the top of the script.
below settings via environment variables.
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the next settings.
@@ -262,7 +317,6 @@ different values at the top of the script.
- `DELAY` sets the browser cache delay, only for HTTP clients
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
- `THREADS` sets the number of threads to use. `1` makes no use of multithreading.
### Content matching

main.py

@@ -1,6 +1,7 @@
#!/usr/bin/env python
from morss import main, cgi_wrapper as application
from morss.__main__ import main
from morss.wsgi import application
if __name__ == '__main__':
main()

morss/__init__.py

@@ -1,2 +1,3 @@
# ran on `import morss`
from .morss import *
from .wsgi import application

morss/__main__.py

@@ -1,5 +1,54 @@
# ran on `python -m morss`
from .morss import main
import os
import sys
from . import wsgi
from . import cli
from .morss import MorssException
import wsgiref.simple_server
import wsgiref.handlers
PORT = int(os.getenv('PORT', 8080))
def main():
    if 'REQUEST_URI' in os.environ:
        # mod_cgi (w/o file handler)
        app = wsgi.cgi_app
        app = wsgi.cgi_dispatcher(app)
        app = wsgi.cgi_error_handler(app)
        app = wsgi.cgi_encode(app)

        wsgiref.handlers.CGIHandler().run(app)

    elif len(sys.argv) <= 1:
        # start internal (basic) http server (w/ file handler)
        app = wsgi.cgi_app
        app = wsgi.cgi_file_handler(app)
        app = wsgi.cgi_dispatcher(app)
        app = wsgi.cgi_error_handler(app)
        app = wsgi.cgi_encode(app)

        print('Serving http://localhost:%s/' % PORT)
        httpd = wsgiref.simple_server.make_server('', PORT, app)
        httpd.serve_forever()

    else:
        # as a CLI app
        try:
            cli.cli_app()

        except (KeyboardInterrupt, SystemExit):
            raise

        except Exception as e:
            print('ERROR: %s' % e.message)


if __name__ == '__main__':
    main()

morss/cli.py (new file, 51 additions)

@@ -0,0 +1,51 @@
import sys
import os.path
import argparse
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options
def cli_app():
    parser = argparse.ArgumentParser(
        prog='morss',
        description='Get full-text RSS feeds',
        epilog='GNU AGPLv3 code'
        )

    parser.add_argument('url', help='feed url')

    group = parser.add_argument_group('output')
    group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
    group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
    group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
    group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more space, but human-readable')

    group = parser.add_argument_group('action')
    group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
    group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
    group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
    group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss otherwise shows the items by appearing order)')
    group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')

    group = parser.add_argument_group('custom feeds')
    group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
    group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
    group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
    group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
    group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')

    group = parser.add_argument_group('misc')
    group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
    group.add_argument('--noref', action='store_true', help='drop items\' link')
    group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')

    options = Options(vars(parser.parse_args()))
    url = options.url

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options, 'unicode')

    if not options.silent:
        print(out)

morss/crawler.py

@@ -1,3 +1,4 @@
import os
import sys
import zlib
@@ -7,14 +8,19 @@ import chardet
from cgi import parse_header
import lxml.html
import time
import random
try:
# python 2
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib import quote
from urlparse import urlparse, urlunparse
import mimetools
except ImportError:
# python 3
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib.parse import quote
from urllib.parse import urlparse, urlunparse
import email
try:
@@ -27,13 +33,56 @@ except NameError:
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
DEFAULT_UAS = [
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
PROTOCOL = ['http', 'https']
def get(*args, **kwargs):
return adv_get(*args, **kwargs)['data']
def adv_get(url, timeout=None, *args, **kwargs):
url = sanitize_url(url)
if timeout is None:
con = custom_handler(*args, **kwargs).open(url)
else:
con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding= detect_encoding(data, con)
return {
'data':data,
'url': con.geturl(),
'con': con,
'contenttype': contenttype,
'encoding': encoding
}
def custom_handler(follow=None, delay=None, encoding=None):
handlers = []
# as per urllib2 source code, these Handlers are added first
@@ -45,26 +94,65 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
# & HTTPSHandler
#handlers.append(DebugHandler())
handlers.append(SizeLimitHandler(100*1024)) # 100KiB
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
handlers.append(HTTPCookieProcessor())
handlers.append(GZIPHandler())
handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA))
if not basic:
handlers.append(AutoRefererHandler())
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding))
if accept:
handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
if follow:
handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(force_min=delay))
return build_opener(*handlers)
def is_ascii(string):
# there's a native function in py3, but home-made fix for backward compatibility
try:
string.encode('ascii')
except UnicodeError:
return False
else:
return True
def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes):
url = url.decode()
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
# https://stackoverflow.com/a/4391299
parts = list(urlparse(url))
for i in range(len(parts)):
if not is_ascii(parts[i]):
if i == 1:
parts[i] = parts[i].encode('idna').decode('ascii')
else:
parts[i] = quote(parts[i].encode('utf-8'))
return urlunparse(parts)
class DebugHandler(BaseHandler):
handler_order = 2000
@@ -132,6 +220,15 @@ class GZIPHandler(BaseHandler):
def detect_encoding(data, resp=None):
enc = detect_raw_encoding(data, resp)
if enc.lower() == 'gb2312':
enc = 'gbk'
return enc
def detect_raw_encoding(data, resp=None):
if resp is not None:
enc = resp.headers.get('charset')
if enc is not None:
@@ -165,14 +262,10 @@ class EncodingFixHandler(BaseHandler):
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
if not self.encoding:
enc = detect_encoding(data, resp)
else:
enc = self.encoding
enc = self.encoding or detect_encoding(data, resp)
if enc:
data = data.decode(enc, 'replace')
data = data.encode(enc)
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
@@ -196,48 +289,43 @@ class UAHandler(BaseHandler):
https_request = http_request
class AutoRefererHandler(BaseHandler):
class BrowserlyHeaderHandler(BaseHandler):
""" Add more headers to look less suspicious """
def http_request(self, req):
req.add_unredirected_header('Referer', 'http://%s' % req.host)
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
return req
https_request = http_request
class ContentNegociationHandler(BaseHandler):
" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
class AlternateHandler(BaseHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, accept=None, strict=False):
self.accept = accept
self.strict = strict
def http_request(self, req):
if self.accept is not None:
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
string = ','.join(self.accept)
if self.strict:
string += ',*/*;q=0.9'
req.add_unredirected_header('Accept', string)
return req
def __init__(self, follow=None):
self.follow = follow or []
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# oops, not what we were looking for, let's see if the html page suggests an alternative page of the right types
data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.accept:
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
try:
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.follow:
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
break
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
@@ -246,7 +334,6 @@ class ContentNegociationHandler(BaseHandler):
return resp
https_request = http_request
https_response = http_response
@@ -260,10 +347,15 @@ class HTTPEquivHandler(BaseHandler):
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
try:
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
@@ -297,18 +389,31 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response
default_cache = {}
class CacheHandler(BaseHandler):
" Cache based on etags/last-modified "
private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
private_cache = False # Websites can indicate whether the page should be
# cached by CDNs (e.g. it shouldn't be the case for
# private/confidential/user-specific pages).
# With this setting, decide whether (False) you want
# the cache to behave like a CDN (i.e. don't cache
# private pages), or (True) like an end user's cache
# (i.e. do cache private pages). If unsure, False is the safest bet.
handler_order = 499
def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache
self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
self.force_min = force_min
# Servers indicate how long they think their content is "valid".
# With this parameter (force_min, expressed in seconds), we can
# override the validity period (i.e. bypassing http headers)
# Special values:
# -1: valid forever, i.e. use the cache no matter what (and fetch
# the page online if not present in cache)
# 0: valid zero second, i.e. force refresh
# -2: same as -1, i.e. use the cache no matter what, but do NOT
# fetch the page online if not present in cache, throw an
# error instead
def load(self, url):
try:
@@ -338,6 +443,10 @@ class CacheHandler(BaseHandler):
return req
def http_open(self, req):
# Reminder of how/when this function is called by urllib2:
# If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response'
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# some info needed to process everything
@@ -360,6 +469,7 @@ class CacheHandler(BaseHandler):
pass
else:
# raise an error, via urllib handlers
headers['Morss'] = 'from_cache'
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
@@ -378,14 +488,18 @@ class CacheHandler(BaseHandler):
return None
elif code == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
# use force_min=0 if you want to bypass this (needed for a proper refresh)
# "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh)
pass
elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list
or ('private' in cc_list and not self.private)):
or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh
# if the same settings are used all along, this section shouldn't be
# of any use, since the page wouldn't be cached in the first place
# the check is only performed "just in case"
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
@@ -400,7 +514,7 @@ class CacheHandler(BaseHandler):
# according to the www, we have to refresh when nothing is said
return None
# return the cache as a response
# return the cache as a response. This code is reached with 'pass' above
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
@@ -419,7 +533,7 @@ class CacheHandler(BaseHandler):
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications
return resp
@@ -431,6 +545,8 @@ class CacheHandler(BaseHandler):
data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
# the below is only needed because of 'resp.read()' above, as we can't
# seek(0) on arbitrary file-like objects (e.g. sockets)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
@@ -450,10 +566,14 @@ class CacheHandler(BaseHandler):
unverifiable=True)
new.add_unredirected_header('Morss', 'from_304')
# create a "fake" new request to just re-run through the various
# handlers
return self.parent.open(new, timeout=req.timeout)
return None
return None # when returning 'None', the next-available handler is used
# the 'HTTPRedirectHandler' has no 'handler_order', i.e.
# uses the default of 500, therefore executed after this
https_request = http_request
https_open = http_open
@@ -461,6 +581,8 @@ class CacheHandler(BaseHandler):
class BaseCache:
""" Subclasses must behave like a dict """
def __contains__(self, url):
try:
self[url]
@@ -477,7 +599,7 @@ import sqlite3
class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
@@ -499,32 +621,28 @@ class SQLiteCache(BaseCache):
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
if url in self:
with self.con:
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
value + (url,))
else:
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
" NB. Requires mono-threading, as pymysql isn't thread-safe "
def __init__(self, user, password, database, host='localhost'):
self.con = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', autocommit=True)
self.user = user
self.password = password
self.database = database
self.host = host
with self.con.cursor() as cursor:
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
def __del__(self):
self.con.close()
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def __getitem__(self, url):
cursor = self.con.cursor()
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
@@ -534,11 +652,32 @@ class MySQLCacheHandler(BaseCache):
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
if url in self:
with self.con.cursor() as cursor:
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
value + (url,))
with self.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
(url,) + value + value)
else:
with self.con.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST')
)
elif os.environ['CACHE'] == 'sqlite':
default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:'))
else:
default_cache = {}
if __name__ == '__main__':
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive:
print('>>> Interactive shell: try using `req`')
else:
print(req['data'].decode(req['encoding']))
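Tying the new pieces of this file together, a short usage sketch (the same call as the `__main__` block above, plus the optional keyword arguments: `follow='rss'` enables the `AlternateHandler`, and `delay` is forwarded to `CacheHandler` as `force_min`):

```python
from morss import crawler

# fetch a page, follow a <link rel="alternate" type="application/rss+xml"> if the
# response turns out to be html, and accept cached copies up to 10 minutes old
req = crawler.adv_get('https://morss.it', follow='rss', delay=10 * 60)

print(req['url'])                           # final url, after redirects
print(req['contenttype'])                   # e.g. 'text/html' or 'application/rss+xml'
print(req['data'].decode(req['encoding']))  # body, decoded with the detected encoding
```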

morss/feedify.ini

@@ -90,8 +90,11 @@ item_updated = updated
[html]
mode = html
path =
http://localhost/
title = //div[@id='header']/h1
desc = //div[@id='header']/h2
desc = //div[@id='header']/p
items = //div[@id='content']/div
item_title = ./a
@@ -99,7 +102,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc]
item_content = ./div[class=content]
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
base = file:sheet.xsl
[twitter]
mode = html

morss/feedify.py (deleted)

@@ -1,28 +0,0 @@
import re
import json
from . import crawler
try:
basestring
except NameError:
basestring = str
def pre_worker(url):
if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
match = re.search('/id([0-9]+)(\?.*)?$', url)
if match:
iid = match.groups()[0]
redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
try:
con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
data = con.read()
except (IOError, HTTPException):
raise
return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
return None

morss/feeds.py

@@ -15,6 +15,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from .readabilite import parse as html_parse
json.encoder.c_make_encoder = None
@@ -45,14 +46,32 @@ def parse_rules(filename=None):
rules = dict([(x, dict(config.items(x))) for x in config.sections()])
for section in rules.keys():
# for each ruleset
for arg in rules[section].keys():
if '\n' in rules[section][arg]:
# for each rule
if rules[section][arg].startswith('file:'):
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
for path in paths:
try:
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
except IOError:
pass
elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:]
return rules
def parse(data, url=None, mimetype=None):
def parse(data, url=None, encoding=None):
" Determine which ruleset to use "
rulesets = parse_rules()
@@ -66,28 +85,22 @@ def parse(data, url=None, mimetype=None):
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset)
return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or parser_candidates is None:
parser_candidates = parsers
# 2) Try each and every parser
# 3) Look for working ruleset for given parser
# 3a) See if parsing works
# 3b) See if .items matches anything
for parser in parser_candidates:
for parser in parsers:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# 'path' as they should have been caught beforehand
try:
feed = parser(data)
feed = parser(data, encoding=encoding)
except (ValueError):
except (ValueError, SyntaxError):
# parsing did not work
pass
@@ -112,7 +125,7 @@ def parse(data, url=None, mimetype=None):
class ParserBase(object):
def __init__(self, data=None, rules=None, parent=None):
def __init__(self, data=None, rules=None, parent=None, encoding=None):
if rules is None:
rules = parse_rules()[self.default_ruleset]
@@ -121,9 +134,10 @@ class ParserBase(object):
if data is None:
data = rules['base']
self.root = self.parse(data)
self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw):
pass
@@ -148,15 +162,15 @@ class ParserBase(object):
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
row = [getattr(item, x) for x in item.dic]
if encoding != 'unicode':
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
c.writerow(row)
c.writerow([getattr(item, x) for x in item.dic])
out.seek(0)
return out.read()
out = out.read()
if encoding != 'unicode':
out = out.encode(encoding)
return out
def tohtml(self, **k):
return self.convert(FeedHTML).tostring(**k)
@@ -267,8 +281,15 @@ class ParserBase(object):
except AttributeError:
# does not exist, have to create it
self.rule_create(self.rules[rule_name])
self.rule_set(self.rules[rule_name], value)
try:
self.rule_create(self.rules[rule_name])
except AttributeError:
# no way to create it, give up
pass
else:
self.rule_set(self.rules[rule_name], value)
def rmv(self, rule_name):
# easy deleter
@@ -286,10 +307,7 @@ class ParserXML(ParserBase):
NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
'atom03': 'http://purl.org/atom/ns#',
'media': 'http://search.yahoo.com/mrss/',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'slash': 'http://purl.org/rss/1.0/modules/slash/',
'dc': 'http://purl.org/dc/elements/1.1/',
'content': 'http://purl.org/rss/1.0/modules/content/',
'rssfake': 'http://purl.org/rss/1.0/'}
@@ -301,7 +319,7 @@ class ParserXML(ParserBase):
return self.root.getparent().remove(self.root)
def tostring(self, encoding='unicode', **k):
return etree.tostring(self.root, encoding=encoding, **k)
return etree.tostring(self.root, encoding=encoding, method='xml', **k)
def _rule_parse(self, rule):
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -383,7 +401,8 @@ class ParserXML(ParserBase):
return
elif key is not None:
del x.attrib[key]
if key in match.attrib:
del match.attrib[key]
else:
match.getparent().remove(match)
@@ -401,13 +420,14 @@ class ParserXML(ParserBase):
else:
if html_rich:
# atom stuff
if 'atom' in rule:
match.attrib['type'] = 'xhtml'
self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
match.find('div').drop_tag()
if self.rules['mode'] == 'html':
match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom
match.attrib['type'] = 'xhtml'
else:
if match is not None and len(match):
@@ -440,11 +460,10 @@ class ParserHTML(ParserXML):
mimetype = ['text/html', 'application/xhtml+xml']
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(raw, parser)
return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)
def rule_search_all(self, rule):
try:
@@ -467,6 +486,9 @@ class ParserHTML(ParserXML):
element = deepcopy(match)
match.getparent().append(element)
else:
raise AttributeError('no way to create item')
def parse_time(value):
if value is None or value == 0:
@@ -474,13 +496,13 @@ def parse_time(value):
elif isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.UTC)
return datetime.fromtimestamp(int(value), tz.tzutc())
else:
return dateutil.parser.parse(value)
return dateutil.parser.parse(value).replace(tzinfo=tz.tzutc())
elif isinstance(value, int):
return datetime.fromtimestamp(value, tz.UTC)
return datetime.fromtimestamp(value, tz.tzutc())
elif isinstance(value, datetime):
return value
@@ -696,13 +718,30 @@ class Item(Uniq):
class FeedXML(Feed, ParserXML):
itemsClass = 'ItemXML'
def root_siblings(self):
out = []
current = self.root.getprevious()
while current is not None:
out.append(current)
current = current.getprevious()
return out
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
# and to add stylesheet
if self.root.getprevious() is None:
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
for stylesheet in stylesheets:
# remove all stylesheets present (be that ours or others')
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
self.root.remove(stylesheet)
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
class ItemXML(Item, ParserXML):
@@ -732,3 +771,17 @@ class ItemJSON(Item, ParserJSON):
return
cur = cur[node]
if __name__ == '__main__':
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
if sys.flags.interactive:
print('>>> Interactive shell: try using `feed`')
else:
for item in feed.items:
print(item.title, item.link)

morss/morss.py

@@ -1,9 +1,8 @@
import sys
import os
import os.path
import time
import threading
import time
from datetime import datetime
from dateutil import tz
from fnmatch import fnmatch
import re
@@ -12,67 +11,49 @@ import lxml.etree
import lxml.html
from . import feeds
from . import feedify
from . import crawler
from . import readabilite
import wsgiref.simple_server
import wsgiref.handlers
try:
# python 2
from Queue import Queue
from httplib import HTTPException
from urllib import quote_plus
from urlparse import urlparse, urljoin, parse_qs
except ImportError:
# python 3
from queue import Queue
from http.client import HTTPException
from urllib.parse import quote_plus
from urllib.parse import urlparse, urljoin, parse_qs
LIM_ITEM = 100 # deletes what's beyond
LIM_TIME = 7 # deletes what's after
MAX_ITEM = 50 # cache-only beyond
MAX_TIME = 7 # cache-only after (in sec)
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
PROTOCOL = ['http', 'https', 'ftp']
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
def filterOptions(options):
return options
# example of filtering code below
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
class MorssException(Exception):
pass
def log(txt, force=False):
if DEBUG or force:
def log(txt):
if 'DEBUG' in os.environ:
if 'REQUEST_URI' in os.environ:
# when running on Apache
open('morss.log', 'a').write("%s\n" % repr(txt))
else:
# when using internal server or cli
print(repr(txt))
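Debug logging is now driven purely by the DEBUG environment variable checked above; a small sketch (the module path morss.morss is an assumption based on the imports in wsgi.py further down):

    import os
    from morss.morss import log

    log('quiet')               # no output: DEBUG is not set
    os.environ['DEBUG'] = '1'  # only the presence of the key matters, not its value
    log('noisy')               # printed, or appended to morss.log when REQUEST_URI is set (Apache)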
def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
@@ -80,6 +61,7 @@ def len_html(txt):
def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
return 0
@@ -88,12 +70,14 @@ class Options:
if len(args):
self.options = args
self.options.update(options or {})
else:
self.options = options or {}
def __getattr__(self, key):
if key in self.options:
return self.options[key]
else:
return False
@@ -104,28 +88,11 @@ class Options:
return key in self.options
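A short illustration of the Options accessors above (values are made up; the membership test at the end of the hunk is assumed to belong to __contains__):

    from morss.morss import Options

    opts = Options({'format': 'json', 'indent': True})
    opts.format        # 'json'
    opts.missing       # False: __getattr__ falls back to False for unknown keys
    'indent' in opts   # True, via the `key in self.options` test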
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
if split[0].lower() == 'true':
out[split[0]] = True
elif split[0].lower() == 'false':
out[split[0]] = False
else:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
def ItemFix(item, feedurl='/'):
def ItemFix(item, options, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper():
if item.title is not None and len(item.title) > 20 and item.title.isupper():
item.title = item.title.title()
# check if it includes link
@@ -140,6 +107,13 @@ def ItemFix(item, feedurl='/'):
item.link = match[0]
log(item.link)
# at the user's request, use the first <a> link
if options.firstlink and (item.desc or item.content):
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
if len(match):
item.link = match[0]
log(item.link)
# check relative urls
item.link = urljoin(feedurl, item.link)
@@ -158,6 +132,11 @@ def ItemFix(item, feedurl='/'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# pocket
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -183,7 +162,7 @@ def ItemFix(item, feedurl='/'):
# reddit
if urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
if len(match):
item.link = match[0]
log(item.link)
@@ -196,55 +175,36 @@ def ItemFill(item, options, feedurl='/', fast=False):
if not item.link:
log('no link')
return item
return True
log(item.link)
link = item.link
# twitter
if urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# download
delay = -1
if fast:
# super-fast mode
if fast or options.fast:
# force cache, don't fetch
delay = -2
elif options.force:
# force refresh
delay = 0
else:
delay = 24*60*60 # 24h
try:
con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
return False # let's just delete erroring items when in cache mode
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
log('non-text page')
return True
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if out is not None:
item.content = out
@@ -265,10 +225,7 @@ def ItemBefore(item, options):
def ItemAfter(item, options):
if options.clip and item.desc and item.content:
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc
if not options.keep and not options.proxy:
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content
del item.desc
if options.nolink and item.content:
@@ -276,7 +233,7 @@ def ItemAfter(item, options):
for link in content.xpath('//a'):
log(link.text_content())
link.drop_tag()
item.content = lxml.etree.tostring(content)
item.content = lxml.etree.tostring(content, method='html')
if options.noref:
item.link = ''
@@ -285,71 +242,50 @@ def ItemAfter(item, options):
def FeedFetch(url, options):
# basic url clean-up
if url is None:
raise MorssException('No url provided')
if urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
if isinstance(url, bytes):
url = url.decode()
# allow for code execution for feedify
pre = feedify.pre_worker(url)
if pre:
url = pre
log('url redirect')
log(url)
# fetch feed
delay = DELAY
if options.theforce:
if options.force:
delay = 0
try:
con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
encoding=options.encoding, basic=not options.items) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
req = crawler.adv_get(url=url, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
contenttype = con.info().get('Content-Type', '').split(';')[0]
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml, url, contenttype)
feed.rule
rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
rss.rules['title'] = options.title if options.title else '//head/title'
rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
rss.rules['items'] = options.items
if options.item_title:
rss.rules['item_title'] = options.item_title
if options.item_link:
rss.rules['item_link'] = options.item_link
rss.rules['item_title'] = options.item_title if options.item_title else '.'
rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href|ancestor::a/@href'
if options.item_content:
rss.rules['item_content'] = options.item_content
if options.item_time:
rss.rules['item_time'] = options.item_time
rss = rss.convert(feeds.FeedXML)
else:
try:
rss = feeds.parse(xml, url, contenttype)
rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost
except TypeError:
log('random page')
log(contenttype)
log(req['contenttype'])
raise MorssException('Link provided is not a valid feed')
return rss
return req['url'], rss
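To make the :items scraping branch more concrete, a hypothetical call (the URL and XPath expressions are invented; item_title and item_link fall back to the defaults set above):

    from morss.morss import FeedFetch, Options

    options = Options({'items': '//article',
                       'item_content': './/div[@class="body"]'})
    url, rss = FeedFetch('https://example.com/blog', options)  # returns the effective url and a FeedXML object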
def FeedGather(rss, url, options):
@@ -361,42 +297,37 @@ def FeedGather(rss, url, options):
lim_time = LIM_TIME
max_item = MAX_ITEM
max_time = MAX_TIME
threads = THREADS
if options.cache:
max_time = 0
if options.mono:
threads = 1
if options.newest:
# :newest takes the newest items
now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
# set
def runner(queue):
while True:
value = queue.get()
try:
worker(*value)
except Exception as e:
log('Thread Error: %s' % e.message)
queue.task_done()
else:
# default behavior, take the first items (in appearing order)
sorted_items = list(rss.items)
def worker(i, item):
for i, item in enumerate(sorted_items):
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
return
continue
item = ItemBefore(item, options)
if item is None:
return
continue
item = ItemFix(item, url)
item = ItemFix(item, options, url)
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if ItemFill(item, options, url, True) is False:
item.remove()
return
continue
else:
if not options.proxy:
@@ -404,22 +335,6 @@ def FeedGather(rss, url, options):
item = ItemAfter(item, options)
queue = Queue()
for i in range(threads):
t = threading.Thread(target=runner, args=(queue,))
t.daemon = True
t.start()
for i, item in enumerate(list(rss.items)):
if threads == 1:
worker(*[i, item])
else:
queue.put([i, item])
if threads != 1:
queue.join()
if options.ad:
new = rss.items.append()
new.title = "Are you hungry?"
@@ -433,37 +348,38 @@ def FeedGather(rss, url, options):
return rss
def FeedFormat(rss, options):
def FeedFormat(rss, options, encoding='utf-8'):
if options.callback:
if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
return '%s(%s)' % (options.callback, rss.tojson())
out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
return out if encoding == 'unicode' else out.encode(encoding)
else:
raise MorssException('Invalid callback var name')
elif options.json:
elif options.format == 'json':
if options.indent:
return rss.tojson(encoding='UTF-8', indent=4)
return rss.tojson(encoding=encoding, indent=4)
else:
return rss.tojson(encoding='UTF-8')
return rss.tojson(encoding=encoding)
elif options.csv:
return rss.tocsv(encoding='UTF-8')
elif options.format == 'csv':
return rss.tocsv(encoding=encoding)
elif options.reader:
elif options.format == 'html':
if options.indent:
return rss.tohtml(encoding='UTF-8', pretty_print=True)
return rss.tohtml(encoding=encoding, pretty_print=True)
else:
return rss.tohtml(encoding='UTF-8')
return rss.tohtml(encoding=encoding)
else:
else: # i.e. format == 'rss'
if options.indent:
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
else:
return rss.torss(xml_declaration=True, encoding='UTF-8')
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
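The new encoding parameter decides how FeedFormat hands the result back; a quick sketch of the dispatch above (assuming rss is a feed object produced by FeedGather):

    from morss.morss import FeedFormat, Options

    FeedFormat(rss, Options({'format': 'json', 'indent': True}), 'unicode')  # json branch, indented, text output
    FeedFormat(rss, Options({'format': 'csv'}))                              # csv branch, utf-8 (the default encoding)
    FeedFormat(rss, Options({}))                                             # no format given, falls through to the rss/xml branch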
def process(url, cache=None, options=None):
@@ -475,187 +391,7 @@ def process(url, cache=None, options=None):
if cache:
crawler.default_cache = crawler.SQLiteCache(cache)
rss = FeedFetch(url, options)
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
return FeedFormat(rss, options)
def cgi_app(environ, start_response):
# get options
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
options = []
# init
options = Options(filterOptions(parseOptions(options)))
headers = {}
global DEBUG
DEBUG = options.debug
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html or options.reader:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.json:
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.csv:
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done
rss = FeedFetch(url, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = rss.mimetype[0]
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
return out
def cgi_wrapper(environ, start_response):
# simple http server for html and css
files = {
'': 'text/html',
'index.html': 'text/html'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if url in files:
headers = {}
if url == '':
url = 'index.html'
if '--root' in sys.argv[1:]:
path = os.path.join(sys.argv[-1], url)
else:
path = url
try:
body = open(path, 'rb').read()
headers['status'] = '200 OK'
headers['content-type'] = files[url]
start_response(headers['status'], list(headers.items()))
return [body]
except IOError:
headers['status'] = '404 Not found'
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
# actual morss use
try:
return [cgi_app(environ, start_response) or '(empty)']
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR <%s>: %s' % (url, e.message), force=True)
return ['An error happened:\n%s' % e.message]
def cli_app():
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
url = sys.argv[-1]
global DEBUG
DEBUG = options.debug
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
log('done')
def isInt(string):
try:
int(string)
return True
except ValueError:
return False
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
# start internal (basic) http server
if len(sys.argv) > 1 and isInt(sys.argv[1]):
argPort = int(sys.argv[1])
if argPort > 0:
port = argPort
else:
raise MorssException('Port must be positive integer')
else:
port = PORT
print('Serving http://localhost:%s/'%port)
httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
httpd.serve_forever()
else:
# as a CLI app
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
main()
return FeedFormat(rss, options, 'unicode')
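Putting the three steps together, process() remains the one-call entry point; a minimal sketch (the feed URL is illustrative):

    from morss.morss import process

    out = process('https://example.com/feed.xml')   # fetch, gather full text and format in one call (unicode output)
    # an optional cache path switches crawler.default_cache to SQLite, as shown in the hunk above:
    # out = process('https://example.com/feed.xml', cache='morss-cache.db')
    print(out)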


@@ -1,13 +1,17 @@
import lxml.etree
import lxml.html
from bs4 import BeautifulSoup
import re
def parse(data, encoding=None):
if encoding:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
@@ -60,9 +64,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
regex_good = re.compile('|'.join(class_good), re.I)
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
'button', 'footer']
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
'form', 'input', 'textarea', 'button', 'footer']
tags_bad = tags_junk + ['a', 'aside']
@@ -90,13 +95,24 @@ def score_node(node):
" Score individual node "
score = 0
class_id = node.get('class', '') + node.get('id', '')
class_id = (node.get('class') or '') + (node.get('id') or '')
if (isinstance(node, lxml.html.HtmlComment)
or node.tag in tags_bad
or regex_bad.search(class_id)):
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
return 0
if node.tag in tags_dangerous:
return 0
if node.tag in tags_junk:
score += -1 # actually -2, as tags_junk is included in tags_bad
if node.tag in tags_bad:
score += -1
if regex_bad.search(class_id):
score += -1
if node.tag in tags_good:
score += 4
@@ -109,38 +125,47 @@ def score_node(node):
if wc != 0:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
score = score * ( 1 - float(wca)/wc )
score = score * ( 1 - 2 * float(wca)/wc )
return score
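Worked through with small numbers, the doubled penalty above means: for wc = 100 words of which wca = 25 sit inside <a> tags, the score is multiplied by 1 - 2 * 25/100 = 0.5; at a link density of 0.5 the score reaches zero, and anything link-heavier turns negative, where the previous factor (1 - wca/wc) would still have left it positive.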
def score_all(node, grades=None):
def score_all(node):
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
if grades is None:
grades = {}
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
child.attrib['morss_own_score'] = str(float(score))
if score > 0:
spread_score(child, score, grades)
score_all(child, grades)
return grades
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def spread_score(node, score, grades):
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def get_score(node):
return float(node.attrib.get('morss_score', 0))
def incr_score(node, delta):
set_score(node, get_score(node) + delta)
def get_all_scores(node):
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
def spread_score(node, score):
" Spread the node's score to its parents, on a linear way "
delta = score / 2
for ancestor in [node,] + list(node.iterancestors()):
if score >= 1 or ancestor is node:
try:
grades[ancestor] += score
except KeyError:
grades[ancestor] = score
incr_score(ancestor, score)
score -= delta
@@ -148,26 +173,29 @@ def spread_score(node, score, grades):
break
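Traced with the halving above: a node scored 8 adds 8 to itself and 4 to its parent, then stops because the remaining score (0) has dropped below 1; thanks to the `ancestor is node` clause, even a fractional score still lands on the node itself.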
def write_score_all(root, grades):
" Useful for debugging "
for node in root.iter():
node.attrib['score'] = str(int(grades.get(node, 0)))
def clean_root(root):
def clean_root(root, keep_threshold=None):
for node in list(root):
clean_root(node)
clean_node(node)
# bottom-up approach, i.e. starting with children before cleaning current node
clean_root(node, keep_threshold)
clean_node(node, keep_threshold)
def clean_node(node):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
# remove dangerous tags, no matter what
if node.tag in tags_dangerous:
parent.remove(node)
return
# high score, so keep
if keep_threshold is not None and get_score(node) >= keep_threshold:
return
gdparent = parent.getparent()
# remove shitty tags
@@ -266,41 +294,54 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
def rank_nodes(grades):
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
def get_best_node(grades):
" To pick the best (raw) node. Another function will clean it "
if len(grades) == 1:
return grades[0]
top = rank_nodes(grades)
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
return lowest
def get_article(data, url=None, encoding=None):
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
scores = score_all(html)
html = parse(data, encoding_in)
score_all(html)
if not len(scores):
# rank all nodes (largest to smallest)
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
# minimum threshold
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
return None
best = get_best_node(scores)
# take common ancestor or the two highest rated nodes
if len(ranked_nodes) > 1:
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
else:
best = ranked_nodes[0]
# clean up
if not debug:
keep_threshold = get_score(ranked_nodes[0]) * 3/4
clean_root(best, keep_threshold)
# check for spammy content (links only)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3:
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None
# fix urls
if url:
best.make_links_absolute(url)
clean_root(best)
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)
return lxml.etree.tostring(best, pretty_print=True)
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if sys.flags.interactive:
print('>>> Interactive shell: try using `article`')
else:
print(article)


@@ -1,210 +0,0 @@
@require(feed)
<!DOCTYPE html>
<html>
<head>
<title>@feed.title &#8211; via morss</title>
<meta charset="UTF-8" />
<meta name="description" content="@feed.desc (via morss)" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<style type="text/css">
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
* {
box-sizing: border-box;
}
#content {
width: 100%;
max-width: 1140px;
min-width: 755px;
margin: 0 auto;
overflow: hidden;
padding-top: 20px;
padding-left: 20px; /* grid-space to left */
padding-right: 0px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-20px=0 */
}
.item {
width: 33.33%;
float: left;
padding-right: 20px; /* column-space */
}
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
#content {
width: 100%;
min-width: 0;
margin-left: 0px;
margin-right: 0px;
padding-left: 20px; /* grid-space to left */
padding-right: 10px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-10px=10px */
}
.item {
width: auto;
float: none;
margin-left: 0px;
margin-right: 0px;
margin-top: 10px;
margin-bottom: 10px;
padding-left: 0px;
padding-right: 10px; /* column-space */
}
}
/* design */
#header h1, #header h2, #header p {
font-family: sans;
text-align: center;
margin: 0;
padding: 0;
}
#header h1 {
font-size: 2.5em;
font-weight: bold;
padding: 1em 0 0.25em;
}
#header h2 {
font-size: 1em;
font-weight: normal;
}
#header p {
color: gray;
font-style: italic;
font-size: 0.75em;
}
#content {
text-align: justify;
}
.item .title {
font-weight: bold;
display: block;
text-align: center;
}
.item .link {
color: inherit;
text-decoration: none;
}
.item:not(.active) {
cursor: pointer;
height: 20em;
margin-bottom: 20px;
overflow: hidden;
text-overflow: ellpisps;
padding: 0.25em;
position: relative;
}
.item:not(.active) .title {
padding-bottom: 0.1em;
margin-bottom: 0.1em;
border-bottom: 1px solid silver;
}
.item:not(.active):before {
content: " ";
display: block;
width: 100%;
position: absolute;
top: 18.5em;
height: 1.5em;
background: linear-gradient(to bottom, rgba(255,255,255,0) 0%, rgba(255,255,255,1) 100%);
}
.item:not(.active) .article * {
max-width: 100%;
font-size: 1em !important;
font-weight: normal;
display: inline;
margin: 0;
}
.item.active {
background: white;
position: fixed;
overflow: auto;
top: 0;
left: 0;
height: 100%;
width: 100%;
z-index: 1;
}
body.noscroll {
overflow: hidden;
}
.item.active > * {
max-width: 700px;
margin: auto;
}
.item.active .title {
font-size: 2em;
padding: 0.5em 0;
}
.item.active .article object,
.item.active .article video,
.item.active .article audio {
display: none;
}
.item.active .article img {
max-height: 20em;
max-width: 100%;
}
</style>
</head>
<body>
<div id="header">
<h1>@feed.title</h1>
@if feed.desc:
<h2>@feed.desc</h2>
@end
<p>- via morss</p>
</div>
<div id="content">
@for item in feed.items:
<div class="item">
@if item.link:
<a class="title link" href="@item.link" target="_blank">@item.title</a>
@else:
<span class="title">@item.title</span>
@end
<div class="article">
@if item.content:
@item.content
@else:
@item.desc
@end
</div>
</div>
@end
</div>
<script>
var items = document.getElementsByClassName('item')
for (var i in items)
items[i].onclick = function()
{
this.classList.toggle('active')
document.body.classList.toggle('noscroll')
}
</script>
</body>
</html>

morss/wsgi.py Normal file

@@ -0,0 +1,255 @@
import sys
import os.path
import re
import lxml.etree
import cgitb
try:
# python 2
from urllib import unquote
except ImportError:
# python 3
from urllib.parse import unquote
from . import crawler
from . import readabilite
from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, log, TIMEOUT, DELAY, MorssException
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
def cgi_parse_environ(environ):
# get options
if 'REQUEST_URI' in environ:
# when running on Apache
url = environ['REQUEST_URI'][1:]
else:
# when using internal server
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
raw_options = []
# init
options = Options(parse_options(raw_options))
return (url, options)
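As an illustration of the URL scheme handled above, a hypothetical request path /:clip:format=json/https://example.com/feed would be taken apart as follows:

    from morss.wsgi import parse_options

    # raw_options extracted from the path -> ['clip', 'format=json']
    parse_options(['clip', 'format=json'])  # -> {'clip': True, 'format': 'json'}
    # cgi_parse_environ() then returns ('https://example.com/feed', Options({'clip': True, 'format': 'json'}))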
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
headers['x-content-type-options'] = 'nosniff' # safari workaround
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.format == 'html':
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.format == 'json':
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.format == 'csv':
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
# get the work done
url, rss = FeedFetch(url, options)
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
else:
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
files = {
'': 'text/html',
'index.html': 'text/html',
'sheet.xsl': 'text/xsl'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if url in files:
headers = {}
if url == '':
url = 'index.html'
paths = [os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)]
for path in paths:
try:
body = open(path, 'rb').read()
headers['status'] = '200 OK'
headers['content-type'] = files[url]
start_response(headers['status'], list(headers.items()))
return [body]
except IOError:
continue
else:
# the for loop did not return, so here we are, i.e. no file found
headers['status'] = '404 Not found'
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
else:
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options.get == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
elif options.get == 'article':
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
else:
raise MorssException('no :get option passed')
else:
output = req['data']
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
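A minimal sketch for running the fully wrapped application locally; wsgiref is only one possible WSGI server, and port 8080 is an arbitrary choice:

    from wsgiref.simple_server import make_server
    from morss.wsgi import application

    httpd = make_server('', 8080, application)  # file handler + dispatcher + error handler + encoder stack
    httpd.serve_forever()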


@@ -1,4 +0,0 @@
lxml
python-dateutil <= 1.5
chardet
pymysql


@@ -1,14 +1,24 @@
from setuptools import setup, find_packages
from setuptools import setup
from glob import glob
package_name = 'morss'
setup(
name=package_name,
description='Get full-text RSS feeds',
author='pictuga, Samuel Marks',
author_email='contact at pictuga dot com',
url='http://morss.it/',
license='AGPL v3',
package_dir={package_name: package_name},
packages=find_packages(),
package_data={package_name: ['feedify.ini', 'reader.html.template']},
test_suite=package_name + '.tests')
name = package_name,
description = 'Get full-text RSS feeds',
author = 'pictuga, Samuel Marks',
author_email = 'contact at pictuga dot com',
url = 'http://morss.it/',
download_url = 'https://git.pictuga.com/pictuga/morss',
license = 'AGPL v3',
packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
package_data = {package_name: ['feedify.ini']},
data_files = [
('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
],
entry_points = {
'console_scripts': [package_name + '=' + package_name + ':main']
})


@@ -4,6 +4,12 @@ ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
# Uncomment the line below to turn debug on for all requests
#SetEnv DEBUG 1
# Uncomment the line below to turn debug on for requests with :debug in the url
#SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>


@@ -35,8 +35,8 @@
<input type="text" id="url" name="url" placeholder="Feed url (http://example.com/feed.xml)" />
</form>
<code>Copyright: pictuga 2013-2014<br/>
Source code: https://github.com/pictuga/morss</code>
<code>Copyright: pictuga 2013-2020<br/>
Source code: https://git.pictuga.com/pictuga/morss</code>
<script>
form = document.forms[0]


@@ -1,5 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.1" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:stylesheet version="1.1"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:atom03="http://purl.org/atom/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rssfake="http://purl.org/rss/1.0/"
>
<xsl:output method="html"/>
@@ -7,116 +14,288 @@
<html>
<head>
<title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
<meta name="robots" content="noindex" />
<style type="text/css">
body * {
box-sizing: border-box;
}
body {
overflow-wrap: anywhere;
word-wrap: anywhere;
word-break: break-word;
font-family: sans-serif;
-webkit-tap-highlight-color: transparent; /* safari workaround */
}
#url {
background-color: rgba(255, 165, 0, 0.25);
padding: 1% 5%;
display: inline-block;
input, select {
font-family: inherit;
font-size: inherit;
text-align: inherit;
}
header {
text-align: justify;
text-align-last: center;
border-bottom: 1px solid silver;
}
.input-combo {
display: flex;
flex-flow: row;
align-items: stretch;
width: 800px;
max-width: 100%;
}
margin: auto;
body > ul {
border: 1px solid grey;
padding: .5em .5em;
background-color: #FFFAF4;
}
.input-combo * {
display: inline-block;
line-height: 2em;
border: 0;
background: transparent;
}
.input-combo > :not(.button) {
max-width: 100%;
flex-grow: 1;
flex-shrink: 0;
white-space: nowrap;
text-overflow: ellipsis;
overflow: hidden;
}
.input-combo .button {
flex-grow: 0;
flex-shrink: 1;
cursor: pointer;
min-width: 2em;
text-align: center;
border-left: 1px solid silver;
color: #06f;
}
[onclick_title] {
cursor: pointer;
position: relative;
}
[onclick_title]::before {
opacity: 0;
content: attr(onclick_title);
font-weight: normal;
position: absolute;
left: -300%;
z-index: 1;
background: grey;
color: white;
border-radius: 0.5em;
padding: 0 1em;
}
[onclick_title]:not(:active)::before {
transition: opacity 1s ease-in-out;
}
[onclick_title]:active::before {
opacity: 1;
}
header > form {
margin: 1%;
}
header a {
text-decoration: inherit;
color: #FF7B0A;
font-weight: bold;
}
.item {
background-color: #FFFAF4;
border: 1px solid silver;
margin: 1%;
max-width: 100%;
}
.item > * {
padding: 1%;
}
.item > *:empty {
display: none;
}
.item > :not(:last-child) {
border-bottom: 1px solid silver;
}
.item > a {
display: block;
font-weight: bold;
font-size: 1.5em;
}
.desc, .content {
overflow: hidden;
}
.desc *, .content * {
max-width: 100%;
}
ul {
list-style-type: none;
}
.tag {
color: darkred;
}
.attr {
color: darksalmon;
}
.value {
color: darkblue;
}
.comment {
color: lightgrey;
}
pre {
margin: 0;
max-width: 100%;
white-space: normal;
}
</style>
</head>
<body>
<h1>RSS feed by morss</h1>
<header>
<h1>RSS feed by morss</h1>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p>
<div id="url"></div>
<div class="input-combo">
<input id="url" readonly="readonly"/>
<span class="button" onclick="copy_link()" title="Copy" onclick_title="Copied">
<svg width="16px" height="16px" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" d="M4 1.5H3a2 2 0 00-2 2V14a2 2 0 002 2h10a2 2 0 002-2V3.5a2 2 0 00-2-2h-1v1h1a1 1 0 011 1V14a1 1 0 01-1 1H3a1 1 0 01-1-1V3.5a1 1 0 011-1h1v-1z" clip-rule="evenodd"/>
<path fill-rule="evenodd" d="M9.5 1h-3a.5.5 0 00-.5.5v1a.5.5 0 00.5.5h3a.5.5 0 00.5-.5v-1a.5.5 0 00-.5-.5zm-3-1A1.5 1.5 0 005 1.5v1A1.5 1.5 0 006.5 4h3A1.5 1.5 0 0011 2.5v-1A1.5 1.5 0 009.5 0h-3z" clip-rule="evenodd"/>
</svg>
</span>
</div>
<ul>
<xsl:apply-templates/>
</ul>
<form onchange="open_feed()">
More options: Output the
<select>
<option value="">full-text</option>
<option value=":proxy">original</option>
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
</select>
feed as
<select>
<option value="">RSS</option>
<option value=":json:cors">JSON</option>
<option value=":html">HTML</option>
<option value=":csv">CSV</option>
</select>
using the
<select>
<option value="">standard</option>
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
</select>
link of the
<select>
<option value="">first</option>
<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
</select>
items and
<select>
<option value="">keep</option>
<option value=":nolink:noref">remove</option>
</select>
links
<input type="hidden" value="" name="extra_options"/>
</form>
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p>
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
</header>
<div id="header" dir="auto">
<h1>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
</h1>
<p>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:description|rss/channel/description|atom:feed/atom:subtitle|atom03:feed/atom03:subtitle"/>
</p>
</div>
<div id="content">
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
<div class="item" dir="auto">
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
</a>
<div class="desc">
<xsl:copy-of select="rssfake:description|description|atom:summary|atom03:summary"/>
</div>
<div class="content">
<xsl:copy-of select="content:encoded|atom:content|atom03:content"/>
</div>
</div>
</xsl:for-each>
</div>
<script>
document.getElementById("url").innerHTML = window.location.href;
//<![CDATA[
document.getElementById("url").value = window.location.href
if (!/:html/.test(window.location.href))
for (var content of document.querySelectorAll(".desc,.content"))
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
var options = parse_location()[0]
if (options) {
for (var select of document.forms[0].elements)
if (select.tagName == 'SELECT')
for (var option of select)
if (option.value && options.match(option.value)) {
select.value = option.value
options = options.replace(option.value, '')
break
}
document.forms[0]['extra_options'].value = options
}
function copy_content(input) {
input.focus()
input.select()
document.execCommand('copy')
input.blur()
}
function copy_link() {
copy_content(document.getElementById("url"))
}
function parse_location() {
return (window.location.pathname + window.location.search).match(/^\/(?:(:[^\/]+)\/)?(.*$)$/).slice(1)
}
function open_feed() {
var url = parse_location()[1]
var options = Array.from(document.forms[0].elements).map(x=>x.value).join('')
var target = '/' + (options ? options + '/' : '') + url
if (target != window.location.pathname)
window.location.href = target
}
//]]>
</script>
</body>
</html>
</xsl:template>
<xsl:template match="*">
<li>
<span class="element">
&lt;
<span class="tag"><xsl:value-of select="name()"/></span>
<xsl:for-each select="@*">
<span class="attr"> <xsl:value-of select="name()"/></span>
=
"<span class="value"><xsl:value-of select="."/></span>"
</xsl:for-each>
&gt;
</span>
<xsl:if test="node()">
<ul>
<xsl:apply-templates/>
</ul>
</xsl:if>
<span class="element">
&lt;/
<span class="tag"><xsl:value-of select="name()"/></span>
&gt;
</span>
</li>
</xsl:template>
<xsl:template match="comment()">
<li>
<pre class="comment"><![CDATA[<!--]]><xsl:value-of select="."/><![CDATA[-->]]></pre>
</li>
</xsl:template>
<xsl:template match="text()">
<li>
<pre>
<xsl:value-of select="normalize-space(.)"/>
</pre>
</li>
</xsl:template>
<xsl:template match="text()[not(normalize-space())]"/>
</xsl:stylesheet>