Compare commits

...

283 Commits

Author SHA1 Message Date
1837eda25f heroku: add WORKERS to env vars
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-24 21:40:46 +01:00
321763710d heroku: make env var customizable 2021-11-24 21:40:34 +01:00
e79c426c6e Add ability to change workers count for gunicorn 2021-11-24 21:36:28 +01:00
92a28be0b0 drone: add gevent dep
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-24 21:25:29 +01:00
70db524664 docker: allow sh run
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-24 21:23:16 +01:00
d8cc07223e readabilite: fix bug when nothing above threshold
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-23 20:53:00 +01:00
37e08f8b4c Include gunicorn and gevent in [full]
Some checks failed
continuous-integration/drone/push Build is failing
2021-11-23 20:32:22 +01:00
8f576adb64 Add gevents deps
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:22:47 +01:00
528b3448e4 Get gevents from pkgs (long to build)
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:14:22 +01:00
f627f1b12b cloud-init: rename py3 packages for ubuntu 2021-11-23 20:13:40 +01:00
53fd97651e Fix [full] install instructions 2021-11-23 20:09:52 +01:00
4dd77b4bcc Add gevent to for deployment to get the latest one
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:07:56 +01:00
deffeebd85 gunicorn: use more aggressive multi-threading settings
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-23 20:00:10 +01:00
765e0ba728 Pass py error msg in http headers
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-22 23:22:13 +01:00
12073ac7d8 Simplify cloud-init code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-22 21:53:18 +01:00
6d049935e3 More cloud instructions
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-21 22:38:26 +01:00
7b64c963c4 README: nicer links 2021-11-21 22:37:25 +01:00
6900b9053c Heroku click to deploy
All checks were successful
continuous-integration/drone/push Build is passing
incl. workaround for their weird use of entrypoint
2021-11-21 21:57:06 +01:00
6ec3fb47d1 readabilite: .strip() first to save time
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-15 21:54:07 +01:00
1083f3ffbc crawler: make sure to use HTTPMessage
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-11 10:21:48 +01:00
7eeb1d696c crawler: clean up code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-10 23:25:03 +01:00
e42df98f83 crawler: fix regression brought with 44a6b2591
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-10 23:08:31 +01:00
cb21871c35 crawler: clean up caching code
All checks were successful
continuous-integration/drone/push Build is passing
2021-11-08 22:02:23 +01:00
c71cf5d5ce caching: fix diskcache implementation 2021-11-08 21:57:43 +01:00
44a6b2591d crawler: cleaner http header object import 2021-11-07 19:44:36 +01:00
a890536601 morss: comment code a bit 2021-11-07 18:26:07 +01:00
8de309f2d4 caching: add diskcache backend 2021-11-07 18:15:20 +01:00
cbf7b3f77b caching: simplify sqlite code 2021-11-07 18:14:18 +01:00
1ff7e4103c Docker: make it possible to use it as cli
All checks were successful
continuous-integration/drone/push Build is passing
2021-10-31 17:15:08 +01:00
3f12258e98 Docker: default to non-root exec
All checks were successful
continuous-integration/drone/push Build is passing
2021-10-19 22:20:44 +02:00
d023ec8d73 Change default port to 8000 2021-10-19 22:19:59 +02:00
5473b77416 Post-clean up isort
All checks were successful
continuous-integration/drone/push Build is passing
2021-09-21 08:11:04 +02:00
0365232a73 readabilite: custom xpath for article detection
Some checks failed
continuous-integration/drone/push Build is failing
2021-09-21 08:04:45 +02:00
a523518ae8 cache: avoid name collision 2021-09-21 08:04:45 +02:00
52c48b899f readability: better var names 2021-09-21 08:04:45 +02:00
9649cabb1b morss: do not crash on empty pages 2021-09-21 08:04:45 +02:00
0c29102788 drone: full module install 2021-09-21 08:04:45 +02:00
10535a17c5 cache: fix isort 2021-09-21 08:04:45 +02:00
7d86972e58 Add Redis cache backend 2021-09-21 08:04:45 +02:00
62e04549ac Optdeps in REAMDE and Dockerfile 2021-09-21 08:04:45 +02:00
5da7121a77 Fix Options class behaviour 2021-09-21 08:04:45 +02:00
bb82902ad1 Move cache code to its own file 2021-09-21 08:04:45 +02:00
04afa28fe7 crawler: cache pickle'd array 2021-09-21 08:04:45 +02:00
75bb69f0fd Make mysql optdep 2021-09-21 08:04:45 +02:00
97d9dda547 crawler: support 308 redirects 2021-09-21 08:04:45 +02:00
0c31d9f6db ci: add spell check dict 2021-09-21 08:04:45 +02:00
49e29208ef ci: fix spell check 2021-09-21 08:04:45 +02:00
d8d608a4de ci: fix pylint install 2021-09-21 08:04:45 +02:00
5437e40a15 drone: use alpine image (to benefit from pkgs) 2021-09-21 08:04:45 +02:00
6c1f8da692 ci: added pylint (triggered upon error w/ score < 8 only) 2021-09-21 08:04:45 +02:00
a1a26d8209 README: add ci badge 2021-09-21 08:04:45 +02:00
edbb580f33 ci/cd: fix isort args 2021-09-21 08:04:45 +02:00
4fd730b983 Further isort implementation 2021-09-21 08:04:45 +02:00
198353d6b9 ci/cd attempt 2021-09-21 08:04:45 +02:00
0b3e6d7749 Apply isort 2021-09-21 08:04:23 +02:00
06e0ada95b Allow POST requests 2021-09-08 20:43:21 +02:00
71d9c7a027 README: updated ttrss link 2021-09-07 20:30:18 +02:00
37f5a92b05 wsgi: fix apache / workload 2021-09-06 22:01:48 +02:00
24c26d3850 wsgi: handle url decoding 2021-08-31 21:51:21 +02:00
8f24214915 crawler: better name for custom fns 2021-08-29 00:22:40 +02:00
d5942fe5a7 feeds: fix issues when mode not explicited in ruleset 2021-08-29 00:20:29 +02:00
6f50443995 morss: Options return None instead of False if no match
Better for default fn values
2021-08-29 00:19:09 +02:00
5582fbef31 crawler: comment 2021-08-29 00:18:50 +02:00
da5442a1dc feedify: support any type (json, xml, html) 2021-08-29 00:17:28 +02:00
f9d7794bcc feedifi.ini: json time zone handling 2021-04-23 20:43:00 +02:00
e37c8346d0 feeds: add fallback for time parser 2021-04-22 21:57:16 +02:00
3a1d564992 feeds: fix time zone handling 2021-04-22 21:51:00 +02:00
6880a443e0 crawler: improve CacheHandler code 2021-03-25 23:54:08 +01:00
7342ab26d2 crawler: comment on how urllib works 2021-03-25 23:49:58 +01:00
981da9e66a crawler: SQLITE_PATH point to .db file instead of folder 2021-03-25 23:48:21 +01:00
6ea9d012a2 readme: fix newsreader hook 2021-01-23 21:40:46 +01:00
95d6143636 docker: log to stdout 2021-01-15 23:31:27 +01:00
03cad120d0 README: comment on http timeout 2021-01-14 22:34:57 +01:00
01a7667032 Fix error due to remaining log force code 2021-01-14 00:51:47 +01:00
3e886caaab crawler: drop encoding setting 2020-10-30 22:41:16 +01:00
ad927e03a7 crawler: use regex instead of lxml
Less reliable but should be faster
2020-10-30 22:21:19 +01:00
0efb096fa7 crawler: shift gzip & encoding-fix to intermediary handler 2020-10-30 22:16:51 +01:00
9ab2e488ef crawler: add intermediary handlers 2020-10-30 22:15:35 +01:00
b525ab0d26 crawler: fix typo 2020-10-30 22:12:43 +01:00
fb19b1241f sheet.xsl: update to :format 2020-10-11 22:49:33 +02:00
9d062ef24b README: indicate time unit for env vars 2020-10-03 22:22:20 +02:00
447f62dc45 README: indicate to use build --no-cache --pull by default 2020-10-03 22:18:51 +02:00
18ec10fe44 Use hopefully more-up-to-date pip gunicorn 2020-10-03 21:59:43 +02:00
891c385b69 Dockerfile: no cache for pip to save space 2020-10-03 21:59:17 +02:00
0629bb98fb dockerfile: add python wheel 2020-10-03 21:16:04 +02:00
ae7ba458ce README fixes 2020-10-03 20:56:40 +02:00
bd0bca69fc crawler: ignore ssl via env var 2020-10-03 19:57:08 +02:00
8abd951d40 More sensible default values for cache autotrim (1k entries, 1min) 2020-10-03 19:55:57 +02:00
2514fabd38 Replace memory-leak-prone Uniq with @uniq_wrapper 2020-10-03 19:43:55 +02:00
8cb7002fe6 feeds: make it possible to append empty items
And return the newly appended items, to make it easy to edit them
2020-10-03 16:56:07 +02:00
6966e03bef Clean up itemClass code
To avoid globals()
2020-10-03 16:25:29 +02:00
03a122c41f Dockerfile: add --no-cache to save some space 2020-10-01 22:33:29 +02:00
5cd6c22d73 Reorganise the README file 2020-10-01 22:25:53 +02:00
e1b41b5f64 Typo in README 2020-10-01 00:18:48 +02:00
9ce6acba20 Fix gunicorn related typo 2020-10-01 00:07:41 +02:00
6192ff4081 gunicorn with --preload
To only load the code once (and start autotrim once)
2020-10-01 00:05:39 +02:00
056a1b143f crawler: autotrim: make ctrl+c working 2020-10-01 00:04:36 +02:00
eed949736a crawler: add ability to limit cache size 2020-09-30 23:59:55 +02:00
2fc7cd391c Shift __main__'s wsgi code where it belongs 2020-09-30 23:24:51 +02:00
d9f46b23a6 crawler: default value for MYSQL_HOST (localhost) 2020-09-30 13:17:02 +02:00
bbada0436a Quick guide to ignore SSL certs 2020-09-27 16:48:22 +02:00
039a672f4e wsgi: clean up url reconstruction 2020-09-27 16:28:26 +02:00
b290568e14 README: decent line length
Obtained from the output of:
python -m morss --help | cat
2020-09-15 23:01:42 +02:00
9ecf856f10 Add :resolve to remove (some?) tracking links 2020-09-15 22:57:52 +02:00
504ede624d Logo CC BY-NC-SA 4.0 2020-09-03 13:17:58 +02:00
0d89f0e6f2 Add a logo
B&W edition of the logo at https://morss.it/
2020-08-28 21:28:07 +02:00
56e0c2391d Missing import for served files 2020-08-28 20:53:03 +02:00
679f406a12 Default mimetype for served files 2020-08-28 20:52:43 +02:00
f6d641eeef Serve any file in www/
Also fixes #41
2020-08-28 20:45:39 +02:00
2456dd9bbc Fix broken pieces
Including #43
2020-08-28 19:38:48 +02:00
0f33db248a Add license info in each file 2020-08-26 20:08:22 +02:00
d57f543c7b README: remove todo 2020-08-24 21:17:31 +02:00
fba112147c README: make it clear that the internal server is _very_ basic 2020-08-24 21:14:48 +02:00
8697c3f0df Remove remaining --debug from README 2020-08-24 19:39:27 +02:00
75935114e4 Remove leftover code 2020-08-23 19:07:12 +02:00
5bd2557619 Fix typo in provided .htaccess 2020-08-23 19:01:34 +02:00
598a2591f1 Dockerfile: remove confusing one-liner code 2020-08-23 18:59:16 +02:00
e76ab2b631 Update gunicorn instructions 2020-08-23 18:59:02 +02:00
aa9143302b Remove now-unused isInt code 2020-08-23 18:51:09 +02:00
0d62a7625b Define http port via env vars as well 2020-08-23 18:50:18 +02:00
bd0efb1529 crawler: missing os import 2020-08-23 18:45:44 +02:00
47a17614ef Rename morss/cgi.py into morss/wsgi.py
To avoid name collision with the built-in cgi lib
2020-08-23 18:44:49 +02:00
4dfebe78f7 Pick caching backend via env vars 2020-08-23 18:43:18 +02:00
dcd3e4a675 cgi.py: add missing impots 2020-08-23 18:31:05 +02:00
e968b2ea7f Remove leftover :debug code 2020-08-23 16:59:34 +02:00
0ac590c798 Set MAX_/LIM_* settings via env var 2020-08-23 16:09:58 +02:00
fa1b5aef09 Instructions for DEBUG= use 2020-08-23 15:31:11 +02:00
7f6309f618 README: :silent was explained twice 2020-08-23 14:34:04 +02:00
f65fb45030 :debug completely deprecated in favour of DEBUG= 2020-08-23 14:33:32 +02:00
6dd40e5cc4 cli.py: fix Options code 2020-08-23 14:25:09 +02:00
0acfce5a22 cli.py: remove log 2020-08-23 14:24:57 +02:00
97ccc15db0 cgi.py: rename parseOptions to parse_options 2020-08-23 14:24:23 +02:00
7a560181f7 Use env var for DEBUG 2020-08-23 14:23:45 +02:00
baccd3b22b Move parseOptions to cgi.py
As it is no longer used in cli.py
2020-08-22 00:37:34 +02:00
f79938ab11 Add :silent to readme & argparse 2020-08-22 00:02:08 +02:00
5b8bd47829 cli.py: remove draft code 2020-08-21 23:59:12 +02:00
b5b355aa6e readabilite: increase penalty for high link density 2020-08-21 23:55:04 +02:00
94097f481a sheet.xsl: better handle some corner cases 2020-08-21 23:54:35 +02:00
8161baa7ae sheet.xsl: improve css 2020-08-21 23:54:12 +02:00
bd182bcb85 Move cli code to argParse
Related code changes (incl. :format=xyz)
2020-08-21 23:52:56 +02:00
c7c2c5d749 Removed unused filterOptions code 2020-08-21 23:23:33 +02:00
c6b52e625f split morss.py into __main__/cgi/cli.py
Should hopefully allow cleaner code in the future
2020-08-21 22:17:55 +02:00
c6d3a0eb53 readabilite: clean up code 2020-07-15 00:49:34 +02:00
c628ee802c README: add docker-compose instructions 2020-07-13 20:50:39 +02:00
6021b912ff morss: fix item removal
Usual issue when editing a list while looping over it
2020-07-06 19:25:48 +02:00
f18a128ee6 Change :first for :newest
i.e. toggle default for the more-obvious option
2020-07-06 19:25:17 +02:00
64af86c11e crawler: catch html parsing errors 2020-07-06 12:25:38 +02:00
15951d228c Add :first to NOT sort items by date 2020-07-06 11:39:08 +02:00
c1b1f5f58a morss: restrict iframe use from :get to avoid abuse 2020-06-09 12:33:37 +02:00
985185f47f morss: more flexible feed creator auto-detection 2020-06-08 13:03:24 +02:00
3190d1ec5a feeds: remove useless if(len) before loop 2020-06-02 13:57:45 +02:00
9815794a97 sheet.xsl: make text more self explanatory 2020-05-27 21:42:00 +02:00
758b6861b9 sheet.xsl: fix text alignment 2020-05-27 21:36:11 +02:00
ce4cf01aa6 crawler: clean up encoding detection code 2020-05-27 21:35:24 +02:00
dcfdb75a15 crawler: fix chinese encoding support 2020-05-27 21:34:43 +02:00
4ccc0dafcd Basic help for sub-lib interactive use 2020-05-26 19:34:20 +02:00
2fe3e0b8ee feeds: clean up other stylesheets before putting ours 2020-05-26 19:26:36 +02:00
ad3ba9de1a sheet.xsl: add <select/> to use :firstlink 2020-05-13 12:33:12 +02:00
68c46a1823 morss: remove deprecated twitter/fb link handling 2020-05-13 12:31:09 +02:00
91be2d229e morss: ability to use first link from desc instead of default link 2020-05-13 12:29:53 +02:00
038f267ea2 Rename :theforce into :force 2020-05-13 11:49:15 +02:00
22005065e8 Use etree.tostring 'method' arg
Gives appropriately formatted html code.
Some pages might otherwise be rendered as blank.
2020-05-13 11:44:34 +02:00
7d0d416610 morss: cache articles for 24hrs
Also make it possible to refetch articles, regardless of cache
2020-05-12 21:10:31 +02:00
5dac4c69a1 crawler: more code comments 2020-05-12 20:44:25 +02:00
36e2a1c3fd crawler: increase size limit from 100KiB to 500
I'm looking at you, worldbankgroup.csod.com/ats/careersite/search.aspx
2020-05-12 19:34:16 +02:00
83dd2925d3 readabilite: better parsing
Keeping blank_text keeps the tree more as-it, making the final output closer to expectations
2020-05-12 14:15:53 +02:00
e09d0abf54 morss: remove deprecated peace of code 2020-05-07 16:05:30 +02:00
ff26a560cb Shift safari work around to morss.py 2020-05-07 16:04:54 +02:00
74d7a1eca2 sheet.xsl: fix word wrap 2020-05-06 16:58:28 +02:00
eba295cba8 sheet.xsl: fixes for safari 2020-05-06 12:01:27 +02:00
f27631954e .htaccess: bypass Safari RSS detection 2020-05-06 11:47:24 +02:00
c74abfa2f4 sheet.xsl: use CDATA for js code 2020-05-06 11:46:38 +02:00
1d5272c299 sheet.xsl: allow zooming on mobile 2020-05-04 14:44:43 +02:00
f685139137 crawler: use UPSERT statements
Avoid potential race conditions
2020-05-03 21:27:45 +02:00
73b477665e morss: separate :clip with <hr> instead of stars 2020-05-02 19:19:54 +02:00
b425992783 morss: don't follow alt=rss with custom feeds
To have the same page as with :get=page and to avoid shitty feeds
2020-05-02 19:18:58 +02:00
271ac8f80f crawler: comment code a bit 2020-05-02 19:18:01 +02:00
64e41b807d crawler: handle http:/ (single slash)
Fixing one more corner case! malayalam.oneindia.com
2020-05-02 19:17:15 +02:00
a2c4691090 sheet.xsl: dir=auto for rtl languages (arabic, etc.) 2020-04-29 15:01:33 +02:00
b6000923bc README: clean up deprecated code 2020-04-28 22:31:11 +02:00
27a42c47aa morss: use final request url
Code is not very elegant...
2020-04-28 22:30:21 +02:00
c27c38f7c7 crawler: return dict instead of tuple 2020-04-28 22:29:07 +02:00
a1dc96cb50 feeds: remove mimetype from function call as no longer used 2020-04-28 22:07:25 +02:00
749acc87fc Centralize url clean up in crawler.py 2020-04-28 22:03:49 +02:00
c186188557 README: warning about lxml installation 2020-04-28 21:58:26 +02:00
cb69e3167f crawler: accept non-ascii urls
Covering one more corner case!
2020-04-28 14:47:23 +02:00
c3f06da947 morss: process(): specify encoding for clarity 2020-04-28 14:45:00 +02:00
44a3e0edc4 readabilite: specify in- and out-going encoding 2020-04-28 14:44:35 +02:00
4a9b505499 README: update python lib instructions 2020-04-27 18:12:14 +02:00
818cdaaa9b Make it possible to call sub-libs in non interactive mode
Run `python -m morss.feeds http://lemonde.fr` and so on
2020-04-27 18:00:14 +02:00
2806c64326 Make it possible to directly run sub-libs (feeds, crawler, readabilite)
Run `python -im morss.feeds http://website.sample/rss.xml` and so on
2020-04-27 17:19:31 +02:00
d39d7bb19d sheet.xsl: limit overflow 2020-04-25 15:27:49 +02:00
e5e3746fc6 sheet.xsl: show plain url 2020-04-25 15:27:13 +02:00
960c9d10d6 sheet.xsl: customize output feed form 2020-04-25 15:26:47 +02:00
0e7a5b9780 sheet.xsl: wrap header in <header> 2020-04-25 15:24:57 +02:00
186bedcf62 sheet.xsl: smarter html reparser 2020-04-25 15:22:25 +02:00
5847e18e42 sheet: improved feed address output (w/ c/c) 2020-04-25 15:21:47 +02:00
f6bc23927f readabilite: drop dangerous tags (script, style) 2020-04-25 12:25:02 +02:00
c86572374e readabilite: minimum score requirement 2020-04-25 12:24:36 +02:00
59ef5af9e2 feeds: fix bug when deleting attr in html 2020-04-24 22:12:05 +02:00
6a0531ca03 crawler: randomize user agent 2020-04-24 11:28:39 +02:00
8187876a06 crawler: stop at first alternative link
Should save a few ms and the first one is usually (?) the most relevant/generic
2020-04-23 11:23:45 +02:00
325a373e3e feeds: add SyntaxError catch 2020-04-20 16:15:15 +02:00
2719bd6776 crawler: fix chinese encoding 2020-04-20 16:14:55 +02:00
285e1e5f42 docker: pip install local 2020-04-19 13:25:53 +02:00
41a63900c2 README: improve docker instructions 2020-04-19 13:01:08 +02:00
ec8edb02f1 Various small bug fixes 2020-04-19 12:54:02 +02:00
d01b943597 Remove leftover threading var 2020-04-19 12:51:11 +02:00
b361aa2867 Add timeout to :get 2020-04-19 12:50:26 +02:00
4ce3c7cb32 Small code clean ups 2020-04-19 12:50:05 +02:00
7e45b2611d Disable multi-threading
Impact was mostly negative due to locks
2020-04-19 12:29:52 +02:00
036e5190f1 crawler: remove unused code 2020-04-18 21:40:02 +02:00
e99c5b3b71 morss: more sensible default MAX/LIM values 2020-04-18 17:21:45 +02:00
4f44df8d63 Make all ports default to 8080 2020-04-18 17:15:59 +02:00
497c14db81 Add dockerfile & how to in README 2020-04-18 17:04:44 +02:00
a4e1dba8b7 sheet.xsl: improve url display 2020-04-16 10:33:36 +02:00
7375adce33 sheet.xsl: fix & improve 2020-04-15 23:34:28 +02:00
663212de0a sheet.xsl: various cosmetic improvements 2020-04-15 23:22:45 +02:00
4a2ea1bce9 README: add gunicorn instructions 2020-04-15 22:31:21 +02:00
fe82b19c91 Merge .xsl & html template
Turns out they somehow serve a similar purpose
2020-04-15 22:30:45 +02:00
0b31e97492 morss: remove debug code in http file handler 2020-04-14 23:20:03 +02:00
b0ad7c259d Add README & LICENSE to data_files 2020-04-14 19:34:12 +02:00
bffb23f884 README: how to use cli 2020-04-14 18:21:32 +02:00
59139272fd Auto-detect the location of www/
Either ../www or /usr/share/morss
Adapted README accordingly
2020-04-14 18:07:19 +02:00
39b0a1d7cc setup.py: fix deps & files 2020-04-14 17:36:42 +02:00
65803b328d New git url and updated date in provided index.html 2020-04-13 15:30:32 +02:00
e6b7c0eb33 Fix app definition for uwsgi 2020-04-13 15:30:09 +02:00
67c096ad5b feeds: add fake path to default html parser
Without it, some websites were accidentally matching it (false positives)
2020-04-12 13:00:56 +02:00
f018437544 crawler: make mysql backend thread safe 2020-04-12 12:53:05 +02:00
8e5e8d24a4 Timezone fixes 2020-04-10 20:33:59 +02:00
ee78a7875a morss: focus on the most recent feed items 2020-04-10 16:08:13 +02:00
9e7b9d95ee feeds: properly use html template 2020-04-09 20:00:51 +02:00
987a719c4e feeds: try all parsers regardless of contenttype
Turns out some websites send the wrong contenttype (json for html, html for xml, etc.)
2020-04-09 19:17:51 +02:00
47b33f4baa morss: specify server output encoding 2020-04-09 19:10:45 +02:00
3c7f512583 feeds: handle several errors 2020-04-09 19:09:10 +02:00
a32f5a8536 readabilite: add debug option (also used by :get) 2020-04-09 19:08:13 +02:00
63a06524b7 morss: various encoding fixes 2020-04-09 19:06:51 +02:00
b0f80c6d3c morss: fix csv output encoding 2020-04-09 19:05:50 +02:00
78cea10ead morss: replace :getpage with :get
Also provides readabilite debugging
2020-04-09 18:43:20 +02:00
e5a82ff1f4 crawler: drop auto-referer
Was solving some issues. But creating even more issues.
2020-04-07 10:39:21 +02:00
f3d1f92b39 Detect encoding everytime 2020-04-07 10:38:36 +02:00
7691df5257 Use wrapper for http calls 2020-04-07 10:30:17 +02:00
0ae0dbc175 README: mention csv output 2020-04-07 09:24:32 +02:00
f1d0431e68 morss: drop :html, replaced with :reader
README updated accordingly
2020-04-07 09:23:29 +02:00
a09831415f feeds: fix bug when mimetype matches nothing 2020-04-06 18:53:07 +02:00
bfad6b7a4a readabilite: clean before counting
To remove links which are not kept anyway
2020-04-06 16:55:39 +02:00
6b8c3e51e7 readabilite: fix threshold feature
Awkward typo...
2020-04-06 16:52:06 +02:00
dc9e425247 readabilite: don't clean-out the top 10% nodes
Loosen up the code once again to limit over-kill
2020-04-06 14:26:28 +02:00
2f48e18bb1 readabilite: put scores directly in html node
Probably slower but makes code somewhat cleaner...
2020-04-06 14:21:41 +02:00
31cac921c7 README: remove ref to iTunes 2020-04-05 22:20:33 +02:00
a82ec96eb7 Delete feedify.py leftover code
iTunes integration untested, unreliable and not working...
2020-04-05 22:16:52 +02:00
aad2398e69 feeds: turns out lxml.etree doesn't have drop_tag 2020-04-05 21:50:38 +02:00
eeac630855 crawler: add more "realistic" headers 2020-04-05 21:11:57 +02:00
e136b0feb2 readabilite: loosen the slayer
Previous impl. lead to too many empty results
2020-04-05 20:47:30 +02:00
6cf32af6c0 readabilite: also use BS 2020-04-05 20:46:42 +02:00
568e7d7dd2 feeds: make BS's output bytes for lxml's sake 2020-04-05 20:46:04 +02:00
3617f86e9d morss: make cgi_encore more robust 2020-04-05 16:43:11 +02:00
d90756b337 morss: drop 'keep' option
Because the Firefox behaviour it is working around is no longer in use
2020-04-05 16:37:27 +02:00
40c69f17d2 feeds: parse html with BS
More robust & to make it consistent with :getpage
2020-04-05 16:12:41 +02:00
99461ea185 crawler: fix var name issues (private_cache) 2020-04-05 16:11:36 +02:00
bf86c1e962 crawler: make AutoUA match http(s) type 2020-04-05 16:07:51 +02:00
d20f6237bd crawler: replace ContentNegoHandler with AlternateHandler
More basic. Sends the same headers no matter what. Make requests more "replicable".
Also, drop "text/xml" from RSS contenttype, too broad, matches garbage
2020-04-05 16:05:59 +02:00
8a4d68d72c crawler: drop 'basic' toggle
Can't even remember the use case
2020-04-05 16:03:06 +02:00
e6811138fd morss: use redirected url in :getpage
Still have to find how to do the same thing with feeds...
2020-04-04 20:04:57 +02:00
35b702fffd morss: default values for feed creation 2020-04-04 19:39:32 +02:00
4a88886767 morss: get_page to act as a basic proxy (for iframes) 2020-04-04 16:37:15 +02:00
1653394cf7 morss: cgi_dispatcher to be able to create extra functions 2020-04-04 16:35:16 +02:00
a8a90cf414 morss: move url/options parsing to own function
For future re-use
2020-04-04 16:33:52 +02:00
bdbaf0f8a7 morss/cgi: fix handling of special chars in url 2020-04-04 16:21:37 +02:00
d0e447a2a6 ItemFix: clean up Pocket links 2020-04-04 16:20:39 +02:00
e6817e01b4 sheet.xsl: set font to "sans"
Browsers don't all have the same default font. Overriding for consistency
2020-04-03 17:47:19 +02:00
7c3091d64c morss: code spacing
One of those commits that make me feel useful
2020-03-21 23:41:46 +01:00
37b4e144a9 morss: small fixes
Includes dropping off ftp support
2020-03-21 23:30:18 +01:00
bd4b7b5bb2 morss: convert HTML feeds to XML ones for completeness 2020-03-21 23:27:42 +01:00
68d920d4b5 morss: make FeedFormat more flexible with encoding 2020-03-21 23:26:35 +01:00
758ff404a8 morss: fix cgi_app silent output
*Must* return sth
2020-03-21 23:25:25 +01:00
463530f02c morss: middleware to enforce encoding
bytes are always expected
2020-03-21 23:23:50 +01:00
ec0a28a91d morss: use middleware for wsgi apps 2020-03-21 23:23:21 +01:00
421acb439d morss: make errors more readable over http 2020-03-21 23:08:29 +01:00
42c5d09ccb morss: split "options" var into "raw_options" & "options"
To make it clearer who-is-what
2020-03-21 23:07:07 +01:00
056de12484 morss: add sheet.xsl to file handled by http server 2020-03-21 23:06:28 +01:00
961a31141f morss: fix url fixing 2020-03-21 17:28:00 +01:00
a7b01ee85e readabilite: further html processing instructions fix 2020-03-21 17:23:50 +01:00
26 changed files with 2312 additions and 1336 deletions

15
.drone.yml Normal file

@@ -0,0 +1,15 @@
kind: pipeline
name: default

steps:
- name: isort
  image: python:alpine
  commands:
  - pip install isort
  - isort --check-only --diff .

- name: pylint
  image: alpine
  commands:
  - apk add --no-cache python3 py3-lxml py3-gevent py3-pip py3-wheel py3-pylint py3-enchant hunspell-en
  - pip3 install --no-cache-dir .[full]
  - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8

50
.pylintrc Normal file

@@ -0,0 +1,50 @@
[MASTER]
ignore=CVS
suggestion-mode=yes
extension-pkg-allow-list=lxml.etree
[MESSAGES CONTROL]
disable=missing-function-docstring,
        missing-class-docstring,
        missing-module-docstring,
        wrong-spelling-in-comment,
[REPORTS]
reports=yes
score=yes
[SPELLING]
spelling-dict=en_GB
spelling-ignore-words=morss
[STRING]
check-quote-consistency=yes
check-str-concat-over-line-jumps=yes
[VARIABLES]
allow-global-unused-variables=no
init-import=no
[FORMAT]
expected-line-ending-format=LF
indent-string=' '
max-line-length=120
max-module-lines=1000
[BASIC]
argument-naming-style=snake_case
attr-naming-style=snake_case
class-attribute-naming-style=snake_case
class-const-naming-style=UPPER_CASE
class-naming-style=PascalCase
const-naming-style=UPPER_CASE
function-naming-style=snake_case
inlinevar-naming-style=snake_case
method-naming-style=snake_case
module-naming-style=snake_case
variable-naming-style=snake_case
include-naming-hint=yes
bad-names=foo, bar
good-names=i, j, k

11
Dockerfile Normal file

@@ -0,0 +1,11 @@
FROM alpine:latest
RUN apk add --no-cache python3 py3-pip py3-wheel git py3-lxml py3-gevent py3-zope-interface py3-zope-event
ADD . /app
RUN pip3 install --no-cache-dir /app[full]
USER 1000:1000
ENTRYPOINT ["/bin/sh", "/app/docker-entry.sh"]
CMD ["run"]

421
README.md

@@ -1,10 +1,13 @@
# Morss - Get full-text RSS feeds
_GNU AGPLv3 code_
[![Build Status](https://ci.pictuga.com/api/badges/pictuga/morss/status.svg)](https://ci.pictuga.com/pictuga/morss)
Upstream source code: https://git.pictuga.com/pictuga/morss
Github mirror (for Issues & Pull requests): https://github.com/pictuga/morss
Homepage: https://morss.it/
_GNU AGPLv3 code_
_Provided logo is CC BY-NC-SA 4.0_
Upstream source code: <https://git.pictuga.com/pictuga/morss>
Github mirror (for Issues & Pull requests): <https://github.com/pictuga/morss>
Homepage: <https://morss.it/>
This tool's goal is to get full-text RSS feeds out of stripped RSS feeds,
commonly available on the internet. Indeed most newspapers only make a small
@@ -18,100 +21,182 @@ Morss also provides additional features, such as: .csv and json export, extended
control over output. A strength of morss is its ability to deal with broken
feeds, and to replace tracking links with direct links to the actual content.
Morss can also generate feeds from html and json files (see `feedify.py`), which
Morss can also generate feeds from html and json files (see `feeds.py`), which
for instance makes it possible to get feeds for Facebook or Twitter, using
hand-written rules (ie. there's no automatic detection of links to build feeds).
Please mind that feeds based on html files may stop working unexpectedly, due to
html structure changes on the target website.
Additionally morss can grab the source xml feed of iTunes podcast, and detect
rss feeds in html pages' `<meta>`.
Additionally morss can detect rss feeds in html pages' `<meta>`.
You can use this program online for free at **[morss.it](https://morss.it/)**.
Some features of morss:
- Read RSS/Atom feeds
- Create RSS feeds from json/html pages
- Convert iTunes podcast links into xml links
- Export feeds as RSS/JSON/CSV/HTML
- Fetch full-text content of feed items
- Follow 301/meta redirects
- Recover xml feeds with corrupt encoding
- Supports gzip-compressed http content
- HTTP caching with 3 different backends (in-memory/sqlite/mysql)
- HTTP caching with different backends (in-memory/sqlite/mysql/redis/diskcache)
- Works as server/cli tool
- Deobfuscate various tracking links
## Dependencies
## Install
You do need:
### Python package
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
- [lxml](http://lxml.de/) for xml parsing
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
- [chardet](https://pypi.python.org/pypi/chardet)
- [six](https://pypi.python.org/pypi/six), a dependency of chardet
- pymysql
Simplest way to get these:
Simple install (without optional dependencies)
```shell
pip install -r requirements.txt
pip install git+https://git.pictuga.com/pictuga/morss.git
```
You may also need:
Full installation (including optional dependencies)
- Apache, with python-cgi support, to run on a server
- a fast internet connection
```shell
pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
```
## Arguments
The full install includes mysql, redis and diskcache (possible cache backends).
Otherwise, only in-memory and sqlite3 caches are available. The full install
also includes gunicorn and gevent (for more efficient HTTP handling).
morss accepts some arguments, to lightly alter the output of morss. Arguments
may need to have a value (usually a string or a number). In the different "Use
cases" below is detailed how to pass those arguments to morss.
The dependency `lxml` takes fairly long to install (especially on Raspberry Pi,
as C code needs to be compiled). If possible on your distribution, try
installing it with the system package manager.
The arguments are:
### Docker
- Change what morss does
- `json`: output as JSON
- `proxy`: doesn't fill the articles
- `clip`: stick the full article content under the original feed content (useful for twitter)
- `keep`: by default, morss does drop feed description whenever the full-content is found (so as not to mislead users who use Firefox, since the latter only shows the description in the feed preview, so they might believe morss doens't work), but with this argument, the description is kept
- `search=STRING`: does a basic case-sensitive search in the feed
- Advanced
- `csv`: export to csv
- `indent`: returns indented XML or JSON, takes more place, but human-readable
- `nolink`: drop links, but keeps links' inner text
- `noref`: drop items' link
- `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time
- `debug`: to have some feedback from the script execution. Useful for debugging
- `mono`: disable multithreading while fetching, makes debugging easier
- `theforce`: force download the rss feed and ignore cached http errros
- `silent`: don't output the final RSS (useless on its own, but can be nice when debugging)
- `encoding=ENCODING`: overrides the encoding auto-detection of the crawler. Some web developers did not quite understand the importance of setting charset/encoding tags correctly...
- http server only
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other servers)
- `html`: changes the http content-type to html, so that python cgi erros (written in html) are readable in a web browser
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
- Custom feeds: you can turn any HTML page into a RSS feed using morss, using xpath rules. The article content will be fetched as usual (with readabilite). Please note that you will have to **replace** any `/` in your rule with a `|` when using morss as a webserver
- `items`: (**mandatory** to activate the custom feeds function) xpath rule to match all the RSS entries
- `item_link`: xpath rule relative to `items` to point to the entry's link
- `item_title`: entry's title
- `item_content`: entry's description
- `item_time`: entry's date & time (accepts a wide range of time formats)
Build
## Use cases
```shell
docker build --tag morss https://git.pictuga.com/pictuga/morss.git --no-cache --pull
```
With docker-compose:
```yml
services:
  app:
    build: https://git.pictuga.com/pictuga/morss.git
    image: morss
    ports:
      - '8000:8000'
```
Then execute
```shell
docker-compose build --no-cache --pull
```
### Cloud providers
One-click deployment:
* Heroku: <https://heroku.com/deploy?template=https://github.com/pictuga/morss>
* Google Cloud: <https://deploy.cloud.run/?git_repo=https://github.com/pictuga/morss.git>
Providers supporting `cloud-init` (AWS, Oracle Cloud Infrastructure), based on Ubuntu:
``` yml
#cloud-config

packages:
- python3-pip
- python3-wheels
- python3-lxml
- python3-gevent
- python3-zope.interface
- python3-zope.event
- git
- ca-certificates

write_files:
- path: /etc/environment
  content: |
    DEBUG=1
    CACHE=diskcache
    CACHE_SIZE=1073741824

runcmd:
- update-ca-certificates
- pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
- gunicorn --bind 0.0.0.0:${PORT:-8000} --workers ${WORKERS:-4} --worker-class=gevent --preload --access-logfile - morss
```
## Run
morss will auto-detect what "mode" to use.
### Running on a server
### Running on/as a server
Set up the server as indicated below, then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache
redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny RSS](https://tt-rss.org/), and most probably
other clients.
#### Using Docker
Run
```shell
docker run -p 8000:8000 morss
```
With docker-compose:
```shell
docker-compose up
```
#### Using Gunicorn
```shell
gunicorn --preload morss
```
#### Using uWSGI
Running this command should do:
```shell
uwsgi --http :8000 --plugin python --wsgi-file main.py
```
#### Using morss' internal HTTP server
Morss can run its own, **very basic**, HTTP server, meant for debugging mostly.
The latter should start when you run morss without any argument, on port 8000.
I'd highly recommend using gunicorn or something similar for better
performance.
```shell
morss
```
You can change the port using environment variables like this `PORT=9000 morss`.
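
If you would rather wire the WSGI application up yourself, a minimal sketch
using only the standard library could look like this. It assumes the `PORT`
convention described above and the `morss.wsgi.application` object that
`main.py` (shown further down) imports; it is not how morss' own server is
implemented.

```python
import os
from wsgiref.simple_server import make_server

from morss.wsgi import application  # same WSGI app object as used by main.py

port = int(os.getenv('PORT', 8000))  # assumed default, matching the docs above
make_server('0.0.0.0', port, application).serve_forever()
```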
#### Via mod_cgi/FastCGI with Apache/nginx
For this, you'll want to change a bit the architecture of the files, for example
into something like this.
```
/
├── cgi
@@ -138,50 +223,24 @@ method uses HTTP calls to fetch the RSS feeds, which will be handled through
Please pay attention to `main.py` permissions for it to be executable. Also
ensure that the provided `/www/.htaccess` works well with your server.
#### Using uWSGI
Running this command should do:
```shell
uwsgi --http :9090 --plugin python --wsgi-file main.py
```
However, one problem might be how to serve the provided `index.html` file if it
isn't in the same directory. Therefore you can add this at the end of the
command to point to another directory `--pyargv '--root ../../www/'`.
#### Using morss' internal HTTP server
Morss can run its own HTTP server. The later should start when you run morss
without any argument, on port 8080.
You can change the port and the location of the `www/` folder like this `python -m morss 9000 --root ../../www`.
#### Passing arguments
Then visit:
```
http://PATH/TO/MORSS/[main.py/][:argwithoutvalue[:argwithvalue=value[...]]]/FEEDURL
```
For example: `http://morss.example/:clip/https://twitter.com/pictuga`
*(Brackets indicate optional text)*
The `main.py` part is only needed if your server doesn't support the Apache redirect rule set in the provided `.htaccess`.
Works like a charm with [Tiny Tiny RSS](http://tt-rss.org/redmine/projects/tt-rss/wiki), and most probably other clients.
### As a CLI application
Run:
```
python[2.7] -m morss [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
```
For example: `python -m morss debug http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss --clip http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
If using Docker:
```shell
docker run morss --clip http://feeds.bbci.co.uk/news/rss.xml
```
### As a newsreader hook
To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
@@ -189,17 +248,21 @@ To use it, the newsreader [Liferea](http://lzone.de/liferea/) is required
scripts can be run on top of the RSS feed, using its
[output](http://lzone.de/liferea/scraping.htm) as an RSS feed.
To use this script, you have to enable "(Unix) command" in liferea feed settings, and use the command:
To use this script, you have to enable "(Unix) command" in liferea feed
settings, and use the command:
```
[python[2.7]] PATH/TO/MORSS/main.py [argwithoutvalue] [argwithvalue=value] [...] FEEDURL
morss [--argwithoutvalue] [--argwithvalue=value] [...] FEEDURL
```
For example: `python2.7 PATH/TO/MORSS/main.py http://feeds.bbci.co.uk/news/rss.xml`
For example: `morss http://feeds.bbci.co.uk/news/rss.xml`
*(Brackets indicate optional text)*
### As a python library
Quickly get a full-text feed:
```python
>>> import morss
>>> xml_string = morss.process('http://feeds.bbci.co.uk/news/rss.xml')
@@ -208,6 +271,7 @@ Quickly get a full-text feed:
```
Using cache and passing arguments:
```python
>>> import morss
>>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
@@ -223,6 +287,7 @@ possible to call the simpler functions, to have more control on what's happening
under the hood.
Doing it step-by-step:
```python
import morss, morss.crawler
@@ -230,46 +295,155 @@ url = 'http://newspaper.example/feed.xml'
options = morss.Options(csv=True) # arguments
morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location
rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
output = morss.Format(rss, options) # formats final feed
output = morss.FeedFormat(rss, options, 'unicode') # formats final feed
```
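
The `'unicode'` output of `FeedFormat` in the snippet above should be a plain
string, so it can, for instance, be written straight to a file (a minimal
illustration reusing the variable names from the snippet):

```python
with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(output)  # `output` comes from the FeedFormat call above
```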
## Cache information
## Arguments and settings
morss uses caching to make loading faster. There are 2 possible cache backends
(visible in `morss/crawler.py`):
### Arguments
- `SQLiteCache`: sqlite3 cache. Default file location is in-memory (i.e. it will
be cleared every time the program is run
- `MySQLCacheHandler`: /!\ Does NOT support multi-threading
morss accepts some arguments, to lightly alter the output of morss. Arguments
may need to have a value (usually a string or a number). How to pass those
arguments to morss is explained in Run above.
## Configuration
### Length limitation
The list of arguments can be obtained by running `morss --help`
```
usage: morss [-h] [--post STRING] [--xpath XPATH]
[--format {rss,json,html,csv}] [--search STRING] [--clip]
[--indent] [--cache] [--force] [--proxy] [--newest] [--firstlink]
[--resolve] [--items XPATH] [--item_link XPATH]
[--item_title XPATH] [--item_content XPATH] [--item_time XPATH]
[--nolink] [--noref] [--silent]
url
Get full-text RSS feeds
positional arguments:
url feed url
optional arguments:
-h, --help show this help message and exit
--post STRING POST request
--xpath XPATH xpath rule to manually detect the article
output:
--format {rss,json,html,csv}
output format
--search STRING does a basic case-sensitive search in the feed
--clip stick the full article content under the original feed
content (useful for twitter)
--indent returns indented XML or JSON, takes more space, but
human-readable
action:
--cache only take articles from the cache (ie. don't grab new
articles' content), so as to save time
--force force refetch the rss feed and articles
--proxy doesn't fill the articles
--newest return the feed items in chronological order (morss
otherwise shows the items by appearing order)
--firstlink pull the first article mentioned in the description
instead of the default link
--resolve replace tracking links with direct links to articles
(not compatible with --proxy)
custom feeds:
--items XPATH (mandatory to activate the custom feeds function)
xpath rule to match all the RSS entries
--item_link XPATH xpath rule relative to items to point to the entry's
link
--item_title XPATH entry's title
--item_content XPATH entry's content
--item_time XPATH entry's date & time (accepts a wide range of time
formats)
misc:
--nolink drop links, but keeps links' inner text
--noref drop items' link
--silent don't output the final RSS (useless on its own, but
can be nice when debugging)
GNU AGPLv3 code
```
Further HTTP-only options:
- `callback=NAME`: for JSONP calls
- `cors`: allow Cross-origin resource sharing (allows XHR calls from other
servers)
- `txt`: changes the http content-type to txt (for faster "`view-source:`")
### Environment variables
To pass environment variables:
- Docker-cli: `docker run -p 8000:8000 morss --env KEY=value`
- docker-compose: add an `environment:` section in the .yml file
- Gunicorn/uWSGI/CLI: prepend `KEY=value` before the command
- Apache: via the `SetEnv` instruction (see sample `.htaccess` provided)
Generic:
- `DEBUG=1`: to have some feedback from the script execution. Useful for
debugging.
- `IGNORE_SSL=1`: to ignore SSL certs when fetching feeds and articles
- `DELAY` (seconds) sets the browser cache delay, only for HTTP clients
- `TIMEOUT` (seconds) sets the HTTP timeout when fetching rss feeds and articles
- `WORKERS` (number) sets the number of gunicorn workers to use
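
As an illustration only, settings like these are typically read with
`os.getenv` plus a fallback. The `DEBUG` and `TIMEOUT` defaults below are
placeholders rather than morss' hard-coded values; the `WORKERS` default of 4
matches `docker-entry.sh` and `app.json` further down.

```python
import os

DEBUG = bool(int(os.getenv('DEBUG', 0)))  # placeholder default
TIMEOUT = int(os.getenv('TIMEOUT', 4))    # seconds, placeholder default
WORKERS = int(os.getenv('WORKERS', 4))    # gunicorn workers, default as in docker-entry.sh
```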
When parsing long feeds, with a lot of items (100+), morss might take a lot of
time to parse it, or might even run into a memory overflow on some shared
hosting plans (limits around 10Mb), in which case you might want to adjust the
different values at the top of the script.
below settings via environment variables.
- `MAX_TIME` sets the maximum amount of time spent *fetching* articles, more time might be spent taking older articles from cache. `-1` for unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited. More articles will be taken from cache following the nexts settings.
- `LIM_TIME` sets the maximum amount of time spent working on the feed (whether or not it's already cached). Articles beyond that limit will be dropped from the feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of article checked, limiting both the number of articles fetched and taken from cache. Articles beyond that limit will be dropped from the feed, even if they're cached. `-1` for unlimited.
Also, if the request takes too long to process, the http request might be
discarded. See relevant config for
[gunicorn](https://docs.gunicorn.org/en/stable/settings.html#timeout) or
[nginx](http://nginx.org/en/docs/http/ngx_http_proxy_module.html#proxy_read_timeout).
### Other settings
- `MAX_TIME` (seconds) sets the maximum amount of time spent *fetching*
articles, more time might be spent taking older articles from cache. `-1` for
unlimited.
- `MAX_ITEM` sets the maximum number of articles to fetch. `-1` for unlimited.
More articles will be taken from cache following the next settings.
- `LIM_TIME` (seconds) sets the maximum amount of time spent working on the feed
(whether or not it's already cached). Articles beyond that limit will be dropped
from the feed. `-1` for unlimited.
- `LIM_ITEM` sets the maximum number of articles checked, limiting both the
number of articles fetched and taken from cache. Articles beyond that limit will
be dropped from the feed, even if they're cached. `-1` for unlimited.
- `DELAY` sets the browser cache delay, only for HTTP clients
- `TIMEOUT` sets the HTTP timeout when fetching rss feeds and articles
- `THREADS` sets the number of threads to use. `1` makes no use of multithreading.
morss uses caching to make loading faster. There are 3 possible cache backends:
- `(nothing/default)`: a simple python in-memory dict-like object.
- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
will be cleared every time the program is run). Path can be defined with
`SQLITE_PATH`.
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
- `CACHE=redis`: Redis cache. Connection can be defined with the following
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
`DISKCAHE_DIR`.
To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 1k
entries. NB. When using `diskcache`, this is the cache max size in Bytes.
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
down to the number of items set in `CACHE_SIZE`). Defaults to 1min.
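
For example, here is a minimal sketch of picking the sqlite backend with a
persistent file from Python. The environment variables must be set before
importing morss, since the backend is selected at import time (see
`morss/caching.py` shown further down); the path and size are arbitrary.

```python
import os

os.environ['CACHE'] = 'sqlite'
os.environ['SQLITE_PATH'] = '/tmp/morss-cache.db'
os.environ['CACHE_SIZE'] = '10000'  # keep roughly 10k entries

import morss  # now uses the sqlite cache configured above
```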
### Content matching
The content of articles is grabbed with our own readability fork. This means
that most of the time the right content is matched. However sometimes it fails,
therefore some tweaking is required. Most of the time, what has to be done is to
add some "rules" in the main script file in *readability* (not in morss).
add some "rules" in the main script file in `readabilite.py` (not in morss).
Most of the time, when hardly anything is matched, it means that the main content
of the article is made of images, videos, pictures, etc., which readability
@@ -280,14 +454,3 @@ morss will also try to figure out whether the full content is already in place
(for those websites which understood the whole point of RSS feeds). However this
detection is very simple, and only works if the actual content is put in the
"content" section in the feed and not in the "summary" section.
***
## Todo
You can contribute to this project. If you're not sure what to do, you can pick
from this list:
- Add ability to run morss.py as an update daemon
- Add ability to use custom xpath rule instead of readability
- More ideas here <https://github.com/pictuga/morss/issues/15>

21
app.json Normal file

@@ -0,0 +1,21 @@
{
    "stack": "container",
    "env": {
        "DEBUG": {
            "value": 1,
            "required": false
        },
        "WORKERS": {
            "value": 4,
            "required": false
        },
        "CACHE": {
            "value": "diskcache",
            "required": false
        },
        "CACHE_SIZE": {
            "value": 1073741824,
            "required": false
        }
    }
}

12
docker-entry.sh Normal file

@@ -0,0 +1,12 @@
#! /bin/sh
if [ "$1" = "sh" ] || [ "$1" = "bash" ]; then
exec $@
elif [ -z "$1" ] || [ "$@" = "run" ]; then
gunicorn --bind 0.0.0.0:${PORT:-8000} --workers ${WORKERS:-4} --worker-class=gevent --preload --access-logfile - morss
else
morss $@
fi

3
heroku.yml Normal file

@@ -0,0 +1,3 @@
build:
  docker:
    web: Dockerfile

20
main.py

@@ -1,6 +1,24 @@
#!/usr/bin/env python
from morss import main, cgi_wrapper as application
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
from morss.__main__ import main
from morss.wsgi import application
if __name__ == '__main__':
    main()


@@ -1,2 +1,23 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `import morss`
# pylint: disable=unused-import,unused-variable
from .morss import *
from .wsgi import application


@@ -1,5 +1,48 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
# ran on `python -m morss`
from .morss import main
import os
import sys
from . import cli, wsgi
from .morss import MorssException
def main():
    if 'REQUEST_URI' in os.environ:
        # mod_cgi (w/o file handler)
        wsgi.cgi_handle_request()

    elif len(sys.argv) <= 1:
        # start internal (basic) http server (w/ file handler)
        wsgi.cgi_start_server()

    else:
        # as a CLI app
        try:
            cli.cli_app()

        except (KeyboardInterrupt, SystemExit):
            raise

        except Exception as e:
            print('ERROR: %s' % e.message)


if __name__ == '__main__':
    main()

213
morss/caching.py Normal file

@@ -0,0 +1,213 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import pickle
import threading
import time
from collections import OrderedDict
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
class BaseCache:
    """ Subclasses must behave like a dict """

    def trim(self):
        pass

    def autotrim(self, delay=CACHE_LIFESPAN):
        # trim the cache every so often
        self.trim()

        t = threading.Timer(delay, self.autotrim)
        t.daemon = True
        t.start()

    def __contains__(self, url):
        try:
            self[url]

        except KeyError:
            return False

        else:
            return True


try:
    import sqlite3 # isort:skip
except ImportError:
    pass


class SQLiteCache(BaseCache):
    def __init__(self, path=':memory:'):
        self.con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)

        with self.con:
            self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
            self.con.execute('pragma journal_mode=WAL')

        self.trim()

    def __del__(self):
        self.con.close()

    def trim(self):
        with self.con:
            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))

    def __getitem__(self, key):
        row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()

        if not row:
            raise KeyError

        return row[1]

    def __setitem__(self, key, data):
        with self.con:
            self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))


try:
    import pymysql.cursors # isort:skip
except ImportError:
    pass


class MySQLCacheHandler(BaseCache):
    def __init__(self, user, password, database, host='localhost'):
        self.user = user
        self.password = password
        self.database = database
        self.host = host

        with self.cursor() as cursor:
            cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')

        self.trim()

    def cursor(self):
        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()

    def trim(self):
        with self.cursor() as cursor:
            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))

    def __getitem__(self, key):
        cursor = self.cursor()
        cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
        row = cursor.fetchone()

        if not row:
            raise KeyError

        return row[1]

    def __setitem__(self, key, data):
        with self.cursor() as cursor:
            cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
                (key, data, time.time(), data, time.time()))


class CappedDict(OrderedDict, BaseCache):
    def trim(self):
        if CACHE_SIZE >= 0:
            for i in range( max( len(self) - CACHE_SIZE , 0 )):
                self.popitem(False)

    def __setitem__(self, key, data):
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        if key in self:
            del self[key]

        OrderedDict.__setitem__(self, key, data)


try:
    import redis # isort:skip
except ImportError:
    pass


class RedisCacheHandler(BaseCache):
    def __init__(self, host='localhost', port=6379, db=0, password=None):
        self.r = redis.Redis(host=host, port=port, db=db, password=password)

    def __getitem__(self, key):
        return self.r.get(key)

    def __setitem__(self, key, data):
        self.r.set(key, data)


try:
    import diskcache # isort:skip
except ImportError:
    pass


class DiskCacheHandler(BaseCache):
    def __init__(self, directory=None, **kwargs):
        self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)

    def __del__(self):
        self.cache.close()

    def trim(self):
        self.cache.cull()

    def __getitem__(self, key):
        return self.cache[key]

    def __setitem__(self, key, data):
        self.cache.set(key, data)


if 'CACHE' in os.environ:
    if os.environ['CACHE'] == 'mysql':
        default_cache = MySQLCacheHandler(
            user = os.getenv('MYSQL_USER'),
            password = os.getenv('MYSQL_PWD'),
            database = os.getenv('MYSQL_DB'),
            host = os.getenv('MYSQL_HOST', 'localhost')
        )

    elif os.environ['CACHE'] == 'sqlite':
        default_cache = SQLiteCache(
            os.getenv('SQLITE_PATH', ':memory:')
        )

    elif os.environ['CACHE'] == 'redis':
        default_cache = RedisCacheHandler(
            host = os.getenv('REDIS_HOST', 'localhost'),
            port = int(os.getenv('REDIS_PORT', 6379)),
            db = int(os.getenv('REDIS_DB', 0)),
            password = os.getenv('REDIS_PWD', None)
        )

    elif os.environ['CACHE'] == 'diskcache':
        default_cache = DiskCacheHandler(
            directory = os.getenv('DISKCAHE_DIR', '/tmp/morss-diskcache'),
            size_limit = CACHE_SIZE # in Bytes
        )

else:
    default_cache = CappedDict()
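
Whatever backend gets selected, `default_cache` is meant to be used like a
plain dict, as this small, illustrative sketch shows (key and value are made
up; the in-memory `CappedDict` is the fallback backend):

```python
from morss.caching import CappedDict

cache = CappedDict()

cache['http://example.com/feed.xml'] = b'<rss/>'  # __setitem__
print('http://example.com/feed.xml' in cache)     # membership test, like a dict
print(cache['http://example.com/feed.xml'])       # __getitem__, raises KeyError if absent
```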

71
morss/cli.py Normal file

@@ -0,0 +1,71 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import os.path
import sys
from .morss import FeedFetch, FeedFormat, FeedGather, Options
def cli_app():
parser = argparse.ArgumentParser(
prog='morss',
description='Get full-text RSS feeds',
epilog='GNU AGPLv3 code'
)
parser.add_argument('url', help='feed url')
parser.add_argument('--post', action='store', type=str, metavar='STRING', help='POST request')
parser.add_argument('--xpath', action='store', type=str, metavar='XPATH', help='xpath rule to manually detect the article')
group = parser.add_argument_group('output')
group.add_argument('--format', default='rss', choices=('rss', 'json', 'html', 'csv'), help='output format')
group.add_argument('--search', action='store', type=str, metavar='STRING', help='does a basic case-sensitive search in the feed')
group.add_argument('--clip', action='store_true', help='stick the full article content under the original feed content (useful for twitter)')
group.add_argument('--indent', action='store_true', help='returns indented XML or JSON, takes more space, but human-readable')
group = parser.add_argument_group('action')
group.add_argument('--cache', action='store_true', help='only take articles from the cache (ie. don\'t grab new articles\' content), so as to save time')
group.add_argument('--force', action='store_true', help='force refetch the rss feed and articles')
group.add_argument('--proxy', action='store_true', help='doesn\'t fill the articles')
group.add_argument('--newest', action='store_true', help='return the feed items in chronological order (morss otherwise shows the items in the order they appear in the feed)')
group.add_argument('--firstlink', action='store_true', help='pull the first article mentioned in the description instead of the default link')
group.add_argument('--resolve', action='store_true', help='replace tracking links with direct links to articles (not compatible with --proxy)')
group = parser.add_argument_group('custom feeds')
group.add_argument('--items', action='store', type=str, metavar='XPATH', help='(mandatory to activate the custom feeds function) xpath rule to match all the RSS entries')
group.add_argument('--item_link', action='store', type=str, metavar='XPATH', help='xpath rule relative to items to point to the entry\'s link')
group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
group = parser.add_argument_group('misc')
group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
group.add_argument('--noref', action='store_true', help='drop items\' link')
group.add_argument('--silent', action='store_true', help='don\'t output the final RSS (useless on its own, but can be nice when debugging)')
options = Options(vars(parser.parse_args()))
url = options.url
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options, 'unicode')
if not options.silent:
print(out)
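
The command-line entry point above is a thin wrapper around the library; a rough Python equivalent, assuming an example feed URL:

    from morss.morss import FeedFetch, FeedFormat, FeedGather, Options

    options = Options({'format': 'rss', 'indent': True})   # same keys as the CLI flags
    url, rss = FeedFetch('https://www.nytimes.com/', options)
    rss = FeedGather(rss, url, options)
    print(FeedFormat(rss, options, 'unicode'))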


@@ -1,21 +1,52 @@
import sys
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import zlib
from io import BytesIO, StringIO
import os
import pickle
import random
import re
import chardet
from cgi import parse_header
import lxml.html
import sys
import time
import zlib
from cgi import parse_header
from collections import OrderedDict
from io import BytesIO, StringIO
import chardet
from .caching import default_cache
try:
# python 2
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
import mimetools
from urllib import quote
from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list)
from urlparse import urlparse, urlunparse
except ImportError:
# python 3
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
import email
from email import message_from_string
from http.client import HTTPMessage
from urllib.parse import quote, urlparse, urlunparse
from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list)
try:
# python 2
@@ -27,13 +58,59 @@ except NameError:
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
DEFAULT_UAS = [
#https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
PROTOCOL = ['http', 'https']
def get(*args, **kwargs):
return adv_get(*args, **kwargs)['data']
def adv_get(url, post=None, timeout=None, *args, **kwargs):
url = sanitize_url(url)
if post is not None:
post = post.encode('utf-8')
if timeout is None:
con = custom_opener(*args, **kwargs).open(url, data=post)
else:
con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)
data = con.read()
contenttype = con.info().get('Content-Type', '').split(';')[0]
encoding = detect_encoding(data, con)
return {
'data': data,
'url': con.geturl(),
'con': con,
'contenttype': contenttype,
'encoding': encoding
}
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
handlers = []
# as per urllib2 source code, these Handlers are added first
@@ -43,28 +120,122 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
# HTTPDefaultErrorHandler, HTTPRedirectHandler,
# FTPHandler, FileHandler, HTTPErrorProcessor]
# & HTTPSHandler
#
# when processing a request:
# (1) all the *_request are run
# (2) the *_open are run until sth is returned (other than None)
# (3) all the *_response are run
#
# During (3), if an http error occurs (i.e. not a 2XX response code), the
# http_error_* are run until sth is returned (other than None). If they all
# return nothing, a python error is raised
#handlers.append(DebugHandler())
handlers.append(SizeLimitHandler(100*1024)) # 100KiB
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
handlers.append(HTTPCookieProcessor())
handlers.append(GZIPHandler())
handlers.append(HTTPAllRedirectHandler())
handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA))
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler())
if not basic:
handlers.append(AutoRefererHandler())
if follow:
handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(EncodingFixHandler(encoding))
if accept:
handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
handlers.append(CacheHandler(force_min=delay))
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))
return build_opener(*handlers)
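
For reference, a short sketch of calling the opener factory above directly (the URL is only an example):

    opener = custom_opener(follow='rss', policy='refresh')   # handlers wired up as listed above
    resp = opener.open('https://morss.it/')
    print(resp.code, resp.info().get('Content-Type'))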
def is_ascii(string):
# there's a native function in py3, but this home-made fix keeps backward compatibility
try:
string.encode('ascii')
except UnicodeError:
return False
else:
return True
def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes):
url = url.decode()
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
# https://stackoverflow.com/a/4391299
parts = list(urlparse(url))
for i in range(len(parts)):
if not is_ascii(parts[i]):
if i == 1:
parts[i] = parts[i].encode('idna').decode('ascii')
else:
parts[i] = quote(parts[i].encode('utf-8'))
return urlunparse(parts)
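
A couple of illustrative inputs for sanitize_url above (assumed values):

    sanitize_url(b'example.com/a page')   # bytes, missing scheme, space
    # -> 'http://example.com/a%20page'
    sanitize_url('http:/example.com')     # badly formatted scheme
    # -> 'http://example.com'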
class RespDataHandler(BaseHandler):
" Make it easier to use the reponse body "
def data_reponse(self, req, resp, data):
pass
def http_response(self, req, resp):
# read data
data = resp.read()
# process data and use returned content (if any)
data = self.data_response(req, resp, data) or data
# reformat the stuff
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class RespStrHandler(RespDataHandler):
" Make it easier to use the _decoded_ reponse body "
def str_reponse(self, req, resp, data_str):
pass
def data_response(self, req, resp, data):
#decode
enc = detect_encoding(data, resp)
data_str = data.decode(enc, 'replace')
#process
data_str = self.str_response(req, resp, data_str)
# return
data = data_str.encode(enc) if data_str is not None else data
#return
return data
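
With the two helper classes above, a handler only needs to implement data_response() or str_response(); a toy subclass, purely illustrative:

    class LowercaseHandler(RespStrHandler):
        " Hypothetical example: lower-case every decoded text response "
        def str_response(self, req, resp, data_str):
            return data_str.lower()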
class DebugHandler(BaseHandler):
handler_order = 2000
@@ -85,7 +256,7 @@ class SizeLimitHandler(BaseHandler):
handler_order = 450
def __init__(self, limit=5*1024^2):
def __init__(self, limit=5*1024**2):
self.limit = limit
def http_response(self, req, resp):
@@ -106,32 +277,29 @@ def UnGzip(data):
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
class GZIPHandler(BaseHandler):
class GZIPHandler(RespDataHandler):
def http_request(self, req):
req.add_unredirected_header('Accept-Encoding', 'gzip')
return req
def http_response(self, req, resp):
def data_response(self, req, resp, data):
if 200 <= resp.code < 300:
if resp.headers.get('Content-Encoding') == 'gzip':
data = resp.read()
data = UnGzip(data)
resp.headers['Content-Encoding'] = 'identity'
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
https_request = http_request
return UnGzip(data)
def detect_encoding(data, resp=None):
enc = detect_raw_encoding(data, resp)
if enc.lower() == 'gb2312':
enc = 'gbk'
return enc
def detect_raw_encoding(data, resp=None):
if resp is not None:
enc = resp.headers.get('charset')
if enc is not None:
@@ -156,32 +324,9 @@ def detect_encoding(data, resp=None):
return 'utf-8'
class EncodingFixHandler(BaseHandler):
def __init__(self, encoding=None):
self.encoding = encoding
def http_response(self, req, resp):
maintype = resp.info().get('Content-Type', '').split('/')[0]
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
if not self.encoding:
enc = detect_encoding(data, resp)
else:
enc = self.encoding
if enc:
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class EncodingFixHandler(RespStrHandler):
def str_response(self, req, resp, data_str):
return data_str
class UAHandler(BaseHandler):
@@ -196,83 +341,69 @@ class UAHandler(BaseHandler):
https_request = http_request
class AutoRefererHandler(BaseHandler):
class BrowserlyHeaderHandler(BaseHandler):
""" Add more headers to look less suspicious """
def http_request(self, req):
req.add_unredirected_header('Referer', 'http://%s' % req.host)
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
return req
https_request = http_request
class ContentNegociationHandler(BaseHandler):
" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
def iter_html_tag(html_str, tag_name):
" To avoid parsing whole pages when looking for a simple tag "
def __init__(self, accept=None, strict=False):
self.accept = accept
self.strict = strict
re_tag = r'<%s(\s*[^>])*>' % tag_name
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
def http_request(self, req):
if self.accept is not None:
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
for tag_match in re.finditer(re_tag, html_str):
attr_match = re.findall(re_attr, tag_match.group(0))
string = ','.join(self.accept)
if attr_match is not None:
yield dict(attr_match)
if self.strict:
string += ',*/*;q=0.9'
req.add_unredirected_header('Accept', string)
class AlternateHandler(RespStrHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
return req
def __init__(self, follow=None):
self.follow = follow or []
def http_response(self, req, resp):
def str_response(self, req, resp, data_str):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# oops, not what we were looking for, let's see if the html page suggests an alternative page of the right type
data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
if link.get('type', '') in self.accept:
for link in iter_html_tag(data_str[:10000], 'link'):
if (link.get('rel') == 'alternate'
and link.get('type') in self.follow
and 'href' in link):
resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_request = http_request
https_response = http_response
break
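
What iter_html_tag above yields, on an assumed snippet:

    html = '<link rel="alternate" type="application/rss+xml" href="/feed.xml">'
    for attrs in iter_html_tag(html, 'link'):
        print(attrs)
    # -> {'rel': 'alternate', 'type': 'application/rss+xml', 'href': '/feed.xml'}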
class HTTPEquivHandler(BaseHandler):
class HTTPEquivHandler(RespStrHandler):
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
handler_order = 600
def http_response(self, req, resp):
def str_response(self, req, resp, data_str):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
for meta in iter_html_tag(data_str[:10000], 'meta'):
if 'http-equiv' in meta and 'content' in meta:
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPAllRedirectHandler(HTTPRedirectHandler):
def http_error_308(self, req, fp, code, msg, headers):
return self.http_error_301(req, fp, 301, msg, headers)
class HTTPRefreshHandler(BaseHandler):
@@ -297,139 +428,99 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response
default_cache = {}
def parse_headers(text=u'\n\n'):
if sys.version_info[0] >= 3:
# python 3
return message_from_string(text, _class=HTTPMessage)
else:
# python 2
return HTTPMessage(StringIO(text))
def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg
return resp
class CacheHandler(BaseHandler):
" Cache based on etags/last-modified "
private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
privacy = 'private' # Websites can indicate whether the page should be cached
# by CDNs (e.g. it shouldn't be for
# private/confidential/user-specific pages). With this
# setting, decide whether you want the cache to behave
# like a CDN (i.e. don't cache private pages, 'public'),
# or like an end-user's browser, which may also cache
# private pages ('private'). If unsure, 'public' is the
# safest bet, but many websites abuse this feature...
# NB. This overrides all the other min/max/policy settings.
handler_order = 499
def __init__(self, cache=None, force_min=None):
def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
self.cache = cache or default_cache
self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
self.force_min = force_min
self.force_max = force_max
self.policy = policy # can be cached/refresh/offline/None (default)
# Servers indicate how long they think their content is "valid". With
# this parameter (force_min/max, expressed in seconds), we can override
# the validity period (i.e. bypassing http headers)
# Special choices, via "policy":
# cached: use the cache no matter what (and fetch the page online if
# not present in cache)
# refresh: valid for zero seconds, i.e. force refresh
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead
# None: just follow protocols
# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
def load(self, url):
try:
out = list(self.cache[url])
data = pickle.loads(self.cache[url])
except KeyError:
out = [None, None, unicode(), bytes(), 0]
if sys.version_info[0] >= 3:
out[2] = email.message_from_string(out[2] or unicode()) # headers
else:
out[2] = mimetools.Message(StringIO(out[2] or unicode()))
return out
def save(self, url, code, msg, headers, data, timestamp):
self.cache[url] = (code, msg, unicode(headers), data, timestamp)
def http_request(self, req):
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
if 'etag' in headers:
req.add_unredirected_header('If-None-Match', headers['etag'])
if 'last-modified' in headers:
req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
return req
def http_open(self, req):
(code, msg, headers, data, timestamp) = self.load(req.get_full_url())
# some info needed to process everything
cache_control = parse_http_list(headers.get('cache-control', ()))
cache_control += parse_http_list(headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - timestamp
# list in a simple way what to do when
if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
# we're just in the middle of a dirty trick, use cache
pass
elif self.force_min == -2:
if code is not None:
# already in cache, perfect, use cache
pass
else:
headers['Morss'] = 'from_cache'
resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
resp.msg = 'Conflict'
return resp
elif code is None:
# cache empty, refresh
return None
elif self.force_min == -1:
# force use cache
pass
elif self.force_min == 0:
# force refresh
return None
elif code == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
# use force_min=0 if you want to bypass this (needed for a proper refresh)
pass
elif self.force_min is None and ('no-cache' in cc_list
or 'no-store' in cc_list
or ('private' in cc_list and not self.private)):
# kindly follow web servers indications, refresh
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust him, if not, use force_min=0), use cache
pass
elif self.force_min is not None and self.force_min > cache_age:
# still recent enough for us, use cache
pass
data = None
else:
# according to the www, we have to refresh when nothing is said
return None
data['headers'] = parse_headers(data['headers'] or unicode())
# return the cache as a response
headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
resp.msg = msg
return data
return resp
def save(self, key, data):
data['headers'] = unicode(data['headers'])
self.cache[key] = pickle.dumps(data, 0)
def http_response(self, req, resp):
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
def cached_response(self, req, fallback=None):
data = self.load(req.get_full_url())
if resp.code == 304:
if data is not None:
# return the cache as a response
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
resp.msg = data['msg']
return resp
if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ()))
else:
return fallback
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private):
# kindly follow web servers indications
return resp
if resp.headers.get('Morss') == 'from_cache':
# it comes from cache, so no need to save it again
return resp
# save to disk
def save_response(self, req, resp):
data = resp.read()
self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
self.save(req.get_full_url(), {
'code': resp.code,
'msg': resp.msg,
'headers': resp.headers,
'data': data,
'timestamp': time.time()
})
fp = BytesIO(data)
old_resp = resp
@@ -438,107 +529,139 @@ class CacheHandler(BaseHandler):
return resp
def http_error_304(self, req, fp, code, msg, headers):
cache = list(self.load(req.get_full_url()))
def http_request(self, req):
data = self.load(req.get_full_url())
if cache[0]:
cache[-1] = time.time()
self.save(req.get_full_url(), *cache)
if data is not None:
if 'etag' in data['headers']:
req.add_unredirected_header('If-None-Match', data['headers']['etag'])
new = Request(req.get_full_url(),
headers=req.headers,
unverifiable=True)
if 'last-modified' in data['headers']:
req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
new.add_unredirected_header('Morss', 'from_304')
return req
return self.parent.open(new, timeout=req.timeout)
def http_open(self, req):
# Reminder of how/when this function is called by urllib2:
# If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response'
return None
# Here, we try to see whether we want to use data from cache (i.e.
# return 'resp'), or whether we want to refresh the content (return
# 'None')
data = self.load(req.get_full_url())
if data is not None:
# some info needed to process everything
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
cache_control += parse_http_list(data['headers'].get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - data['timestamp']
# list in a simple way what to do in special cases
if data is not None and 'private' in cc_list and self.privacy == 'public':
# private data but public cache, do not use cache
# privacy concern, so handled first and foremost
# (and doesn't need to be addressed anymore afterwards)
return None
elif self.policy == 'offline':
# use cache, or return an error
return self.cached_response(
req,
error_response(409, 'Conflict', req.get_full_url())
)
elif self.policy == 'cached':
# use cache, or fetch online
return self.cached_response(req, None)
elif self.policy == 'refresh':
# force refresh
return None
elif data is None:
# we have already settled all the cases that don't need the cache.
# all the following ones need the cached item
return None
elif self.force_max is not None and cache_age > self.force_max:
# older than we want, refresh
return None
elif self.force_min is not None and cache_age < self.force_min:
# recent enough, use cache
return self.cached_response(req)
elif data['code'] == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh)
return self.cached_response(req)
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
# kindly follow web servers' indications and refresh. If the same
# settings are used all along, this branch shouldn't ever be reached,
# since the page wouldn't have been cached in the first place; the
# check is only performed "just in case"
# NB. NOT respected if force_min is set
return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust it; if not, use the overrides), use cache
return self.cached_response(req)
else:
# according to the www, we have to refresh when nothing is said
return None
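
In practice the policy/force_min/force_max knobs above are passed down through adv_get (see the FeedFetch changes in morss.py further below); a hedged sketch:

    adv_get('https://morss.it/', policy='offline')    # cache only, 409 error if absent
    adv_get('https://morss.it/', policy='refresh')    # ignore cache, force re-download
    adv_get('https://morss.it/', force_min=5*60)      # treat cache as fresh for 5 minutes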
def http_response(self, req, resp):
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
# NB. It might re-save requests pulled from cache, which will re-set the time() to the latest, i.e. lengthen its useful life
if resp.code == 304 and resp.url in self.cache:
# we are hopefully the first after the HTTP handler, so no need
# to re-run all the *_response
# here: cached page, returning from cache
return self.cached_response(req)
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
# kindly follow web servers indications (do not save & return)
return resp
else:
# save
return self.save_response(req, resp)
else:
return self.save_response(req, resp)
https_request = http_request
https_open = http_open
https_response = http_response
class BaseCache:
def __contains__(self, url):
try:
self[url]
except KeyError:
return False
else:
return True
if 'IGNORE_SSL' in os.environ:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import sqlite3
if __name__ == '__main__':
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive:
print('>>> Interactive shell: try using `req`')
class SQLiteCache(BaseCache):
def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
def __del__(self):
self.con.close()
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
value = list(value)
value[3] = sqlite3.Binary(value[3]) # data
value = tuple(value)
if url in self:
with self.con:
self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
value + (url,))
else:
with self.con:
self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
import pymysql.cursors
class MySQLCacheHandler(BaseCache):
" NB. Requires mono-threading, as pymysql isn't thread-safe "
def __init__(self, user, password, database, host='localhost'):
self.con = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', autocommit=True)
with self.con.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
def __del__(self):
self.con.close()
def __getitem__(self, url):
cursor = self.con.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
row = cursor.fetchone()
if not row:
raise KeyError
return row[1:]
def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
if url in self:
with self.con.cursor() as cursor:
cursor.execute('UPDATE data SET code=%s, msg=%s, headers=%s, data=%s, timestamp=%s WHERE url=%s',
value + (url,))
else:
with self.con.cursor() as cursor:
cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s)', (url,) + value)
else:
print(req['data'].decode(req['encoding']))


@@ -73,7 +73,7 @@ item_updated = atom03:updated
mode = json
mimetype = application/json
timeformat = %Y-%m-%dT%H:%M:%SZ
timeformat = %Y-%m-%dT%H:%M:%S%z
base = {}
title = title
@@ -90,8 +90,11 @@ item_updated = updated
[html]
mode = html
path =
http://localhost/
title = //div[@id='header']/h1
desc = //div[@id='header']/h2
desc = //div[@id='header']/p
items = //div[@id='content']/div
item_title = ./a
@@ -99,7 +102,7 @@ item_link = ./a/@href
item_desc = ./div[class=desc]
item_content = ./div[class=content]
base = <!DOCTYPE html> <html> <head> <title>Feed reader by morss</title> <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" /> </head> <body> <div id="header"> <h1>@feed.title</h1> <h2>@feed.desc</h2> <p>- via morss</p> </div> <div id="content"> <div class="item"> <a class="title link" href="@item.link" target="_blank">@item.title</a> <div class="desc">@item.desc</div> <div class="content">@item.content</div> </div> </div> <script> var items = document.getElementsByClassName('item') for (var i in items) items[i].onclick = function() { this.classList.toggle('active') document.body.classList.toggle('noscroll') } </script> </body> </html>
base = file:sheet.xsl
[twitter]
mode = html


@@ -1,28 +0,0 @@
import re
import json
from . import crawler
try:
basestring
except NameError:
basestring = str
def pre_worker(url):
if url.startswith('http://itunes.apple.com/') or url.startswith('https://itunes.apple.com/'):
match = re.search('/id([0-9]+)(\?.*)?$', url)
if match:
iid = match.groups()[0]
redirect = 'https://itunes.apple.com/lookup?id=%s' % iid
try:
con = crawler.custom_handler(basic=True).open(redirect, timeout=4)
data = con.read()
except (IOError, HTTPException):
raise
return json.loads(data.decode('utf-8', 'replace'))['results'][0]['feedUrl']
return None


@@ -1,31 +1,46 @@
import sys
import os.path
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
from datetime import datetime
import re
import json
import csv
import json
import os.path
import re
import sys
from copy import deepcopy
from datetime import datetime
from fnmatch import fnmatch
from lxml import etree
from dateutil import tz
import dateutil.parser
from copy import deepcopy
import lxml.html
from dateutil import tz
from lxml import etree
from .readabilite import parse as html_parse
json.encoder.c_make_encoder = None
try:
# python 2
from StringIO import StringIO
from ConfigParser import RawConfigParser
from StringIO import StringIO
except ImportError:
# python 3
from io import StringIO
from configparser import RawConfigParser
from io import StringIO
try:
# python 2
@@ -45,62 +60,80 @@ def parse_rules(filename=None):
rules = dict([(x, dict(config.items(x))) for x in config.sections()])
for section in rules.keys():
# for each ruleset
for arg in rules[section].keys():
if '\n' in rules[section][arg]:
# for each rule
if rules[section][arg].startswith('file:'):
paths = [os.path.join(sys.prefix, 'share/morss/www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../www', rules[section][arg][5:]),
os.path.join(os.path.dirname(__file__), '../..', rules[section][arg][5:])]
for path in paths:
try:
file_raw = open(path).read()
file_clean = re.sub('<[/?]?(xsl|xml)[^>]+?>', '', file_raw)
rules[section][arg] = file_clean
except IOError:
pass
elif '\n' in rules[section][arg]:
rules[section][arg] = rules[section][arg].split('\n')[1:]
return rules
def parse(data, url=None, mimetype=None):
def parse(data, url=None, encoding=None, ruleset=None):
" Determine which ruleset to use "
rulesets = parse_rules()
if ruleset is not None:
rulesets = [ruleset]
else:
rulesets = parse_rules().values()
parsers = [FeedXML, FeedHTML, FeedJSON]
# 1) Look for a ruleset based on path
if url is not None:
for ruleset in rulesets.values():
for ruleset in rulesets:
if 'path' in ruleset:
for path in ruleset['path']:
if fnmatch(url, path):
parser = [x for x in parsers if x.mode == ruleset['mode']][0]
return parser(data, ruleset)
return parser(data, ruleset, encoding=encoding)
# 2) Look for a parser based on mimetype
if mimetype is not None:
parser_candidates = [x for x in parsers if mimetype in x.mimetype]
if mimetype is None or parser_candidates is None:
parser_candidates = parsers
# 2) Try each and every parser
# 3) Look for working ruleset for given parser
# 3a) See if parsing works
# 3b) See if .items matches anything
for parser in parser_candidates:
ruleset_candidates = [x for x in rulesets.values() if x['mode'] == parser.mode and 'path' not in x]
# 'path' as they should have been caught beforehand
for parser in parsers:
try:
feed = parser(data)
feed = parser(data, encoding=encoding)
except (ValueError):
except (ValueError, SyntaxError):
# parsing did not work
pass
else:
# parsing worked, now we try the rulesets
ruleset_candidates = [x for x in rulesets if x.get('mode', None) in (parser.mode, None) and 'path' not in x]
# 'path' as they should have been caught beforehand
# try anyway if no 'mode' specified
for ruleset in ruleset_candidates:
feed.rules = ruleset
try:
feed.items[0]
except (AttributeError, IndexError):
except (AttributeError, IndexError, TypeError):
# parsing and/or item picking did not work out
pass
@@ -112,7 +145,7 @@ def parse(data, url=None, mimetype=None):
class ParserBase(object):
def __init__(self, data=None, rules=None, parent=None):
def __init__(self, data=None, rules=None, parent=None, encoding=None):
if rules is None:
rules = parse_rules()[self.default_ruleset]
@@ -121,9 +154,10 @@ class ParserBase(object):
if data is None:
data = rules['base']
self.root = self.parse(data)
self.parent = parent
self.encoding = encoding
self.root = self.parse(data)
def parse(self, raw):
pass
@@ -148,15 +182,15 @@ class ParserBase(object):
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
row = [getattr(item, x) for x in item.dic]
if encoding != 'unicode':
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
c.writerow(row)
c.writerow([getattr(item, x) for x in item.dic])
out.seek(0)
return out.read()
out = out.read()
if encoding != 'unicode':
out = out.encode(encoding)
return out
def tohtml(self, **k):
return self.convert(FeedHTML).tostring(**k)
@@ -267,8 +301,15 @@ class ParserBase(object):
except AttributeError:
# does not exist, have to create it
self.rule_create(self.rules[rule_name])
self.rule_set(self.rules[rule_name], value)
try:
self.rule_create(self.rules[rule_name])
except AttributeError:
# no way to create it, give up
pass
else:
self.rule_set(self.rules[rule_name], value)
def rmv(self, rule_name):
# easy deleter
@@ -286,10 +327,7 @@ class ParserXML(ParserBase):
NSMAP = {'atom': 'http://www.w3.org/2005/Atom',
'atom03': 'http://purl.org/atom/ns#',
'media': 'http://search.yahoo.com/mrss/',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'slash': 'http://purl.org/rss/1.0/modules/slash/',
'dc': 'http://purl.org/dc/elements/1.1/',
'content': 'http://purl.org/rss/1.0/modules/content/',
'rssfake': 'http://purl.org/rss/1.0/'}
@@ -301,7 +339,7 @@ class ParserXML(ParserBase):
return self.root.getparent().remove(self.root)
def tostring(self, encoding='unicode', **k):
return etree.tostring(self.root, encoding=encoding, **k)
return etree.tostring(self.root, encoding=encoding, method='xml', **k)
def _rule_parse(self, rule):
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@@ -383,7 +421,8 @@ class ParserXML(ParserBase):
return
elif key is not None:
del x.attrib[key]
if key in match.attrib:
del match.attrib[key]
else:
match.getparent().remove(match)
@@ -401,13 +440,14 @@ class ParserXML(ParserBase):
else:
if html_rich:
# atom stuff
if 'atom' in rule:
match.attrib['type'] = 'xhtml'
self._clean_node(match)
match.append(lxml.html.fragment_fromstring(value, create_parent='div'))
match.find('div').drop_tag()
if self.rules['mode'] == 'html':
match.find('div').drop_tag() # not supported by lxml.etree
else: # i.e. if atom
match.attrib['type'] = 'xhtml'
else:
if match is not None and len(match):
@@ -419,7 +459,7 @@ class ParserXML(ParserBase):
def rule_str(self, rule):
match = self.rule_search(rule)
html_rich = ('atom' in rule or self.rules['mode'] == 'html') \
html_rich = ('atom' in rule or self.mode == 'html') \
and rule in [self.rules.get('item_desc'), self.rules.get('item_content')]
if isinstance(match, etree._Element):
@@ -440,11 +480,10 @@ class ParserHTML(ParserXML):
mimetype = ['text/html', 'application/xhtml+xml']
def parse(self, raw):
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
return etree.fromstring(raw, parser)
return html_parse(raw, encoding=self.encoding)
def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, encoding=encoding, **k)
return lxml.html.tostring(self.root, encoding=encoding, method='html', **k)
def rule_search_all(self, rule):
try:
@@ -467,26 +506,36 @@ class ParserHTML(ParserXML):
element = deepcopy(match)
match.getparent().append(element)
else:
raise AttributeError('no way to create item')
def parse_time(value):
# parsing per se
if value is None or value == 0:
return None
time = None
elif isinstance(value, basestring):
if re.match(r'^[0-9]+$', value):
return datetime.fromtimestamp(int(value), tz.UTC)
time = datetime.fromtimestamp(int(value))
else:
return dateutil.parser.parse(value)
time = dateutil.parser.parse(value)
elif isinstance(value, int):
return datetime.fromtimestamp(value, tz.UTC)
time = datetime.fromtimestamp(value)
elif isinstance(value, datetime):
return value
time = value
else:
return None
time = None
# add default time zone if none set
if time is not None and time.tzinfo is None:
time = time.replace(tzinfo=tz.tzutc())
return time
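
A few illustrative calls to parse_time above (assumed inputs):

    parse_time('2021-11-24T21:40:46Z')   # ISO string, parsed by dateutil, keeps its tz
    parse_time(1500000000)               # int timestamp; tz.tzutc() is attached since
                                         # datetime.fromtimestamp() returns a naive value
    parse_time(None)                     # -> None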
class ParserJSON(ParserBase):
@@ -587,34 +636,41 @@ class ParserJSON(ParserBase):
return out.replace('\n', '<br/>') if out else out
class Uniq(object):
_map = {}
_id = None
def wrap_uniq(wrapper_fn_name):
" Wraps the output of the function with the specified function "
# This is called when parsing "wrap_uniq('wrap_item')"
def __new__(cls, *args, **kwargs):
# check if a wrapper was already created for it
# if so, reuse it
# if not, create a new one
# note that the item itself (the tree node) is created beforehand
def decorator(func):
# This is called when parsing "@wrap_uniq('wrap_item')"
tmp_id = cls._gen_id(*args, **kwargs)
if tmp_id in cls._map:
return cls._map[tmp_id]
def wrapped_func(self, *args, **kwargs):
# This is called when the wrapped function is called
else:
obj = object.__new__(cls) #, *args, **kwargs)
cls._map[tmp_id] = obj
return obj
output = func(self, *args, **kwargs)
output_id = id(output)
try:
return self._map[output_id]
except (KeyError, AttributeError):
if not hasattr(self, '_map'):
self._map = {}
wrapper_fn = getattr(self, wrapper_fn_name)
obj = wrapper_fn(output)
self._map[output_id] = obj
return obj
return wrapped_func
return decorator
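
A minimal sketch of how the wrap_uniq decorator above is meant to be used (hypothetical class, not from the codebase):

    class Demo(object):
        def wrap_item(self, raw):
            return ('wrapped', raw)

        @wrap_uniq('wrap_item')
        def get_item(self, raw):
            return raw

    d = Demo()
    raw = object()
    assert d.get_item(raw) is d.get_item(raw)   # the wrapper object is reused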
class Feed(object):
itemsClass = 'Item'
itemsClass = property(lambda x: Item) # because Item is defined below, i.e. afterwards
dic = ('title', 'desc', 'items')
def wrap_items(self, items):
itemsClass = globals()[self.itemsClass]
return [itemsClass(x, self.rules, self) for x in items]
title = property(
lambda f: f.get('title'),
lambda f,x: f.set('title', x),
@@ -630,10 +686,7 @@ class Feed(object):
self.rule_create(self.rules['items'])
item = self.items[-1]
if new is None:
return
for attr in globals()[self.itemsClass].dic:
for attr in self.itemsClass.dic:
try:
setattr(item, attr, getattr(new, attr))
@@ -644,8 +697,14 @@ class Feed(object):
except (IndexError, TypeError):
pass
return item
def wrap_item(self, item):
return self.itemsClass(item, self.rules, self)
@wrap_uniq('wrap_item')
def __getitem__(self, key):
return self.wrap_items(self.get_raw('items'))[key]
return self.get_raw('items')[key]
def __delitem__(self, key):
self[key].remove()
@@ -654,7 +713,7 @@ class Feed(object):
return len(self.get_raw('items'))
class Item(Uniq):
class Item(object):
dic = ('title', 'link', 'desc', 'content', 'time', 'updated')
def __init__(self, xml=None, rules=None, parent=None):
@@ -693,32 +752,45 @@ class Item(Uniq):
lambda f: f.rmv('item_updated') )
class FeedXML(Feed, ParserXML):
itemsClass = 'ItemXML'
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
if self.root.getprevious() is None:
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
class ItemXML(Item, ParserXML):
pass
class FeedHTML(Feed, ParserHTML):
itemsClass = 'ItemHTML'
class FeedXML(Feed, ParserXML):
itemsClass = ItemXML
def root_siblings(self):
out = []
current = self.root.getprevious()
while current is not None:
out.append(current)
current = current.getprevious()
return out
def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion
# and to add stylesheet
stylesheets = [x for x in self.root_siblings() if isinstance(x, etree.PIBase) and x.target == 'xml-stylesheet']
for stylesheet in stylesheets:
# remove all stylesheets present (be that ours or others')
self.root.append(stylesheet) # needed as we can't delete root siblings https://stackoverflow.com/a/60232366
self.root.remove(stylesheet)
self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"'))
return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k)
class ItemHTML(Item, ParserHTML):
pass
class FeedJSON(Feed, ParserJSON):
itemsClass = 'ItemJSON'
class FeedHTML(Feed, ParserHTML):
itemsClass = ItemHTML
class ItemJSON(Item, ParserJSON):
@@ -732,3 +804,20 @@ class ItemJSON(Item, ParserJSON):
return
cur = cur[node]
class FeedJSON(Feed, ParserJSON):
itemsClass = ItemJSON
if __name__ == '__main__':
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
if sys.flags.interactive:
print('>>> Interactive shell: try using `feed`')
else:
for item in feed.items:
print(item.title, item.link)


@@ -1,78 +1,71 @@
import sys
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import os.path
import time
import threading
from fnmatch import fnmatch
import re
import time
from datetime import datetime
from fnmatch import fnmatch
import lxml.etree
import lxml.html
from dateutil import tz
from . import feeds
from . import feedify
from . import crawler
from . import readabilite
import wsgiref.simple_server
import wsgiref.handlers
from . import caching, crawler, feeds, readabilite
try:
# python 2
from Queue import Queue
from httplib import HTTPException
from urllib import quote_plus
from urlparse import urlparse, urljoin, parse_qs
from urlparse import parse_qs, urljoin, urlparse
except ImportError:
# python 3
from queue import Queue
from http.client import HTTPException
from urllib.parse import quote_plus
from urllib.parse import urlparse, urljoin, parse_qs
LIM_ITEM = 100 # deletes what's beyond
LIM_TIME = 7 # deletes what's after
MAX_ITEM = 50 # cache-only beyond
MAX_TIME = 7 # cache-only after (in sec)
DELAY = 10 * 60 # xml cache & ETag cache (in sec)
TIMEOUT = 4 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
PORT = 8080
PROTOCOL = ['http', 'https', 'ftp']
from urllib.parse import parse_qs, urljoin, urlparse
def filterOptions(options):
return options
MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
MAX_TIME = int(os.getenv('MAX_TIME', 2)) # cache-only after (in sec)
# example of filtering code below
LIM_ITEM = int(os.getenv('LIM_ITEM', 10)) # deletes what's beyond
LIM_TIME = int(os.getenv('LIM_TIME', 2.5)) # deletes what's after
#allowed = ['proxy', 'clip', 'keep', 'cache', 'force', 'silent', 'pro', 'debug']
#filtered = dict([(key,value) for (key,value) in options.items() if key in allowed])
#return filtered
DELAY = int(os.getenv('DELAY', 10 * 60)) # xml cache & ETag cache (in sec)
TIMEOUT = int(os.getenv('TIMEOUT', 4)) # http timeout (in sec)
class MorssException(Exception):
pass
def log(txt, force=False):
if DEBUG or force:
def log(txt):
if 'DEBUG' in os.environ:
if 'REQUEST_URI' in os.environ:
# when running on Apache
open('morss.log', 'a').write("%s\n" % repr(txt))
else:
# when using internal server or cli
print(repr(txt))
def len_html(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content())
else:
return 0
@@ -80,6 +73,7 @@ def len_html(txt):
def count_words(txt):
if len(txt):
return len(lxml.html.fromstring(txt).text_content().split())
return 0
@@ -88,14 +82,16 @@ class Options:
if len(args):
self.options = args
self.options.update(options or {})
else:
self.options = options or {}
def __getattr__(self, key):
def __getattr__(self, key, default=None):
if key in self.options:
return self.options[key]
else:
return False
return default
def __setitem__(self, key, value):
self.options[key] = value
@@ -103,29 +99,14 @@ class Options:
def __contains__(self, key):
return key in self.options
def parseOptions(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
if split[0].lower() == 'true':
out[split[0]] = True
elif split[0].lower() == 'false':
out[split[0]] = False
else:
out[split[0]] = split[1]
else:
out[split[0]] = True
return out
get = __getitem__ = __getattr__
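
How the Options wrapper above behaves (keys are examples):

    options = Options({'format': 'json', 'indent': True})
    options.format        # -> 'json'
    options.missing_key   # -> None rather than raising AttributeError
    'indent' in options   # -> True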
def ItemFix(item, feedurl='/'):
def ItemFix(item, options, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
if len(item.title) > 20 and item.title.isupper():
if item.title is not None and len(item.title) > 20 and item.title.isupper():
item.title = item.title.title()
# check if it includes link
@@ -140,6 +121,13 @@ def ItemFix(item, feedurl='/'):
item.link = match[0]
log(item.link)
# at user's election, use first <a>
if options.firstlink and (item.desc or item.content):
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
if len(match):
item.link = match[0]
log(item.link)
# check relative urls
item.link = urljoin(feedurl, item.link)
@@ -158,6 +146,11 @@ def ItemFix(item, feedurl='/'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# pocket
if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'):
item.link = parse_qs(urlparse(item.link).query)['url'][0]
log(item.link)
# facebook
if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
item.link = parse_qs(urlparse(item.link).query)['u'][0]
@@ -183,7 +176,7 @@ def ItemFix(item, feedurl='/'):
# reddit
if urlparse(feedurl).netloc == 'www.reddit.com':
match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
match = lxml.html.fromstring(item.content).xpath('//a[text()="[link]"]/@href')
if len(match):
item.link = match[0]
log(item.link)
@@ -196,59 +189,46 @@ def ItemFill(item, options, feedurl='/', fast=False):
if not item.link:
log('no link')
return item
return True
log(item.link)
link = item.link
# twitter
if urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# download
delay = -1
if fast:
# super-fast mode
delay = -2
if fast or options.cache:
# force cache, don't fetch
policy = 'offline'
elif options.force:
# force refresh
policy = 'refresh'
else:
policy = None
try:
con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
data = con.read()
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
return False # let's just delete errors stuff when in cache mode
contenttype = con.info().get('Content-Type', '').split(';')[0]
if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
log('non-text page')
return True
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
if not req['data']:
log('empty page')
return True
out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode', xpath=options.xpath)
if out is not None:
item.content = out
if options.resolve:
item.link = req['url']
return True
@@ -265,10 +245,7 @@ def ItemBefore(item, options):
def ItemAfter(item, options):
if options.clip and item.desc and item.content:
item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
del item.desc
if not options.keep and not options.proxy:
item.content = item.desc + "<br/><br/><hr/><br/><br/>" + item.content
del item.desc
if options.nolink and item.content:
@@ -276,7 +253,7 @@ def ItemAfter(item, options):
for link in content.xpath('//a'):
log(link.text_content())
link.drop_tag()
item.content = lxml.etree.tostring(content)
item.content = lxml.etree.tostring(content, method='html')
if options.noref:
item.link = ''
@@ -285,71 +262,57 @@ def ItemAfter(item, options):
def FeedFetch(url, options):
# basic url clean-up
if url is None:
raise MorssException('No url provided')
if urlparse(url).scheme not in PROTOCOL:
url = 'http://' + url
log(url)
url = url.replace(' ', '%20')
if isinstance(url, bytes):
url = url.decode()
# allow for code execution for feedify
pre = feedify.pre_worker(url)
if pre:
url = pre
log('url redirect')
log(url)
# fetch feed
delay = DELAY
if options.theforce:
delay = 0
if options.cache:
policy = 'offline'
elif options.force:
policy = 'refresh'
else:
policy = None
try:
con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
encoding=options.encoding, basic=not options.items) \
.open(url, timeout=TIMEOUT * 2)
xml = con.read()
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT * 2)
except (IOError, HTTPException):
raise MorssException('Error downloading feed')
contenttype = con.info().get('Content-Type', '').split(';')[0]
if options.items:
# using custom rules
rss = feeds.FeedHTML(xml, url, contenttype)
feed.rule
ruleset = {}
rss.rules['items'] = options.items
ruleset['items'] = options.items
ruleset['title'] = options.get('title', '//head/title')
ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')
ruleset['item_title'] = options.get('item_title', '.')
ruleset['item_link'] = options.get('item_link', './@href|.//a/@href|ancestor::a/@href')
if options.item_title:
rss.rules['item_title'] = options.item_title
if options.item_link:
rss.rules['item_link'] = options.item_link
if options.item_content:
rss.rules['item_content'] = options.item_content
ruleset['item_content'] = options.item_content
if options.item_time:
rss.rules['item_time'] = options.item_time
ruleset['item_time'] = options.item_time
rss = feeds.parse(req['data'], encoding=req['encoding'], ruleset=ruleset)
rss = rss.convert(feeds.FeedXML)
else:
try:
rss = feeds.parse(xml, url, contenttype)
rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
rss = rss.convert(feeds.FeedXML)
# contains all fields, otherwise much-needed data can be lost
except TypeError:
log('random page')
log(contenttype)
log(req['contenttype'])
raise MorssException('Link provided is not a valid feed')
return rss
return req['url'], rss
def FeedGather(rss, url, options):
@@ -361,42 +324,39 @@ def FeedGather(rss, url, options):
lim_time = LIM_TIME
max_item = MAX_ITEM
max_time = MAX_TIME
threads = THREADS
if options.cache:
max_time = 0
if options.mono:
threads = 1
if options.newest:
# :newest take the newest items (instead of appearing order)
now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
# set
def runner(queue):
while True:
value = queue.get()
try:
worker(*value)
except Exception as e:
log('Thread Error: %s' % e.message)
queue.task_done()
else:
# default behavior, take the first items (in appearing order)
sorted_items = list(rss.items)
def worker(i, item):
for i, item in enumerate(sorted_items):
# hard cap
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped')
item.remove()
return
continue
item = ItemBefore(item, options)
if item is None:
return
continue
item = ItemFix(item, url)
item = ItemFix(item, options, url)
# soft cap
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:
if ItemFill(item, options, url, True) is False:
item.remove()
return
continue
else:
if not options.proxy:
@@ -404,22 +364,6 @@ def FeedGather(rss, url, options):
item = ItemAfter(item, options)
queue = Queue()
for i in range(threads):
t = threading.Thread(target=runner, args=(queue,))
t.daemon = True
t.start()
for i, item in enumerate(list(rss.items)):
if threads == 1:
worker(*[i, item])
else:
queue.put([i, item])
if threads != 1:
queue.join()
if options.ad:
new = rss.items.append()
new.title = "Are you hungry?"
@@ -433,37 +377,38 @@ def FeedGather(rss, url, options):
return rss
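The rewritten loop above drops the thread/queue machinery and applies two tiers of limits: a hard cap (lim_item / lim_time: drop the item outright) and a soft cap (max_item / max_time: keep the item but skip the expensive full-text fetch unless :proxy). A minimal standalone sketch of that chained-comparison logic, with made-up limit values:
import time

def cap_decision(i, start_time, lim_item=-1, lim_time=-1, max_item=-1, max_time=-1):
    # hard cap: drop the item entirely (a limit of -1 means "disabled")
    if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
        return 'drop'
    # soft cap: keep the item but skip the full-text fetch
    if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
        return 'keep-light'
    return 'process'

print(cap_decision(0, time.time()))                 # 'process'
print(cap_decision(50, time.time(), lim_item=10))   # 'drop'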
def FeedFormat(rss, options):
def FeedFormat(rss, options, encoding='utf-8'):
if options.callback:
if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None:
return '%s(%s)' % (options.callback, rss.tojson())
out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode'))
return out if encoding == 'unicode' else out.encode(encoding)
else:
raise MorssException('Invalid callback var name')
elif options.json:
elif options.format == 'json':
if options.indent:
return rss.tojson(encoding='UTF-8', indent=4)
return rss.tojson(encoding=encoding, indent=4)
else:
return rss.tojson(encoding='UTF-8')
return rss.tojson(encoding=encoding)
elif options.csv:
return rss.tocsv(encoding='UTF-8')
elif options.format == 'csv':
return rss.tocsv(encoding=encoding)
elif options.reader:
elif options.format == 'html':
if options.indent:
return rss.tohtml(encoding='UTF-8', pretty_print=True)
return rss.tohtml(encoding=encoding, pretty_print=True)
else:
return rss.tohtml(encoding='UTF-8')
return rss.tohtml(encoding=encoding)
else:
else: # i.e. format == 'rss'
if options.indent:
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True)
else:
return rss.torss(xml_declaration=True, encoding='UTF-8')
return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding)
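For what it's worth, the new xml_declaration=(not encoding == 'unicode') guard matches how lxml serialisation behaves: encoding='unicode' yields str (and lxml rejects an XML declaration in that mode), while a byte encoding yields bytes. A standalone illustration with plain lxml, not the feeds API:
import lxml.etree

root = lxml.etree.Element('rss')
# str output when encoding='unicode' (no XML declaration allowed there)...
print(lxml.etree.tostring(root, encoding='unicode'))
# '<rss/>'
# ...bytes output otherwise, optionally with a declaration
print(lxml.etree.tostring(root, encoding='UTF-8', xml_declaration=True))
# b"<?xml version='1.0' encoding='UTF-8'?>\n<rss/>"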
def process(url, cache=None, options=None):
@@ -473,189 +418,9 @@ def process(url, cache=None, options=None):
options = Options(options)
if cache:
crawler.default_cache = crawler.SQLiteCache(cache)
caching.default_cache = caching.SQLiteCache(cache)
rss = FeedFetch(url, options)
url, rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
return FeedFormat(rss, options)
def cgi_app(environ, start_response):
# get options
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
split = url.split('/', 1)
options = split[0].replace('|', '/').replace('\\\'', '\'').split(':')[1:]
if len(split) > 1:
url = split[1]
else:
url = ''
else:
options = []
# init
options = Options(filterOptions(parseOptions(options)))
headers = {}
global DEBUG
DEBUG = options.debug
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.html or options.reader:
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.json:
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.csv:
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
# get the work done
rss = FeedFetch(url, options)
if headers['content-type'] == 'text/xml':
headers['content-type'] = rss.mimetype[0]
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
return out
def cgi_wrapper(environ, start_response):
# simple http server for html and css
files = {
'': 'text/html',
'index.html': 'text/html'}
if 'REQUEST_URI' in environ:
url = environ['REQUEST_URI'][1:]
else:
url = environ['PATH_INFO'][1:]
if url in files:
headers = {}
if url == '':
url = 'index.html'
if '--root' in sys.argv[1:]:
path = os.path.join(sys.argv[-1], url)
else:
path = url
try:
body = open(path, 'rb').read()
headers['status'] = '200 OK'
headers['content-type'] = files[url]
start_response(headers['status'], list(headers.items()))
return [body]
except IOError:
headers['status'] = '404 Not found'
start_response(headers['status'], list(headers.items()))
return ['Error %s' % headers['status']]
# actual morss use
try:
return [cgi_app(environ, start_response) or '(empty)']
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/plain'}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR <%s>: %s' % (url, e.message), force=True)
return ['An error happened:\n%s' % e.message]
def cli_app():
options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
url = sys.argv[-1]
global DEBUG
DEBUG = options.debug
crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if not options.silent:
print(out.decode('utf-8', 'replace') if isinstance(out, bytes) else out)
log('done')
def isInt(string):
try:
int(string)
return True
except ValueError:
return False
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi
wsgiref.handlers.CGIHandler().run(cgi_wrapper)
elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]:
# start internal (basic) http server
if len(sys.argv) > 1 and isInt(sys.argv[1]):
argPort = int(sys.argv[1])
if argPort > 0:
port = argPort
else:
raise MorssException('Port must be positive integer')
else:
port = PORT
print('Serving http://localhost:%s/'%port)
httpd = wsgiref.simple_server.make_server('', port, cgi_wrapper)
httpd.serve_forever()
else:
# as a CLI app
try:
cli_app()
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
print('ERROR: %s' % e.message)
if __name__ == '__main__':
main()
return FeedFormat(rss, options, 'unicode')

View File

@@ -1,13 +1,35 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import lxml.etree
import lxml.html
import re
from bs4 import BeautifulSoup
def parse(data, encoding=None):
if encoding:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
else:
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser)
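The reworked parse() round-trips the markup through BeautifulSoup, so whatever encoding the page originally used, lxml is always handed UTF-8 bytes. A self-contained sketch of that pipeline with the same two libraries (the sample HTML is made up):
import lxml.html
from bs4 import BeautifulSoup

raw = u'<html><body><p>Café &amp; thé</p></body></html>'.encode('latin-1')

# BeautifulSoup decodes with the supplied (or detected) encoding and
# re-serialises everything as UTF-8...
normalised = BeautifulSoup(raw, 'lxml', from_encoding='latin-1').prettify('utf-8')

# ...so the lxml parser can always be told the input is UTF-8
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
print(lxml.html.fromstring(normalised, parser=parser).xpath('string(//p)').strip())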
@@ -22,6 +44,8 @@ def count_words(string):
if string is None:
return 0
string = string.strip()
i = 0
count = 0
@@ -60,9 +84,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
regex_good = re.compile('|'.join(class_good), re.I)
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
'button', 'footer']
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
'form', 'input', 'textarea', 'button', 'footer']
tags_bad = tags_junk + ['a', 'aside']
@@ -90,13 +115,24 @@ def score_node(node):
" Score individual node "
score = 0
class_id = node.get('class', '') + node.get('id', '')
class_id = (node.get('class') or '') + (node.get('id') or '')
if (isinstance(node, lxml.html.HtmlComment)
or node.tag in tags_bad
or regex_bad.search(class_id)):
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
return 0
if node.tag in tags_dangerous:
return 0
if node.tag in tags_junk:
score += -1 # effectively -2, since tags_junk is included in tags_bad
if node.tag in tags_bad:
score += -1
if regex_bad.search(class_id):
score += -1
if node.tag in tags_good:
score += 4
@@ -109,38 +145,47 @@ def score_node(node):
if wc != 0:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
score = score * ( 1 - float(wca)/wc )
score = score * ( 1 - 2 * float(wca)/wc )
return score
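The link-density penalty is now twice as aggressive; plain arithmetic makes the change visible (the wc/wca values below are made up):
wc  = 100   # words in the node
wca = 30    # of which sit inside <a> links
old_factor = 1 - float(wca) / wc        # 0.7  (previous formula)
new_factor = 1 - 2 * float(wca) / wc    # 0.4, and negative once links hold over half the words
print(old_factor, new_factor)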
def score_all(node, grades=None):
def score_all(node):
" Fairly dumb loop to score all worthwhile nodes. Tries to be fast "
if grades is None:
grades = {}
for child in node:
score = score_node(child)
child.attrib['seen'] = 'yes, ' + str(int(score))
child.attrib['morss_own_score'] = str(float(score))
if score > 0:
spread_score(child, score, grades)
score_all(child, grades)
return grades
if score > 0 or len(list(child.iterancestors())) <= 2:
spread_score(child, score)
score_all(child)
def spread_score(node, score, grades):
def set_score(node, value):
node.attrib['morss_score'] = str(float(value))
def get_score(node):
return float(node.attrib.get('morss_score', 0))
def incr_score(node, delta):
set_score(node, get_score(node) + delta)
def get_all_scores(node):
return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0}
def spread_score(node, score):
" Spread the node's score to its parents, on a linear way "
delta = score / 2
for ancestor in [node,] + list(node.iterancestors()):
if score >= 1 or ancestor is node:
try:
grades[ancestor] += score
except KeyError:
grades[ancestor] = score
incr_score(ancestor, score)
score -= delta
@@ -148,26 +193,29 @@ def spread_score(node, score, grades):
break
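spread_score now writes scores onto the nodes themselves (incr_score) instead of into a grades dict, but the decay is unchanged: every step up the ancestor chain hands out the remaining score, which shrinks by half of the original each time, and the walk stops once less than 1 is left. A tiny arithmetic-only sketch:
def spread_sketch(score):
    # the node itself always receives its full score; each ancestor above gets
    # `delta` (half the original) less, until less than 1 remains
    delta = score / 2.0
    shares = []
    is_node = True
    while score >= 1 or is_node:
        shares.append(score)
        score -= delta
        is_node = False
    return shares

print(spread_sketch(8))   # [8, 4.0]: the parent gets half, higher ancestors nothing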
def write_score_all(root, grades):
" Useful for debugging "
for node in root.iter():
node.attrib['score'] = str(int(grades.get(node, 0)))
def clean_root(root):
def clean_root(root, keep_threshold=None):
for node in list(root):
clean_root(node)
clean_node(node)
# bottom-up approach, i.e. starting with children before cleaning current node
clean_root(node, keep_threshold)
clean_node(node, keep_threshold)
def clean_node(node):
def clean_node(node, keep_threshold=None):
parent = node.getparent()
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
# remove dangerous tags, no matter what
if node.tag in tags_dangerous:
parent.remove(node)
return
# high score, so keep
if keep_threshold is not None and keep_threshold > 0 and get_score(node) >= keep_threshold:
return
gdparent = parent.getparent()
# remove shitty tags
@@ -248,59 +296,95 @@ def clean_node(node):
gdparent.insert(gdparent.index(parent)+1, new_node)
def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
ancestorsA = list(nodeA.iterancestors())
ancestorsB = list(nodeB.iterancestors())
def lowest_common_ancestor(node_a, node_b, max_depth=None):
ancestors_a = list(node_a.iterancestors())
ancestors_b = list(node_b.iterancestors())
if max_depth is not None:
ancestorsA = ancestorsA[:max_depth]
ancestorsB = ancestorsB[:max_depth]
ancestors_a = ancestors_a[:max_depth]
ancestors_b = ancestors_b[:max_depth]
ancestorsA.insert(0, nodeA)
ancestorsB.insert(0, nodeB)
ancestors_a.insert(0, node_a)
ancestors_b.insert(0, node_b)
for ancestorA in ancestorsA:
if ancestorA in ancestorsB:
return ancestorA
for ancestor_a in ancestors_a:
if ancestor_a in ancestors_b:
return ancestor_a
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
return node_a # should always find one tho, at least <html/>, but needed for max_depth
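lowest_common_ancestor simply walks both ancestor chains (node included) and returns the first shared element; a self-contained toy example with lxml:
import lxml.html

# the two "best-scored" nodes here share <div id="main">
doc = lxml.html.fromstring('<div id="main"><p id="a">one</p><p id="b">two</p></div>')
a, b = doc.get_element_by_id('a'), doc.get_element_by_id('b')

ancestors_a = [a] + list(a.iterancestors())
ancestors_b = [b] + list(b.iterancestors())
lca = next(x for x in ancestors_a if x in ancestors_b)
print(lca.get('id'))   # 'main'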
def rank_nodes(grades):
return sorted(grades.items(), key=lambda x: x[1], reverse=True)
def get_best_node(html, threshold=5):
# score all nodes
score_all(html)
# rank all nodes (largest to smallest)
ranked_nodes = sorted(html.iter(), key=lambda x: get_score(x), reverse=True)
def get_best_node(grades):
" To pick the best (raw) node. Another function will clean it "
if len(grades) == 1:
return grades[0]
top = rank_nodes(grades)
lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)
return lowest
def get_article(data, url=None, encoding=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
scores = score_all(html)
if not len(scores):
# minimum threshold
if not len(ranked_nodes) or get_score(ranked_nodes[0]) < threshold:
return None
best = get_best_node(scores)
# take common ancestor or the two highest rated nodes
if len(ranked_nodes) > 1:
best = lowest_common_ancestor(ranked_nodes[0], ranked_nodes[1], 3)
else:
best = ranked_nodes[0]
return best
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5, xpath=None):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding_in)
if xpath is not None:
xpath_match = html.xpath(xpath)
if len(xpath_match):
best = xpath_match[0]
else:
best = get_best_node(html, threshold)
else:
best = get_best_node(html, threshold)
if best is None:
# if threshold not met
return None
# clean up
if not debug:
keep_threshold = get_score(best) * 3/4
clean_root(best, keep_threshold)
# check for spammy content (links only)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3:
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None
# fix urls
if url:
best.make_links_absolute(url)
clean_root(best)
return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out)
return lxml.etree.tostring(best, pretty_print=True)
if __name__ == '__main__':
import sys
from . import crawler
req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
if sys.flags.interactive:
print('>>> Interactive shell: try using `article`')
else:
print(article)

View File

@@ -1,210 +0,0 @@
@require(feed)
<!DOCTYPE html>
<html>
<head>
<title>@feed.title &#8211; via morss</title>
<meta charset="UTF-8" />
<meta name="description" content="@feed.desc (via morss)" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<style type="text/css">
/* columns - from https://thisisdallas.github.io/Simple-Grid/simpleGrid.css */
* {
box-sizing: border-box;
}
#content {
width: 100%;
max-width: 1140px;
min-width: 755px;
margin: 0 auto;
overflow: hidden;
padding-top: 20px;
padding-left: 20px; /* grid-space to left */
padding-right: 0px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-20px=0 */
}
.item {
width: 33.33%;
float: left;
padding-right: 20px; /* column-space */
}
@@media handheld, only screen and (max-width: 767px) { /* @@ to escape from the template engine */
#content {
width: 100%;
min-width: 0;
margin-left: 0px;
margin-right: 0px;
padding-left: 20px; /* grid-space to left */
padding-right: 10px; /* grid-space to right: (grid-space-left - column-space) e.g. 20px-10px=10px */
}
.item {
width: auto;
float: none;
margin-left: 0px;
margin-right: 0px;
margin-top: 10px;
margin-bottom: 10px;
padding-left: 0px;
padding-right: 10px; /* column-space */
}
}
/* design */
#header h1, #header h2, #header p {
font-family: sans;
text-align: center;
margin: 0;
padding: 0;
}
#header h1 {
font-size: 2.5em;
font-weight: bold;
padding: 1em 0 0.25em;
}
#header h2 {
font-size: 1em;
font-weight: normal;
}
#header p {
color: gray;
font-style: italic;
font-size: 0.75em;
}
#content {
text-align: justify;
}
.item .title {
font-weight: bold;
display: block;
text-align: center;
}
.item .link {
color: inherit;
text-decoration: none;
}
.item:not(.active) {
cursor: pointer;
height: 20em;
margin-bottom: 20px;
overflow: hidden;
text-overflow: ellipsis;
padding: 0.25em;
position: relative;
}
.item:not(.active) .title {
padding-bottom: 0.1em;
margin-bottom: 0.1em;
border-bottom: 1px solid silver;
}
.item:not(.active):before {
content: " ";
display: block;
width: 100%;
position: absolute;
top: 18.5em;
height: 1.5em;
background: linear-gradient(to bottom, rgba(255,255,255,0) 0%, rgba(255,255,255,1) 100%);
}
.item:not(.active) .article * {
max-width: 100%;
font-size: 1em !important;
font-weight: normal;
display: inline;
margin: 0;
}
.item.active {
background: white;
position: fixed;
overflow: auto;
top: 0;
left: 0;
height: 100%;
width: 100%;
z-index: 1;
}
body.noscroll {
overflow: hidden;
}
.item.active > * {
max-width: 700px;
margin: auto;
}
.item.active .title {
font-size: 2em;
padding: 0.5em 0;
}
.item.active .article object,
.item.active .article video,
.item.active .article audio {
display: none;
}
.item.active .article img {
max-height: 20em;
max-width: 100%;
}
</style>
</head>
<body>
<div id="header">
<h1>@feed.title</h1>
@if feed.desc:
<h2>@feed.desc</h2>
@end
<p>- via morss</p>
</div>
<div id="content">
@for item in feed.items:
<div class="item">
@if item.link:
<a class="title link" href="@item.link" target="_blank">@item.title</a>
@else:
<span class="title">@item.title</span>
@end
<div class="article">
@if item.content:
@item.content
@else:
@item.desc
@end
</div>
</div>
@end
</div>
<script>
var items = document.getElementsByClassName('item')
for (var i in items)
items[i].onclick = function()
{
this.classList.toggle('active')
document.body.classList.toggle('noscroll')
}
</script>
</body>
</html>

morss/wsgi.py Normal file (298 lines added)
View File

@@ -0,0 +1,298 @@
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import cgitb
import mimetypes
import os.path
import re
import sys
import wsgiref.handlers
import wsgiref.simple_server
import wsgiref.util
import lxml.etree
try:
# python 2
from urllib import unquote
except ImportError:
# python 3
from urllib.parse import unquote
from . import caching, crawler, readabilite
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
MorssException, Options, log)
PORT = int(os.getenv('PORT', 8000))
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
else:
out[split[0]] = True
return out
def request_uri(environ):
if 'REQUEST_URI' in environ:
# when running on Apache/uwsgi
url = environ['REQUEST_URI']
elif 'RAW_URI' in environ:
# gunicorn
url = environ['RAW_URI']
else:
# when using other servers
url = environ['PATH_INFO']
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
return url
def cgi_parse_environ(environ):
# get options
url = request_uri(environ)[1:]
url = re.sub(r'^(cgi/)?(morss.py|main.py)/', '', url)
if url.startswith(':'):
parts = url.split('/', 1)
raw_options = parts[0].split(':')[1:]
url = parts[1] if len(parts) > 1 else ''
else:
raw_options = []
# init
options = Options(parse_options(raw_options))
return (url, options)
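Together, parse_options and cgi_parse_environ turn a request path such as /:format=json:cors/https://example.com/feed.xml into a feed URL plus an options dict. A standalone sketch of that round trip (the Options wrapper and the cgi/main.py prefix stripping are left out):
try:
    from urllib.parse import unquote   # python 3
except ImportError:
    from urllib import unquote         # python 2

def sketch_parse(path):
    path = path.lstrip('/')
    if path.startswith(':'):
        parts = path.split('/', 1)
        raw_options = parts[0].split(':')[1:]
        url = parts[1] if len(parts) > 1 else ''
    else:
        raw_options, url = [], path

    options = {}
    for option in raw_options:
        key, sep, value = option.partition('=')
        options[key] = unquote(value).replace('|', '/') if sep else True

    return url, options

print(sketch_parse('/:format=json:cors/https://example.com/feed.xml'))
# ('https://example.com/feed.xml', {'format': 'json', 'cors': True})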
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
headers['x-content-type-options'] = 'nosniff' # safari work around
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.format == 'html':
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.format == 'json':
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.format == 'csv':
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
else:
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
# get the work done
url, rss = FeedFetch(url, options)
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
else:
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
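middleware is a small higher-order helper: decorate a func(environ, start_response, app) and you get back a wrapper you can apply to any WSGI app. A toy, self-contained usage sketch (lowercase_body and inner_app are made up for illustration):
def middleware(func):
    def app_builder(app):
        def app_wrap(environ, start_response):
            return func(environ, start_response, app)
        return app_wrap
    return app_builder

@middleware
def lowercase_body(environ, start_response, app):
    # toy middleware: pass the request through, lowercase the response chunks
    return [chunk.lower() for chunk in app(environ, start_response)]

def inner_app(environ, start_response):
    start_response('200 OK', [('content-type', 'text/plain')])
    return [b'HELLO']

application = lowercase_body(inner_app)   # same chaining style as used below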
@middleware
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
url = request_uri(environ)[1:]
if url == '':
url = 'index.html'
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths)
paths = [
os.path.join(sys.prefix, 'share/morss/www', url),
os.path.join(os.path.dirname(__file__), '../www', url)
]
for path in paths:
try:
f = open(path, 'rb')
except IOError:
# problem with file (cannot open or not found)
continue
else:
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
# regex didn't validate or no file found
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
if options['get'] == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
html.make_links_absolute(req['url'])
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
elem.getparent().remove(elem)
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
elif options['get'] == 'article':
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
else:
raise MorssException('no :get option passed')
else:
output = req['data']
# return html page
headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
}
@middleware
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
@middleware
def cgi_error_handler(environ, start_response, app):
try:
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html', 'x-morss-error': repr(e)}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
@middleware
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
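Since cgi_encode is applied last in the chain below, the WSGI server always receives bytes no matter what the inner layers yield; a one-line worked example of that coercion:
chunks = ['<rss/>', b'raw bytes', 42]
print([x if isinstance(x, bytes) else str(x).encode('utf-8') for x in chunks])
# [b'<rss/>', b'raw bytes', b'42']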
application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
def get_environ(self):
env = super().get_environ()
env['REQUEST_URI'] = self.path
return env
def cgi_start_server():
caching.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
caching.default_cache.autotrim()

View File

@@ -1,4 +0,0 @@
lxml
python-dateutil <= 1.5
chardet
pymysql

View File

@@ -1,14 +1,26 @@
from setuptools import setup, find_packages
from glob import glob
from setuptools import setup
package_name = 'morss'
setup(
name=package_name,
description='Get full-text RSS feeds',
author='pictuga, Samuel Marks',
author_email='contact at pictuga dot com',
url='http://morss.it/',
license='AGPL v3',
package_dir={package_name: package_name},
packages=find_packages(),
package_data={package_name: ['feedify.ini', 'reader.html.template']},
test_suite=package_name + '.tests')
name = package_name,
description = 'Get full-text RSS feeds',
author = 'pictuga, Samuel Marks',
author_email = 'contact at pictuga dot com',
url = 'http://morss.it/',
download_url = 'https://git.pictuga.com/pictuga/morss',
license = 'AGPL v3',
packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
extras_require = {'full': ['pymysql', 'redis', 'diskcache', 'gunicorn', 'gevent']},
package_data = {package_name: ['feedify.ini']},
data_files = [
('share/' + package_name, ['README.md', 'LICENSE']),
('share/' + package_name + '/www', glob('www/*.*')),
('share/' + package_name + '/www/cgi', [])
],
entry_points = {
'console_scripts': [package_name + '=' + package_name + '.__main__:main']
})

View File

@@ -4,6 +4,12 @@ ErrorDocument 403 "Access forbidden"
ErrorDocument 404 /cgi/main.py
ErrorDocument 500 "A very nasty bug found his way onto this very server"
# Uncomment below line to turn debug on for all requests
#SetEnv DEBUG 1
# Uncomment below line to turn debug on for requests with :debug in the url
#SetEnvIf Request_URI :debug DEBUG=1
<Files ~ "\.(py|pyc|db|log)$">
deny from all
</Files>

View File

@@ -4,6 +4,7 @@
<title>morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta charset="UTF-8" />
<link rel="shortcut icon" type="image/svg+xml" href="/logo.svg" sizes="any" />
<style type="text/css">
body
{
@@ -35,8 +36,8 @@
<input type="text" id="url" name="url" placeholder="Feed url (http://example.com/feed.xml)" />
</form>
<code>Copyright: pictuga 2013-2014<br/>
Source code: https://github.com/pictuga/morss</code>
<code>Copyright: pictuga 2013-2020<br/>
Source code: https://git.pictuga.com/pictuga/morss</code>
<script>
form = document.forms[0]

www/logo.svg Normal file (17 lines added, 735 B)
View File

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="16" height="16" viewBox="0 0 16 16" shape-rendering="crispEdges" fill="black" version="1.1" xmlns="http://www.w3.org/2000/svg">
<rect x="2" y="4" width="2" height="2" />
<rect x="5" y="4" width="6" height="2" />
<rect x="12" y="4" width="2" height="2" />
<rect x="2" y="7" width="2" height="2" />
<rect x="7" y="7" width="2" height="2" />
<rect x="12" y="7" width="2" height="2" />
<rect x="2" y="10" width="2" height="2" />
<rect x="7" y="10" width="2" height="2" />
<rect x="12" y="10" width="2" height="2" />
</svg>
<!-- This work by pictuga is licensed under CC BY-NC-SA 4.0. To view a copy of
this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0 -->


View File

@@ -1,5 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.1" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:stylesheet version="1.1"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:atom03="http://purl.org/atom/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rssfake="http://purl.org/rss/1.0/"
>
<xsl:output method="html"/>
@@ -7,116 +14,288 @@
<html>
<head>
<title>RSS feed by morss</title>
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0;" />
<meta name="viewport" content="width=device-width; initial-scale=1.0;" />
<meta name="robots" content="noindex" />
<style type="text/css">
body * {
box-sizing: border-box;
}
body {
overflow-wrap: anywhere;
word-wrap: anywhere;
word-break: break-word;
font-family: sans-serif;
-webkit-tap-highlight-color: transparent; /* safari work around */
}
#url {
background-color: rgba(255, 165, 0, 0.25);
padding: 1% 5%;
display: inline-block;
input, select {
font-family: inherit;
font-size: inherit;
text-align: inherit;
}
header {
text-align: justify;
text-align-last: center;
border-bottom: 1px solid silver;
}
.input-combo {
display: flex;
flex-flow: row;
align-items: stretch;
width: 800px;
max-width: 100%;
}
margin: auto;
body > ul {
border: 1px solid grey;
padding: .5em .5em;
background-color: #FFFAF4;
}
.input-combo * {
display: inline-block;
line-height: 2em;
border: 0;
background: transparent;
}
.input-combo > :not(.button) {
max-width: 100%;
flex-grow: 1;
flex-shrink: 0;
white-space: nowrap;
text-overflow: ellipsis;
overflow: hidden;
}
.input-combo .button {
flex-grow: 0;
flex-shrink: 1;
cursor: pointer;
min-width: 2em;
text-align: center;
border-left: 1px solid silver;
color: #06f;
}
[onclick_title] {
cursor: pointer;
position: relative;
}
[onclick_title]::before {
opacity: 0;
content: attr(onclick_title);
font-weight: normal;
position: absolute;
left: -300%;
z-index: 1;
background: grey;
color: white;
border-radius: 0.5em;
padding: 0 1em;
}
[onclick_title]:not(:active)::before {
transition: opacity 1s ease-in-out;
}
[onclick_title]:active::before {
opacity: 1;
}
header > form {
margin: 1%;
}
header a {
text-decoration: inherit;
color: #FF7B0A;
font-weight: bold;
}
.item {
background-color: #FFFAF4;
border: 1px solid silver;
margin: 1%;
max-width: 100%;
}
.item > * {
padding: 1%;
}
.item > *:empty {
display: none;
}
.item > :not(:last-child) {
border-bottom: 1px solid silver;
}
.item > a {
display: block;
font-weight: bold;
font-size: 1.5em;
}
.desc, .content {
overflow: hidden;
}
.desc *, .content * {
max-width: 100%;
}
ul {
list-style-type: none;
}
.tag {
color: darkred;
}
.attr {
color: darksalmon;
}
.value {
color: darkblue;
}
.comment {
color: lightgrey;
}
pre {
margin: 0;
max-width: 100%;
white-space: normal;
}
</style>
</head>
<body>
<h1>RSS feed by morss</h1>
<header>
<h1>RSS feed by morss</h1>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p>
<p>Your RSS feed is <strong style="color: green">ready</strong>. You
can enter the following url in your newsreader:</p>
<div id="url"></div>
<div class="input-combo">
<input id="url" readonly="readonly"/>
<span class="button" onclick="copy_link()" title="Copy" onclick_title="Copied">
<svg width="16px" height="16px" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" d="M4 1.5H3a2 2 0 00-2 2V14a2 2 0 002 2h10a2 2 0 002-2V3.5a2 2 0 00-2-2h-1v1h1a1 1 0 011 1V14a1 1 0 01-1 1H3a1 1 0 01-1-1V3.5a1 1 0 011-1h1v-1z" clip-rule="evenodd"/>
<path fill-rule="evenodd" d="M9.5 1h-3a.5.5 0 00-.5.5v1a.5.5 0 00.5.5h3a.5.5 0 00.5-.5v-1a.5.5 0 00-.5-.5zm-3-1A1.5 1.5 0 005 1.5v1A1.5 1.5 0 006.5 4h3A1.5 1.5 0 0011 2.5v-1A1.5 1.5 0 009.5 0h-3z" clip-rule="evenodd"/>
</svg>
</span>
</div>
<ul>
<xsl:apply-templates/>
</ul>
<form onchange="open_feed()">
More options: Output the
<select>
<option value="">full-text</option>
<option value=":proxy">original</option>
<option value=":clip" title="original + full-text: keep the original description above the full article. Useful for reddit feeds for example, to keep the comment links">combined (?)</option>
</select>
feed as
<select>
<option value="">RSS</option>
<option value=":format=json:cors">JSON</option>
<option value=":format=html">HTML</option>
<option value=":format=csv">CSV</option>
</select>
using the
<select>
<option value="">standard</option>
<option value=":firstlink" title="Pull the article from the first available link in the description, instead of the standard link. Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">first (?)</option>
</select>
link of the
<select>
<option value="">first</option>
<option value=":newest" title="Select feed items by publication date (instead of appearing order)">newest (?)</option>
</select>
items and
<select>
<option value="">keep</option>
<option value=":nolink:noref">remove</option>
</select>
links
<input type="hidden" value="" name="extra_options"/>
</form>
<p>You can find a <em>preview</em> of the feed below. You need a <em>feed reader</em> for optimal use</p>
<p>Click <a href="/">here</a> to go back to morss and/or to use the tool on another feed</p>
</header>
<div id="header" dir="auto">
<h1>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:title|rss/channel/title|atom:feed/atom:title|atom03:feed/atom03:title"/>
</h1>
<p>
<xsl:value-of select="rdf:RDF/rssfake:channel/rssfake:description|rss/channel/description|atom:feed/atom:subtitle|atom03:feed/atom03:subtitle"/>
</p>
</div>
<div id="content">
<xsl:for-each select="rdf:RDF/rssfake:channel/rssfake:item|rss/channel/item|atom:feed/atom:entry|atom03:feed/atom03:entry">
<div class="item" dir="auto">
<a target="_blank"><xsl:attribute name="href"><xsl:value-of select="rssfake:link|link|atom:link/@href|atom03:link/@href"/></xsl:attribute>
<xsl:value-of select="rssfake:title|title|atom:title|atom03:title"/>
</a>
<div class="desc">
<xsl:copy-of select="rssfake:description|description|atom:summary|atom03:summary"/>
</div>
<div class="content">
<xsl:copy-of select="content:encoded|atom:content|atom03:content"/>
</div>
</div>
</xsl:for-each>
</div>
<script>
document.getElementById("url").innerHTML = window.location.href;
//<![CDATA[
document.getElementById("url").value = window.location.href
if (!/:html/.test(window.location.href))
for (var content of document.querySelectorAll(".desc,.content"))
content.innerHTML = (content.innerText.match(/>/g) || []).length > 3 ? content.innerText : content.innerHTML
var options = parse_location()[0]
if (options) {
for (var select of document.forms[0].elements)
if (select.tagName == 'SELECT')
for (var option of select)
if (option.value && options.match(option.value)) {
select.value = option.value
options = options.replace(option.value, '')
break
}
document.forms[0]['extra_options'].value = options
}
function copy_content(input) {
input.focus()
input.select()
document.execCommand('copy')
input.blur()
}
function copy_link() {
copy_content(document.getElementById("url"))
}
function parse_location() {
return (window.location.pathname + window.location.search).match(/^\/(?:(:[^\/]+)\/)?(.*$)$/).slice(1)
}
function open_feed() {
var url = parse_location()[1]
var options = Array.from(document.forms[0].elements).map(x=>x.value).join('')
var target = '/' + (options ? options + '/' : '') + url
if (target != window.location.pathname)
window.location.href = target
}
//]]>
</script>
</body>
</html>
</xsl:template>
<xsl:template match="*">
<li>
<span class="element">
&lt;
<span class="tag"><xsl:value-of select="name()"/></span>
<xsl:for-each select="@*">
<span class="attr"> <xsl:value-of select="name()"/></span>
=
"<span class="value"><xsl:value-of select="."/></span>"
</xsl:for-each>
&gt;
</span>
<xsl:if test="node()">
<ul>
<xsl:apply-templates/>
</ul>
</xsl:if>
<span class="element">
&lt;/
<span class="tag"><xsl:value-of select="name()"/></span>
&gt;
</span>
</li>
</xsl:template>
<xsl:template match="comment()">
<li>
<pre class="comment"><![CDATA[<!--]]><xsl:value-of select="."/><![CDATA[-->]]></pre>
</li>
</xsl:template>
<xsl:template match="text()">
<li>
<pre>
<xsl:value-of select="normalize-space(.)"/>
</pre>
</li>
</xsl:template>
<xsl:template match="text()[not(normalize-space())]"/>
</xsl:stylesheet>