Save auto version number

Fixed #108
Clean up sqlite code
2023-06-27 22:36:29 +02:00 · 2023-06-26 01:30:47 +02:00 · 2023-06-26 01:29:00 +02:00 · 2023-06-26 01:28:33 +02:00 · 2023-06-24 01:50:12 +02:00 · 2023-06-23 23:14:32 +02:00
37 changed files with 9779 additions and 225 deletions
--- a/.drone.yml
+++ b/.drone.yml
@@ -1,93 +0,0 @@
----
-kind: pipeline
-name: test
-
-steps:
-- name: lint
-  image: alpine:edge
-  commands:
-  - apk add --no-cache python3 py3-lxml py3-setproctitle py3-pip py3-wheel py3-enchant hunspell-en
-  - pip3 install --no-cache-dir .[full] .[dev]
-  - isort --check-only --diff .
-  - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
-
----
-kind: pipeline
-name: python
-
-steps:
-- name: publish
-  image: plugins/pypi
-  settings:
-    username:
-      from_secret: pypi_user
-    password:
-      from_secret: pypi_pwd
-  commands:
-    - /bin/drone-pypi
-    - cp dist/morss-*.tar.gz dist/morss.tar.gz
-
-- name: push
-  image: appleboy/drone-scp
-  settings:
-    host:
-      from_secret: ssh_host
-    username:
-      from_secret: ssh_user
-    key:
-      from_secret: ssh_key
-    source:
-      - dist/morss.tar.gz
-    target: /home/ubuntu
-
-- name: deploy
-  image: appleboy/drone-ssh
-  settings:
-    host:
-      from_secret: ssh_host
-    username:
-      from_secret: ssh_user
-    key:
-      from_secret: ssh_key
-    script_stop: true
-    script:
-      - sudo pip install --upgrade dist/morss.tar.gz[full]
-      - sudo rm -r dist
-      - sudo morss-helper reload
-
-trigger:
-  branch:
-  - master
-  event:
-  - push
-
-depends_on:
-- test
-
----
-kind: pipeline
-name: docker
-
-steps:
-- name: publish
-  image: thegeeklab/drone-docker-buildx
-  # NB. this requires qemu installed on host
-  privileged: true
-  settings:
-    username:
-      from_secret: docker_user
-    password:
-      from_secret: docker_pwd
-    repo:
-      from_secret: docker_repo
-    tags: latest
-    platforms: linux/amd64,linux/arm64,linux/arm/v7
-
-trigger:
-  branch:
-  - master
-  event:
-  - push
-
-depends_on:
-- test
--- a/.github/workflows/default.yml
+++ b/.github/workflows/default.yml
@@ -0,0 +1,80 @@
+name: default
+on:
+    push:
+        branches:
+            - master
+
+jobs:
+    test-lint:
+        runs-on: ubuntu-latest
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v3
+              with:
+                  fetch-depth: 0
+
+            - name: Prepare image
+              run: apt-get -y update && apt-get -y install python3-pip libenchant-2-2 aspell-en
+
+            - name: Install dependencies
+              run: pip3 install .[full] .[dev]
+            - run: isort --check-only --diff .
+            - run: pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
+            - run: pytest --cov=morss tests
+
+    python-publish:
+        needs: test-lint  # only publish once lint & tests pass (was `depends_on: test`)
+        runs-on: ubuntu-latest
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v3
+              with:
+                  fetch-depth: 0
+
+            - name: Prepare image
+              run: apt-get -y update && apt-get -y install python3-pip python3-build
+
+            - name: Build package
+              run: python3 -m build
+
+            - name: Publish package
+              uses: https://github.com/pypa/gh-action-pypi-publish@release/v1
+              with:
+                  password: ${{ secrets.pypi_api_token }}
+
+    docker-publish-deploy:
+        needs: test-lint  # only publish & deploy once lint & tests pass (was `depends_on: test`)
+        runs-on: ubuntu-latest
+        container:
+            image: catthehacker/ubuntu:act-latest
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v3
+
+            - name: Set up QEMU
+              uses: https://github.com/docker/setup-qemu-action@v2
+
+            - name: Set up Docker Buildx
+              uses: https://github.com/docker/setup-buildx-action@v2
+
+            - name: Login to Docker Hub
+              uses: https://github.com/docker/login-action@v2
+              with:
+                  username: ${{ secrets.docker_user }}
+                  password: ${{ secrets.docker_pwd }}
+
+            - name: Build and push
+              uses: https://github.com/docker/build-push-action@v4
+              with:
+                  context: .
+                  platforms: linux/amd64,linux/arm64,linux/arm/v7
+                  push: true
+                  tags: ${{ secrets.docker_repo }}
+
+            - name: Deploy on server
+              uses: https://github.com/appleboy/ssh-action@v0.1.10
+              with:
+                  host: ${{ secrets.ssh_host }}
+                  username: ${{ secrets.ssh_user }}
+                  key: ${{ secrets.ssh_key }}
+                  script: morss-update
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Some features of morss:
 - Follow 301/meta redirects
 - Recover xml feeds with corrupt encoding
 - Supports gzip-compressed http content
-- HTTP caching with different backends (in-memory/sqlite/mysql/redis/diskcache)
+- HTTP caching with different backends (in-memory/redis/diskcache)
 - Works as server/cli tool
 - Deobfuscate various tracking links

@@ -81,9 +81,9 @@ From git
 pip install git+https://git.pictuga.com/pictuga/morss.git#egg=morss[full]
 ```

-The full install includes all the cache backends. Otherwise, only in-memory and
-sqlite3 caches are available. The full install also includes gunicorn (for more
-efficient HTTP handling).
+The full install includes all the cache backends. Otherwise, only in-memory
+cache is available. The full install also includes gunicorn (for more efficient
+HTTP handling).

 The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
 C code needs to be compiled). If possible on your distribution, try installing
@@ -103,7 +103,7 @@ With cli
 docker pull pictuga/morss
 ```

-With docker-compose
+With docker-compose **(recommended)**

 ```yml
 services:
@@ -215,7 +215,7 @@ From source
 docker run -p 8000:8000 morss
 ```

-With docker-compose
+With docker-compose **(recommended)**

 ```shell
 docker-compose up
@@ -353,7 +353,7 @@ Using cache and passing arguments:
 ```python
 >>> import morss
 >>> url = 'http://feeds.bbci.co.uk/news/rss.xml'
->>> cache = '/tmp/morss-cache.db' # sqlite cache location
+>>> cache = '/tmp/morss-cache' # diskcache cache location
 >>> options = {'csv':True}
 >>> xml_string = morss.process(url, cache, options)
 >>> xml_string[:50]
@@ -367,11 +367,10 @@ under the hood.
 Doing it step-by-step:

 ```python
-import morss, morss.crawler
+import morss

 url = 'http://newspaper.example/feed.xml'
 options = morss.Options(csv=True) # arguments
-morss.crawler.sqlite_default = '/tmp/morss-cache.db' # sqlite cache location

 url, rss = morss.FeedFetch(url, options) # this only grabs the RSS feed
 rss = morss.FeedGather(rss, url, options) # this fills the feed and cleans it up
@@ -395,8 +394,8 @@ usage: morss [-h] [--post STRING] [--xpath XPATH]
             [--indent] [--cache] [--force] [--proxy]
             [--order {first,last,newest,oldest}] [--firstlink] [--resolve]
             [--items XPATH] [--item_link XPATH] [--item_title XPATH]
-             [--item_content XPATH] [--item_time XPATH] [--nolink] [--noref]
-             [--silent]
+             [--item_content XPATH] [--item_time XPATH]
+             [--mode {xml,html,json}] [--nolink] [--noref] [--silent]
             url

 Get full-text RSS feeds
@@ -440,6 +439,8 @@ custom feeds:
  --item_content XPATH  entry's content
  --item_time XPATH     entry's date & time (accepts a wide range of time
                        formats)
+  --mode {xml,html,json}
+                        parser to use for the custom feeds

 misc:
  --nolink              drop links, but keeps links' inner text
@@ -501,11 +502,6 @@ be dropped from the feed, even if they're cached. `-1` for unlimited.
 morss uses caching to make loading faster. There are 3 possible cache backends:

 - `(nothing/default)`: a simple python in-memory dict-like object.
-- `CACHE=sqlite`: sqlite3 cache. Default file location is in-memory (i.e. it
-will be cleared every time the program is run). Path can be defined with
-`SQLITE_PATH`.
-- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
-environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
 - `CACHE=redis`: Redis cache. Connection can be defined with the following
 environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
 - `CACHE=diskcache`: disk-based cache. Target directory canbe defined with
--- a/morss.service
+++ b/morss.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=morss server (gunicorn)
+After=network.target
+
+[Service]
+ExecStart=/usr/local/bin/morss-helper run
+ExecReload=/usr/local/bin/morss-helper reload
+KillMode=process
+Restart=always
+User=http
+
+[Install]
+WantedBy=multi-user.target
--- a/morss/__init__.py
+++ b/morss/__init__.py
@@ -19,5 +19,7 @@

 # pylint: disable=unused-import,unused-variable

+__version__ = ""
+
 from .morss import *
 from .wsgi import application
--- a/morss/caching.py
+++ b/morss/caching.py
@@ -16,7 +16,6 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.

 import os
-import pickle
 import threading
 import time
 from collections import OrderedDict
@@ -51,83 +50,6 @@ class BaseCache:
            return True


-try:
-    import sqlite3 # isort:skip
-except ImportError:
-    pass
-
-
-class SQLiteCache(BaseCache):
-    def __init__(self, path=':memory:'):
-        self.con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
-
-        with self.con:
-            self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
-            self.con.execute('pragma journal_mode=WAL')
-
-        self.trim()
-
-    def __del__(self):
-        self.con.close()
-
-    def trim(self):
-        with self.con:
-            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, key):
-        row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1]
-
-    def __setitem__(self, key, data):
-        with self.con:
-            self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))
-
-
-try:
-    import pymysql.cursors # isort:skip
-except ImportError:
-    pass
-
-
-class MySQLCacheHandler(BaseCache):
-    def __init__(self, user, password, database, host='localhost'):
-        self.user = user
-        self.password = password
-        self.database = database
-        self.host = host
-
-        with self.cursor() as cursor:
-            cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')
-
-        self.trim()
-
-    def cursor(self):
-        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
-
-    def trim(self):
-        with self.cursor() as cursor:
-            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, key):
-        cursor = self.cursor()
-        cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
-        row = cursor.fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1]
-
-    def __setitem__(self, key, data):
-        with self.cursor() as cursor:
-            cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
-                (key, data, time.time(), data, time.time()))
-
-
 class CappedDict(OrderedDict, BaseCache):
    def trim(self):
        if CACHE_SIZE >= 0:
@@ -182,20 +104,7 @@ class DiskCacheHandler(BaseCache):


 if 'CACHE' in os.environ:
-    if os.environ['CACHE'] == 'mysql':
-        default_cache = MySQLCacheHandler(
-            user = os.getenv('MYSQL_USER'),
-            password = os.getenv('MYSQL_PWD'),
-            database = os.getenv('MYSQL_DB'),
-            host = os.getenv('MYSQL_HOST', 'localhost')
-        )
-
-    elif os.environ['CACHE'] == 'sqlite':
-        default_cache = SQLiteCache(
-            os.getenv('SQLITE_PATH', ':memory:')
-        )
-
-    elif os.environ['CACHE'] == 'redis':
+    if os.environ['CACHE'] == 'redis':
        default_cache = RedisCacheHandler(
            host = os.getenv('REDIS_HOST', 'localhost'),
            port = int(os.getenv('REDIS_PORT', 6379)),
--- a/morss/cli.py
+++ b/morss/cli.py
@@ -54,6 +54,7 @@ def cli_app():
    group.add_argument('--item_title', action='store', type=str, metavar='XPATH', help='entry\'s title')
    group.add_argument('--item_content', action='store', type=str, metavar='XPATH', help='entry\'s content')
    group.add_argument('--item_time', action='store', type=str, metavar='XPATH', help='entry\'s date & time (accepts a wide range of time formats)')
+    group.add_argument('--mode', default=None, choices=('xml', 'html', 'json'), help='parser to use for the custom feeds')

    group = parser.add_argument_group('misc')
    group.add_argument('--nolink', action='store_true', help='drop links, but keeps links\' inner text')
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -38,12 +38,12 @@ try:
    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                         Request, addinfourl, build_opener, parse_http_list,
                         parse_keqv_list)
-    from urlparse import urlparse, urlunparse
+    from urlparse import urlsplit
 except ImportError:
    # python 3
    from email import message_from_string
    from http.client import HTTPMessage
-    from urllib.parse import quote, urlparse, urlunparse
+    from urllib.parse import quote, urlsplit
    from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                HTTPRedirectHandler, Request, addinfourl,
                                build_opener, parse_http_list, parse_keqv_list)
@@ -163,10 +163,20 @@ def is_ascii(string):
        return True


+def soft_quote(string):
+    " url-quote only when not a valid ascii string, keeping url delimiters as is "
+
+    if is_ascii(string):
+        return string
+
+    # keep reserved chars (& existing %-escapes), so that `a=é&b=1` remains a usable query
+    return quote(string.encode('utf-8'), safe="/%=&+;:@,!$'()*~?")
+
+
 def sanitize_url(url):
    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
-        url = url.decode()
+        url = url.decode('utf-8')

    # make sure there's a protocol (http://)
    if url.split(':', 1)[0] not in PROTOCOL:
@@ -179,18 +189,19 @@ def sanitize_url(url):
    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
+    parts = urlsplit(url)

-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
+    parts = parts._replace(
+        netloc=parts.netloc.replace(
+            parts.hostname,
+            parts.hostname.encode('idna').decode('ascii')
+            ),
+        path=soft_quote(parts.path),
+        query=soft_quote(parts.query),
+        fragment=soft_quote(parts.fragment),
+    )

-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
+    return parts.geturl()


 class RespDataHandler(BaseHandler):
@@ -357,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
 def iter_html_tag(html_str, tag_name):
    " To avoid parsing whole pages when looking for a simple tag "

-    re_tag = r'<%s(\s*[^>])*>' % tag_name
+    re_tag = r'<%s\s+[^>]+>' % tag_name
    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag_match in re.finditer(re_tag, html_str):
@@ -414,7 +425,7 @@ class HTTPRefreshHandler(BaseHandler):
    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('refresh'):
-                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
+                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
                match = re.search(regex, resp.headers.get('refresh'))

                if match:
--- a/morss/feedify.ini
+++ b/morss/feedify.ini
@@ -90,9 +90,6 @@ item_updated = updated
 [html]
 mode = html

-path =
-  http://localhost/
-
 title = //div[@id='header']/h1
 desc = //div[@id='header']/p
 items = //div[@id='content']/div
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -699,7 +699,7 @@ class Feed(object):
                try:
                    setattr(item, attr, new[attr])

-                except (IndexError, TypeError):
+                except (KeyError, IndexError, TypeError):
                    pass

        return item
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -17,6 +17,7 @@

 import os
 import re
+import sys
 import time
 from datetime import datetime
 from fnmatch import fnmatch
@@ -59,7 +60,7 @@ def log(txt):

        else:
            # when using internal server or cli
-            print(repr(txt))
+            print(repr(txt), file=sys.stderr)


 def len_html(txt):
@@ -286,6 +287,9 @@ def FeedFetch(url, options):

        ruleset['items'] = options.items

+        if options.mode:
+            ruleset['mode'] = options.mode
+
        ruleset['title'] = options.get('title', '//head/title')
        ruleset['desc'] = options.get('desc', '//head/meta[@name="description"]/@content')

@@ -424,7 +428,7 @@ def process(url, cache=None, options=None):
    options = Options(options)

    if cache:
-        caching.default_cache = caching.SQLiteCache(cache)
+        caching.default_cache = caching.DiskCacheHandler(cache)

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
--- a/setup.py
+++ b/setup.py
@@ -3,11 +3,33 @@ from glob import glob

 from setuptools import setup

+
+def get_version():
+    " Return the version hard coded in morss/__init__.py, creating (& saving) one if empty "
+    with open('morss/__init__.py', 'r+') as file:
+        lines = file.readlines()
+        # look for hard coded version number
+        for i, line in enumerate(lines):
+            if line.startswith('__version__'):
+                version = line.split('"')[1]
+                break
+        else:
+            # no `__version__` line to fill in: use a timestamp, without saving it
+            return datetime.now().strftime('%Y%m%d.%H%M')
+
+        # create (& save) one if none found
+        if version == '':
+            version = datetime.now().strftime('%Y%m%d.%H%M')
+            lines[i] = '__version__ = "' + version + '"\n'
+            file.seek(0)
+            file.writelines(lines)
+        return version
+
 package_name = 'morss'

 setup(
    name = package_name,
-    version = datetime.now().strftime('%Y%m%d.%H%M'),
+    version = get_version(),
    description = 'Get full-text RSS feeds',
    long_description = open('README.md').read(),
    long_description_content_type = 'text/markdown',
@@ -22,8 +44,8 @@ setup(
    packages = [package_name],
    install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
    extras_require = {
-        'full': ['pymysql', 'redis', 'diskcache', 'gunicorn', 'setproctitle'],
-        'dev': ['pylint']
+        'full': ['redis', 'diskcache', 'gunicorn', 'setproctitle'],
+        'dev': ['pylint', 'pyenchant', 'pytest', 'pytest-cov'],
    },
    python_requires = '>=2.7',
    package_data = {package_name: ['feedify.ini']},
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,60 @@
+import os
+import os.path
+import threading
+
+import pytest
+
+try:
+    # python2
+    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+    from SimpleHTTPServer import SimpleHTTPRequestHandler
+except ImportError:
+    # python3
+    from http.server import (BaseHTTPRequestHandler, HTTPServer,
+                             SimpleHTTPRequestHandler)
+
+class HTTPReplayHandler(SimpleHTTPRequestHandler):
+    " Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
+
+    directory = os.path.join(os.path.dirname(__file__), './samples/')
+
+    __init__ = BaseHTTPRequestHandler.__init__
+
+    def do_GET(self):
+        path = self.translate_path(self.path)
+
+        if os.path.isdir(path):
+            f = self.list_directory(path)
+
+        else:
+            f = open(path, 'rb')
+
+        try:
+            self.copyfile(f, self.wfile)
+
+        finally:
+            f.close()
+
+class MuteHTTPServer(HTTPServer):
+    def handle_error(self, request, client_address):
+        # mute errors
+        pass
+
+def make_server(port=8888):
+    print('Serving http://localhost:%s/' % port)
+    return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
+
+@pytest.fixture
+def replay_server():
+    " Run the replay server in a background thread for the duration of a test "
+    httpd = make_server()
+    thread = threading.Thread(target=httpd.serve_forever)
+    thread.start()
+    yield
+    httpd.shutdown()
+    thread.join()
+    httpd.server_close() # free the port right away for the next test's server
+
+if __name__ == '__main__':
+    httpd = make_server()
+    httpd.serve_forever()
--- a/tests/samples/200-ok.txt
+++ b/tests/samples/200-ok.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 200 OK
+content-type: text/plain
+
+success
--- a/tests/samples/301-redirect-abs.txt
+++ b/tests/samples/301-redirect-abs.txt
@@ -0,0 +1,3 @@
+HTTP/1.1 301 Moved Permanently
+location: /200-ok.txt
+
--- a/tests/samples/301-redirect-rel.txt
+++ b/tests/samples/301-redirect-rel.txt
@@ -0,0 +1,3 @@
+HTTP/1.1 301 Moved Permanently
+location: ./200-ok.txt
+
--- a/tests/samples/301-redirect-url.txt
+++ b/tests/samples/301-redirect-url.txt
@@ -0,0 +1,3 @@
+HTTP/1.1 301 Moved Permanently
+location: http://localhost:8888/200-ok.txt
+
--- a/tests/samples/308-redirect.txt
+++ b/tests/samples/308-redirect.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 308 Permanent Redirect
+location: /200-ok.txt
+
+/200-ok.txt
--- a/tests/samples/alternate-abs.txt
+++ b/tests/samples/alternate-abs.txt
@@ -0,0 +1,8 @@
+HTTP/1.1 200 OK
+content-type: text/html; charset=UTF-8
+
+<!DOCTYPE html>
+<html>
+<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
+<body>meta redirect</body>
+</html>
--- a/tests/samples/enc-gb2312-header.txt
+++ b/tests/samples/enc-gb2312-header.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 200 OK
+content-type: text/plain; charset=gb2312
+
+<EFBFBD>ɹ<EFBFBD>
--- a/tests/samples/enc-gb2312-meta.txt
+++ b/tests/samples/enc-gb2312-meta.txt
@@ -0,0 +1,10 @@
+HTTP/1.1 200 OK
+content-type: text/html
+
+
+<!DOCTYPE html>
+<html>
+<head><meta charset="gb2312"/></head>
+<body>
+<EFBFBD>ɹ<EFBFBD>
+</body></html>
--- a/tests/samples/enc-iso-8859-1-header.txt
+++ b/tests/samples/enc-iso-8859-1-header.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 200 OK
+content-type: text/plain; charset=iso-8859-1
+
+succ<EFBFBD>s
--- a/tests/samples/enc-iso-8859-1-missing.txt
+++ b/tests/samples/enc-iso-8859-1-missing.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 200 OK
+content-type: text/plain
+
+succ<EFBFBD>s
--- a/tests/samples/enc-utf-8-header.txt
+++ b/tests/samples/enc-utf-8-header.txt
@@ -0,0 +1,4 @@
+HTTP/1.1 200 OK
+content-type: text/plain; charset=UTF-8
+
+succès
--- a/tests/samples/feed-atom-utf-8.txt
+++ b/tests/samples/feed-atom-utf-8.txt
@@ -0,0 +1,16 @@
+HTTP/1.1 200 OK
+Content-Type: text/xml; charset=utf-8
+
+<?xml version='1.0' encoding='utf-8'?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+	<title>!TITLE!</title>
+	<subtitle>!DESC!</subtitle>
+	<entry>
+		<title>!ITEM_TITLE!</title>
+		<summary>!ITEM_DESC!</summary>
+		<content type="html">!ITEM_CONTENT!</content>
+		<link href="!ITEM_LINK!"/>
+		<updated>2022-01-01T00:00:01+01:00</updated>
+		<published>2022-01-01T00:00:02+01:00</published>
+	</entry>
+</feed>
--- a/tests/samples/feed-atom03-utf-8.txt
+++ b/tests/samples/feed-atom03-utf-8.txt
@@ -0,0 +1,15 @@
+HTTP/1.1 200 OK
+content-type: application/xml
+
+<?xml version='1.0' encoding='utf-8' ?>
+<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
+	<title>!TITLE!</title>
+	<subtitle>!DESC!</subtitle>
+	<entry>
+		<title>!ITEM_TITLE!</title>
+		<link rel='alternate' type='text/html' href='!ITEM_LINK!' />
+		<summary>!ITEM_DESC!</summary>
+		<content>!ITEM_CONTENT!</content>
+		<issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
+	</entry>
+</feed>
--- a/tests/samples/feed-html-utf-8.txt
+++ b/tests/samples/feed-html-utf-8.txt
@@ -0,0 +1,22 @@
+HTTP/1.1 200 OK
+Content-Type: text/html; charset=utf-8
+
+<html>
+<head></head>
+
+<body>
+<div id="header">
+	<h1>!TITLE!</h1>
+	<p>!DESC!</p>
+</div>
+
+<div id="content">
+	<div class="item">
+		<a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
+		<div class="desc">!ITEM_DESC!</div>
+		<div class="content">!ITEM_CONTENT!</div>
+	</div>
+</div>
+
+</body>
+</html>
--- a/tests/samples/feed-json-utf-8.txt
+++ b/tests/samples/feed-json-utf-8.txt
@@ -0,0 +1,16 @@
+HTTP/1.1 200 OK
+Content-Type: application/json; charset=utf-8
+
+{
+	"title": "!TITLE!",
+	"desc": "!DESC!",
+	"items": [
+		{
+			"title": "!ITEM_TITLE!",
+			"time": "2022-01-01T00:00:01+0100",
+			"url": "!ITEM_LINK!",
+			"desc": "!ITEM_DESC!",
+			"content": "!ITEM_CONTENT!"
+		}
+	]
+}
--- a/tests/samples/feed-rss-channel-utf-8.txt
+++ b/tests/samples/feed-rss-channel-utf-8.txt
@@ -0,0 +1,17 @@
+HTTP/1.1 200 OK
+Content-Type: text/xml; charset=utf-8
+
+<?xml version='1.0' encoding='utf-8'?>
+<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
+  <channel>
+    <title>!TITLE!</title>
+    <description>!DESC!</description>
+    <item>
+      <title>!ITEM_TITLE!</title>
+      <pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
+      <link>!ITEM_LINK!</link>
+      <description>!ITEM_DESC!</description>
+      <content:encoded>!ITEM_CONTENT!</content:encoded>
+    </item>
+  </channel>
+</rss>
--- a/tests/samples/gzip.txt
+++ b/tests/samples/gzip.txt
--- a/tests/samples/header-refresh.txt
+++ b/tests/samples/header-refresh.txt
@@ -0,0 +1,3 @@
+HTTP/1.1 200 OK
+refresh: 0;url=/200-ok.txt
+
--- a/tests/samples/meta-redirect-abs.txt
+++ b/tests/samples/meta-redirect-abs.txt
@@ -0,0 +1,8 @@
+HTTP/1.1 200 OK
+content-type: text/html; charset=UTF-8
+
+<!DOCTYPE html>
+<html>
+<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
+<body>meta redirect</body>
+</html>
--- a/tests/samples/meta-redirect-rel.txt
+++ b/tests/samples/meta-redirect-rel.txt
@@ -0,0 +1,8 @@
+HTTP/1.1 200 OK
+content-type: text/html; charset=UTF-8
+
+<!DOCTYPE html>
+<html>
+<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
+<body>meta redirect</body>
+</html>
--- a/tests/samples/meta-redirect-url.txt
+++ b/tests/samples/meta-redirect-url.txt
@@ -0,0 +1,8 @@
+HTTP/1.1 200 OK
+content-type: text/html; charset=UTF-8
+
+<!DOCTYPE html>
+<html>
+<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
+<body>meta redirect</body>
+</html>
--- a/tests/samples/size-1MiB.txt
+++ b/tests/samples/size-1MiB.txt
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -0,0 +1,62 @@
+import pytest
+
+from morss.crawler import *
+
+
+def test_get(replay_server):
+    assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
+
+def test_adv_get(replay_server):
+    assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
+
+@pytest.mark.parametrize('before,after', [
+    (b'http://localhost:8888/',     'http://localhost:8888/'),
+    ('localhost:8888/',             'http://localhost:8888/'),
+    ('http:/localhost:8888/',       'http://localhost:8888/'),
+    ('http://localhost:8888/&/',     'http://localhost:8888/&/'),
+    ('http://localhost:8888/ /',    'http://localhost:8888/%20/'),
+    ('http://localhost-€/€/',       'http://xn--localhost--077e/%E2%82%AC/'),
+    ('http://localhost-€:8888/€/',  'http://xn--localhost--077e:8888/%E2%82%AC/'),
+    ])
+def test_sanitize_url(before, after):
+    assert sanitize_url(before) == after
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
+def test_size_limit_handler(replay_server, opener):
+    assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
+def test_gzip_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
+@pytest.mark.parametrize('url', [
+    'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
+    'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
+    'enc-utf-8-header.txt',
+    ])
+def test_encoding_fix_handler(replay_server, opener, url):
+    out = adv_get('http://localhost:8888/%s' % url)
+    out = out['data'].decode(out['encoding'])
+    assert 'succes' in out or 'succès' in out or '成功' in out
+
+@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
+def test_alternate_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
+def test_http_equiv_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
+def test_http_all_redirect_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+    assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
+
+@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
+def test_http_refresh_handler(replay_server, opener):
+    assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
--- a/tests/test_feeds.py
+++ b/tests/test_feeds.py
@@ -0,0 +1,108 @@
+import pytest
+
+from morss.crawler import adv_get
+from morss.feeds import *
+
+
+def get_feed(url):
+    url = 'http://localhost:8888/%s' % url
+    out = adv_get(url)
+    feed = parse(out['data'], url=url, encoding=out['encoding'])
+    return feed
+
+def check_feed(feed):
+    # NB. time and updated not covered
+    assert feed.title == '!TITLE!'
+    assert feed.desc == '!DESC!'
+    assert feed[0] == feed.items[0]
+    assert feed[0].title == '!ITEM_TITLE!'
+    assert feed[0].link == '!ITEM_LINK!'
+    assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
+    assert '!ITEM_CONTENT!' in feed[0].content
+
+def check_output(feed):
+    output = feed.tostring()
+    assert '!TITLE!' in output
+    assert '!DESC!' in output
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
+
+def check_change(feed):
+    feed.title = '!TITLE2!'
+    feed.desc = '!DESC2!'
+    feed[0].title = '!ITEM_TITLE2!'
+    feed[0].link = '!ITEM_LINK2!'
+    feed[0].desc = '!ITEM_DESC2!'
+    feed[0].content = '!ITEM_CONTENT2!'
+
+    assert feed.title == '!TITLE2!'
+    assert feed.desc == '!DESC2!'
+    assert feed[0].title == '!ITEM_TITLE2!'
+    assert feed[0].link == '!ITEM_LINK2!'
+    assert '!ITEM_DESC2!' in feed[0].desc
+    assert '!ITEM_CONTENT2!' in feed[0].content
+
+def check_add(feed):
+    feed.append({
+        'title': '!ITEM_TITLE3!',
+        'link': '!ITEM_LINK3!',
+        'desc': '!ITEM_DESC3!',
+        'content': '!ITEM_CONTENT3!',
+    })
+
+    assert feed[1].title == '!ITEM_TITLE3!'
+    assert feed[1].link == '!ITEM_LINK3!'
+    assert '!ITEM_DESC3!' in feed[1].desc
+    assert '!ITEM_CONTENT3!' in feed[1].content
+
+each_format = pytest.mark.parametrize('url', [
+    'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
+    'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
+    ])
+
+each_check = pytest.mark.parametrize('check', [
+    check_feed, check_output, check_change, check_add,
+    ])
+
+@each_format
+@each_check
+def test_parse(replay_server, url, check):
+    feed = get_feed(url)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_rss(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedXML)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_json(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedJSON)
+    check(feed)
+
+@each_format
+@each_check
+def test_convert_html(replay_server, url, check):
+    feed = get_feed(url)
+    feed = feed.convert(FeedHTML)
+    if len(feed) > 1:
+        # remove the 'blank' default html item
+        del feed[0]
+    check(feed)
+
+@each_format
+def test_convert_csv(replay_server, url):
+    # only csv output exists (there is no csv feed type), so the check is different
+    feed = get_feed(url)
+    output = feed.tocsv()
+
+    assert '!ITEM_TITLE!' in output
+    assert '!ITEM_LINK!' in output
+    assert '!ITEM_DESC!' in output
+    assert '!ITEM_CONTENT!' in output
Author	SHA1	Message	Date
pictuga	c5b2df754e	Save auto version number All checks were successful default / test-lint (push) Successful in 1m31s Details default / python-publish (push) Successful in 35s Details default / docker-publish-deploy (push) Successful in 1m56s Details Fixed #108	2023-06-27 22:36:29 +02:00
pictuga	6529fdbdd8	Clean up sqlite code All checks were successful default / test-lint (push) Successful in 1m26s Details default / python-publish (push) Successful in 30s Details default / docker-publish-deploy (push) Successful in 1m35s Details	2023-06-26 01:30:47 +02:00
pictuga	f4da40fffb	actions: fix deploy	2023-06-26 01:29:00 +02:00
pictuga	d27fc93f75	actions: clean up	2023-06-26 01:28:33 +02:00
pictuga	dfb2b83c06	actions: fix python setup Some checks reported warnings default / publish-deploy (push) Has been cancelled Details default / docker-publish (push) Has been cancelled Details default / test-lint (push) Successful in 1m33s Details	2023-06-24 01:50:12 +02:00
pictuga	4340b678d0	actions: change image Some checks failed default / test-lint (push) Failing after 23s Details default / publish-deploy (push) Failing after 12s Details default / docker-publish (push) Successful in 2m19s Details	2023-06-23 23:14:32 +02:00
pictuga	ff9503b0d0	Switch from Drone to Gitea Actions Some checks failed default / publish-deploy (push) Failing after 45s Details default / docker-publish (push) Failing after 10s Details default / test-lint (push) Failing after 11m42s Details	2023-05-17 22:54:05 +02:00
Nesswit	8bdcd8f386	Add `mode` option	2023-05-04 16:01:52 +09:00
pictuga	ea2ebedfcb	Added systemd service file Some checks failed continuous-integration/drone/push Build is failing Details Fixing #94	2022-12-13 23:01:42 +01:00
pictuga	438c32a312	Remove sqlite & mysql cache backends Some checks failed continuous-integration/drone/push Build is failing Details Obsoleted since the introduction of diskcache & redis	2022-12-13 22:40:13 +01:00
pictuga	8b26797e93	README: add recommended install way Some checks reported errors continuous-integration/drone/push Build was killed Details continuous-integration/drone Build is passing Details Part of discussions on #94	2022-12-13 22:07:21 +01:00
pictuga	e1ed33f320	crawler: improve html iter code All checks were successful continuous-integration/drone/push Build is passing Details Ignores tags without attributes. Avoids bug with unclosed tags.	2022-02-09 15:57:12 +01:00
pictuga	b65272daab	crawler: accept more meta redirects All checks were successful continuous-integration/drone/push Build is passing Details	2022-02-01 23:32:49 +01:00
pictuga	4d64afe9cb	crawler: fix regression from `d6b90448f3` Some checks failed continuous-integration/drone/push Build is failing Details	2022-02-01 23:18:16 +01:00
pictuga	d3b623482d	pytest: crawler	2022-02-01 23:16:43 +01:00
pictuga	32645548c2	pytest: first batch with test_feeds Some checks failed continuous-integration/drone/push Build is failing Details And multiple related fixes	2022-01-31 08:32:34 +01:00
pictuga	d6b90448f3	crawler: improve handling of non-ascii urls	2022-01-30 23:27:49 +01:00
pictuga	da81edc651	log to stderr Some checks failed continuous-integration/drone/push Build is failing Details	2022-01-26 07:57:57 +01:00