Compare commits: master...000a5cda7a (14 commits)

000a5cda7a
f2efd56e8f
52e73331b8
046f3f9f3d
db8e046eae
b4b1e93289
8a329fbb6d
77159b99ca
c158e65192
0d64964a02
e8271ae9a0
2abe061422
57bd94d42f
69cdf05341
.drone.yml (new file)
@@ -0,0 +1,15 @@
+kind: pipeline
+name: default
+
+steps:
+- name: isort
+  image: python:alpine
+  commands:
+  - pip install isort
+  - isort --check-only --diff .
+- name: pylint
+  image: alpine
+  commands:
+  - apk add --no-cache python3 py3-lxml py3-pip py3-wheel py3-pylint py3-enchant hunspell-en
+  - pip3 install --no-cache-dir .
+  - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
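Note: the pipeline gates merges on import order (isort) and on a pylint score of at least 8/10; the alpine step installs py3-enchant and hunspell-en because .pylintrc (below) enables pylint's spelling checker. The same two checks can be approximated from Python (a sketch, assuming isort >= 5 and a recent pylint; the file path is only an example):

    import isort                 # pip install isort
    from pylint.lint import Run  # pip install pylint

    # rough equivalent of `isort --check-only --diff .` for one file
    clean = isort.check_file('morss/cache.py', show_diff=True)

    # rough equivalent of `pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8`
    result = Run(['morss', '--rcfile=.pylintrc', '--disable=C,R,W'], exit=False)
    passed = clean and result.linter.stats.global_note >= 8  # stats is a dict on older pylint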
.pylintrc (new file)
@@ -0,0 +1,50 @@
+[MASTER]
+ignore=CVS
+suggestion-mode=yes
+extension-pkg-allow-list=lxml.etree
+
+[MESSAGES CONTROL]
+disable=missing-function-docstring,
+        missing-class-docstring,
+        missing-module-docstring,
+        wrong-spelling-in-comment,
+
+[REPORTS]
+reports=yes
+score=yes
+
+[SPELLING]
+spelling-dict=en_GB
+spelling-ignore-words=morss
+
+[STRING]
+check-quote-consistency=yes
+check-str-concat-over-line-jumps=yes
+
+[VARIABLES]
+allow-global-unused-variables=no
+init-import=no
+
+[FORMAT]
+expected-line-ending-format=LF
+indent-string='    '
+max-line-length=120
+max-module-lines=1000
+
+[BASIC]
+argument-naming-style=snake_case
+attr-naming-style=snake_case
+class-attribute-naming-style=snake_case
+class-const-naming-style=UPPER_CASE
+class-naming-style=PascalCase
+const-naming-style=UPPER_CASE
+function-naming-style=snake_case
+inlinevar-naming-style=snake_case
+method-naming-style=snake_case
+module-naming-style=snake_case
+variable-naming-style=snake_case
+
+include-naming-hint=yes
+
+bad-names=foo, bar
+good-names=i, j, k
README.md
@@ -1,5 +1,7 @@
 # Morss - Get full-text RSS feeds
 
+[](https://ci.pictuga.com/pictuga/morss)
+
 _GNU AGPLv3 code_
 _Provided logo is CC BY-NC-SA 4.0_
 
morss/__init__.py
@@ -16,5 +16,8 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
+# ran on `import morss`
+
+# pylint: disable=unused-import,unused-variable
 
 from .morss import *
 from .wsgi import application
morss/__main__.py
@@ -20,9 +20,7 @@
 import os
 import sys
 
-from . import wsgi
-from . import cli
-
+from . import cli, wsgi
 from .morss import MorssException
 
 
morss/cache.py (new file)
@@ -0,0 +1,163 @@
+# This file is part of morss
+#
+# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License along
+# with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import pickle
+import time
+import threading
+from collections import OrderedDict
+
+CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
+CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
+
+
+class BaseCache:
+    """ Subclasses must behave like a dict """
+
+    def trim(self):
+        pass
+
+    def autotrim(self, delay=CACHE_LIFESPAN):
+        # trim the cache every so often
+
+        self.trim()
+
+        t = threading.Timer(delay, self.autotrim)
+        t.daemon = True
+        t.start()
+
+    def __contains__(self, url):
+        try:
+            self[url]
+
+        except KeyError:
+            return False
+
+        else:
+            return True
+
+
+try:
+    import sqlite3 # isort:skip
+except ImportError:
+    pass
+
+
+class SQLiteCache(BaseCache):
+    def __init__(self, filename=':memory:'):
+        self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
+
+        with self.con:
+            self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
+            self.con.execute('pragma journal_mode=WAL')
+
+        self.trim()
+
+    def __del__(self):
+        self.con.close()
+
+    def trim(self):
+        with self.con:
+            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
+
+    def __getitem__(self, key):
+        row = self.con.execute('SELECT * FROM data WHERE ky=?', (key,)).fetchone()
+
+        if not row:
+            raise KeyError
+
+        return row[1]
+
+    def __setitem__(self, key, data):
+        with self.con:
+            self.con.execute('INSERT INTO data VALUES (?,?,?) ON CONFLICT(ky) DO UPDATE SET data=?, timestamp=?', (key, data, time.time(), data, time.time()))
+
+
+try:
+    import pymysql.cursors # isort:skip
+except ImportError:
+    pass
+
+
+class MySQLCacheHandler(BaseCache):
+    def __init__(self, user, password, database, host='localhost'):
+        self.user = user
+        self.password = password
+        self.database = database
+        self.host = host
+
+        with self.cursor() as cursor:
+            cursor.execute('CREATE TABLE IF NOT EXISTS data (ky VARCHAR(255) NOT NULL PRIMARY KEY, data MEDIUMBLOB, timestamp INT)')
+
+        self.trim()
+
+    def cursor(self):
+        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
+
+    def trim(self):
+        with self.cursor() as cursor:
+            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
+
+    def __getitem__(self, key):
+        cursor = self.cursor()
+        cursor.execute('SELECT * FROM data WHERE ky=%s', (key,))
+        row = cursor.fetchone()
+
+        if not row:
+            raise KeyError
+
+        return row[1]
+
+    def __setitem__(self, key, data):
+        with self.cursor() as cursor:
+            cursor.execute('INSERT INTO data VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE data=%s, timestamp=%s',
+                (key, data, time.time(), data, time.time()))
+
+
+class CappedDict(OrderedDict, BaseCache):
+    def trim(self):
+        if CACHE_SIZE >= 0:
+            for i in range( max( len(self) - CACHE_SIZE , 0 )):
+                self.popitem(False)
+
+    def __setitem__(self, key, data):
+        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
+        if key in self:
+            del self[key]
+        OrderedDict.__setitem__(self, key, data)
+
+
+if 'CACHE' in os.environ:
+    if os.environ['CACHE'] == 'mysql':
+        default_cache = MySQLCacheHandler(
+            user = os.getenv('MYSQL_USER'),
+            password = os.getenv('MYSQL_PWD'),
+            database = os.getenv('MYSQL_DB'),
+            host = os.getenv('MYSQL_HOST', 'localhost')
+        )
+
+    elif os.environ['CACHE'] == 'sqlite':
+        if 'SQLITE_PATH' in os.environ:
+            path = os.getenv('SQLITE_PATH')
+
+        else:
+            path = ':memory:'
+
+        default_cache = SQLiteCache(path)
+
+else:
+    default_cache = CappedDict()
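Note: every backend behaves like a dict keyed by URL, and autotrim() re-arms itself on a daemon threading.Timer, so a single call keeps trimming every CACHE_LIFESPAN seconds. A minimal usage sketch of the default in-memory backend (key and value are illustrative):

    import os
    os.environ.pop('CACHE', None)  # with CACHE unset, the module falls back to CappedDict

    from morss.cache import default_cache

    default_cache['https://example.com/feed'] = b'pickled cache entry'
    assert 'https://example.com/feed' in default_cache  # via BaseCache.__contains__
    default_cache.autotrim()  # trim now, then again every CACHE_LIFESPAN seconds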
morss/cli.py
@@ -15,12 +15,11 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import sys
-import os.path
 import argparse
+import os.path
+import sys
 
-from .morss import FeedFetch, FeedGather, FeedFormat
-from .morss import Options
+from .morss import FeedFetch, FeedFormat, FeedGather, Options
 
 
 def cli_app():
morss/crawler.py
@@ -16,30 +16,36 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import os
-import sys
-
-import zlib
-from io import BytesIO, StringIO
-import re
-import chardet
-from cgi import parse_header
-import time
-import threading
+import pickle
+import random
+import re
+import sys
+import time
+import zlib
+from cgi import parse_header
+from collections import OrderedDict
+from io import BytesIO, StringIO
+
+import chardet
+
+from .cache import default_cache
 
 try:
     # python 2
-    from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib import quote
-    from urlparse import urlparse, urlunparse
-
     import mimetools
+    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
+                         Request, addinfourl, build_opener, parse_http_list,
+                         parse_keqv_list)
+    from urlparse import urlparse, urlunparse
 except ImportError:
     # python 3
-    from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib.parse import quote
-    from urllib.parse import urlparse, urlunparse
     import email
+    from urllib.parse import quote, urlparse, urlunparse
+    from urllib.request import (BaseHandler, HTTPCookieProcessor,
+                                HTTPRedirectHandler, Request, addinfourl,
+                                build_opener, parse_http_list, parse_keqv_list)
 
 try:
     # python 2
@@ -49,10 +55,6 @@ except NameError:
     basestring = unicode = str
 
 
-CACHE_SIZE = int(os.getenv('CACHE_SIZE', 1000)) # max number of items in cache (default: 1k items)
-CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60)) # how often to auto-clear the cache (default: 1min)
-
-
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
     'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
@@ -131,6 +133,7 @@ def custom_opener(follow=None, delay=None):
     handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
+    handlers.append(HTTPAllRedirectHandler())
     handlers.append(HTTPEquivHandler())
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
@@ -397,6 +400,11 @@ class HTTPEquivHandler(RespStrHandler):
             resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
 
 
+class HTTPAllRedirectHandler(HTTPRedirectHandler):
+    def http_error_308(self, req, fp, code, msg, headers):
+        return self.http_error_301(req, fp, 301, msg, headers)
+
+
 class HTTPRefreshHandler(BaseHandler):
     handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
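Note: urllib's stock HTTPRedirectHandler on the Python versions targeted here stops at 301/302/303/307, so a "308 Permanent Redirect" would otherwise surface as an HTTP error; the subclass funnels 308 through the existing 301 logic. It is registered in custom_opener() above; standalone it would look like this (sketch):

    from urllib.request import build_opener
    from morss.crawler import HTTPAllRedirectHandler  # the class added above

    # a subclass instance replaces the default redirect handler in the opener
    opener = build_opener(HTTPAllRedirectHandler())
    # resp = opener.open('https://example.com/moved')  # a 308 is now followed like a 301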
@@ -447,37 +455,46 @@ class CacheHandler(BaseHandler):
 
     def load(self, url):
        try:
-            out = list(self.cache[url])
+            data = pickle.loads(self.cache[url])
 
         except KeyError:
-            out = [None, None, unicode(), bytes(), 0]
+            data = None
 
-        if sys.version_info[0] >= 3:
-            out[2] = email.message_from_string(out[2] or unicode()) # headers
-        else:
-            out[2] = mimetools.Message(StringIO(out[2] or unicode()))
+        if sys.version_info[0] >= 3:
+            data['headers'] = email.message_from_string(data['headers'] or unicode()) # headers
+        else:
+            data['headers'] = mimetools.Message(StringIO(data['headers'] or unicode()))
 
-        return out
+        return data
 
-    def save(self, url, code, msg, headers, data, timestamp):
-        self.cache[url] = (code, msg, unicode(headers), data, timestamp)
+    def save(self, key, data):
+        data['headers'] = unicode(data['headers'])
+        self.cache[key] = pickle.dumps(data, 0)
 
-    def is_cached(self, url):
-        return self.load(url)[0] is not None
+    def is_cached(self, key):
+        return self.load(key) is not None
 
     def cached_response(self, req):
         # this does NOT check whether it's already cached, use with care
-        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
+        data = self.load(req.get_full_url())
 
         # return the cache as a response
-        resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
-        resp.msg = msg
+        resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
+        resp.msg = data['msg']
 
         return resp
 
     def save_response(self, req, resp):
         data = resp.read()
 
-        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
+        self.save(req.get_full_url(), {
+            'code': resp.code,
+            'msg': resp.msg,
+            'headers': resp.headers,
+            'data': data,
+            'timestamp': time.time()
+            })
 
         fp = BytesIO(data)
         old_resp = resp
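Note: the cache entry changes from a positional 5-tuple to a dict serialised with pickle protocol 0 (ASCII output, so text-oriented backends can store it too), which is what lets cache.py shrink its schema to a single key/data/timestamp table. Roughly what save() now writes (sketch, values illustrative):

    import pickle
    import time

    entry = {
        'code': 200,
        'msg': 'OK',
        'headers': 'etag: "abc123"\n',  # save() flattens headers to a string
        'data': b'<rss></rss>',
        'timestamp': time.time(),
        }

    blob = pickle.dumps(entry, 0)  # protocol 0, as in save()
    assert pickle.loads(blob)['code'] == 200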
@@ -487,13 +504,14 @@ class CacheHandler(BaseHandler):
         return resp
 
     def http_request(self, req):
-        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
+        data = self.load(req.get_full_url())
 
-        if 'etag' in headers:
-            req.add_unredirected_header('If-None-Match', headers['etag'])
+        if data is not None:
+            if 'etag' in data['headers']:
+                req.add_unredirected_header('If-None-Match', data['headers']['etag'])
 
-        if 'last-modified' in headers:
-            req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))
+            if 'last-modified' in data['headers']:
+                req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])
 
         return req
 
@@ -502,33 +520,33 @@ class CacheHandler(BaseHandler):
         # If 'None' is returned, try your chance with the next-available handler
         # If a 'resp' is returned, stop there, and proceed with 'http_response'
 
-        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
+        data = self.load(req.get_full_url())
+
+        if data is None:
+            # cache empty, refresh
+            return None
 
         # some info needed to process everything
-        cache_control = parse_http_list(headers.get('cache-control', ()))
-        cache_control += parse_http_list(headers.get('pragma', ()))
+        cache_control = parse_http_list(data['headers'].get('cache-control', ()))
+        cache_control += parse_http_list(data['headers'].get('pragma', ()))
 
         cc_list = [x for x in cache_control if '=' not in x]
         cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
 
-        cache_age = time.time() - timestamp
+        cache_age = time.time() - data['timestamp']
 
         # list in a simple way what to do when
         if self.force_min == -2:
-            if code is not None:
+            if data['code'] is not None:
                 # already in cache, perfect, use cache
                 return self.cached_response(req)
 
             else:
                 # raise an error, via urllib handlers
-                resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
+                resp = addinfourl(BytesIO(), data['headers'], req.get_full_url(), 409)
                 resp.msg = 'Conflict'
                 return resp
 
-        elif code is None:
-            # cache empty, refresh
-            return None
-
         elif self.force_min == -1:
             # force use cache
             return self.cached_response(req)
@@ -537,7 +555,7 @@ class CacheHandler(BaseHandler):
             # force refresh
             return None
 
-        elif code == 301 and cache_age < 7*24*3600:
+        elif data['code'] == 301 and cache_age < 7*24*3600:
             # "301 Moved Permanently" has to be cached...as long as we want
             # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
             # if you want to bypass this (needed for a proper refresh)
@@ -594,142 +612,6 @@ class CacheHandler(BaseHandler):
     https_response = http_response
 
 
-class BaseCache:
-    """ Subclasses must behave like a dict """
-
-    def trim(self):
-        pass
-
-    def autotrim(self, delay=CACHE_LIFESPAN):
-        # trim the cache every so often
-
-        self.trim()
-
-        t = threading.Timer(delay, self.autotrim)
-        t.daemon = True
-        t.start()
-
-    def __contains__(self, url):
-        try:
-            self[url]
-
-        except KeyError:
-            return False
-
-        else:
-            return True
-
-
-import sqlite3
-
-
-class SQLiteCache(BaseCache):
-    def __init__(self, filename=':memory:'):
-        self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
-
-        with self.con:
-            self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
-            self.con.execute('pragma journal_mode=WAL')
-
-        self.trim()
-
-    def __del__(self):
-        self.con.close()
-
-    def trim(self):
-        with self.con:
-            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, url):
-        row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1:]
-
-    def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
-        value = list(value)
-        value[3] = sqlite3.Binary(value[3]) # data
-        value = tuple(value)
-
-        with self.con:
-            self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', (url,) + value + value)
-
-
-import pymysql.cursors
-
-
-class MySQLCacheHandler(BaseCache):
-    def __init__(self, user, password, database, host='localhost'):
-        self.user = user
-        self.password = password
-        self.database = database
-        self.host = host
-
-        with self.cursor() as cursor:
-            cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
-
-        self.trim()
-
-    def cursor(self):
-        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
-
-    def trim(self):
-        with self.cursor() as cursor:
-            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
-
-    def __getitem__(self, url):
-        cursor = self.cursor()
-        cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
-        row = cursor.fetchone()
-
-        if not row:
-            raise KeyError
-
-        return row[1:]
-
-    def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
-        with self.cursor() as cursor:
-            cursor.execute('INSERT INTO data VALUES (%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
-                (url,) + value + value)
-
-
-class CappedDict(OrderedDict, BaseCache):
-    def trim(self):
-        if CACHE_SIZE >= 0:
-            for i in range( max( len(self) - CACHE_SIZE , 0 )):
-                self.popitem(False)
-
-    def __setitem__(self, key, value):
-        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
-        if key in self:
-            del self[key]
-        OrderedDict.__setitem__(self, key, value)
-
-
-if 'CACHE' in os.environ:
-    if os.environ['CACHE'] == 'mysql':
-        default_cache = MySQLCacheHandler(
-            user = os.getenv('MYSQL_USER'),
-            password = os.getenv('MYSQL_PWD'),
-            database = os.getenv('MYSQL_DB'),
-            host = os.getenv('MYSQL_HOST', 'localhost')
-        )
-
-    elif os.environ['CACHE'] == 'sqlite':
-        if 'SQLITE_PATH' in os.environ:
-            path = os.getenv('SQLITE_PATH')
-
-        else:
-            path = ':memory:'
-
-        default_cache = SQLiteCache(path)
-
-else:
-    default_cache = CappedDict()
-
-
 if 'IGNORE_SSL' in os.environ:
     import ssl
     ssl._create_default_https_context = ssl._create_unverified_context
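Note: with the storage classes moved out, crawler.py keeps only the protocol-side CacheHandler and relies on morss.cache.default_cache for storage. The IGNORE_SSL block that now closes the file is process-wide: overriding ssl._create_default_https_context disables certificate verification for every HTTPS request made by the interpreter, not just those issued by morss.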
morss/feeds.py
@@ -15,35 +15,35 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import sys
 import os.path
+import sys
 
-from datetime import datetime
 sys.path.append('/home/paul/Documents/Code/morss/lib')
 
-import re
-import json
 import csv
-
+import json
+import re
+from copy import deepcopy
+from datetime import datetime
 from fnmatch import fnmatch
 
-from lxml import etree
-from dateutil import tz
 import dateutil.parser
-from copy import deepcopy
-
 import lxml.html
+from dateutil import tz
+from lxml import etree
 
 from .readabilite import parse as html_parse
 
 json.encoder.c_make_encoder = None
 
 try:
     # python 2
-    from StringIO import StringIO
     from ConfigParser import RawConfigParser
+    from StringIO import StringIO
 except ImportError:
     # python 3
-    from io import StringIO
     from configparser import RawConfigParser
+    from io import StringIO
 
 try:
     # python 2
morss/morss.py
@@ -16,30 +16,25 @@
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import os
-
+import re
 import time
 from datetime import datetime
-from dateutil import tz
-
 from fnmatch import fnmatch
-import re
 
 import lxml.etree
 import lxml.html
+from dateutil import tz
 
-from . import feeds
-from . import crawler
-from . import readabilite
-
+from . import crawler, feeds, readabilite
 
 try:
     # python 2
     from httplib import HTTPException
-    from urlparse import urlparse, urljoin, parse_qs
+    from urlparse import parse_qs, urljoin, urlparse
 except ImportError:
     # python 3
     from http.client import HTTPException
-    from urllib.parse import urlparse, urljoin, parse_qs
+    from urllib.parse import parse_qs, urljoin, urlparse
 
 
 MAX_ITEM = int(os.getenv('MAX_ITEM', 5)) # cache-only beyond
morss/readabilite.py
@@ -15,10 +15,11 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
+import re
+
 import lxml.etree
 import lxml.html
 from bs4 import BeautifulSoup
-import re
 
 
 def parse(data, encoding=None):
@@ -352,6 +353,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
 
 if __name__ == '__main__':
     import sys
+
     from . import crawler
 
     req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
morss/wsgi.py
@@ -15,16 +15,16 @@
 # You should have received a copy of the GNU Affero General Public License along
 # with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import sys
+import cgitb
+import mimetypes
 import os.path
 import re
-import lxml.etree
-
-import cgitb
-import wsgiref.util
-import wsgiref.simple_server
+import sys
 import wsgiref.handlers
-import mimetypes
+import wsgiref.simple_server
+import wsgiref.util
+
+import lxml.etree
 
 try:
     # python 2
@@ -33,11 +33,9 @@ except ImportError:
     # python 3
     from urllib.parse import unquote
 
-from . import crawler
-from . import readabilite
-from .morss import FeedFetch, FeedGather, FeedFormat
-from .morss import Options, log, TIMEOUT, DELAY, MorssException
-
+from . import crawler, readabilite
+from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
+                    MorssException, Options, log)
 
 PORT = int(os.getenv('PORT', 8080))
 
setup.py
@@ -1,6 +1,7 @@
-from setuptools import setup
 from glob import glob
 
+from setuptools import setup
+
 package_name = 'morss'
 
 setup(
@@ -12,7 +13,8 @@ setup(
     download_url = 'https://git.pictuga.com/pictuga/morss',
     license = 'AGPL v3',
     packages = [package_name],
-    install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet', 'pymysql'],
+    install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
+    extras_require = {'full': ['pymysql']},
     package_data = {package_name: ['feedify.ini']},
     data_files = [
         ('share/' + package_name, ['README.md', 'LICENSE']),
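Note: demoting pymysql from install_requires to an extra means a plain `pip install .` no longer pulls in a MySQL driver; the MySQL cache backend (CACHE=mysql) now requires opting in with `pip install .[full]`.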