crawler: improve html iter code

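The tag regex now requires whitespace followed by at least one character after the tag name, so tags without attributes are ignored. This also avoids a bug with unclosed tags.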
2022-02-09 15:57:12 +01:00
parent b65272daab
commit e1ed33f320
1 changed file with 1 addition and 1 deletion
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -368,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
 def iter_html_tag(html_str, tag_name):
    " To avoid parsing whole pages when looking for a simple tag "

-    re_tag = r'<%s(\s*[^>])*>' % tag_name
+    re_tag = r'<%s\s+[^>]+>' % tag_name
    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag_match in re.finditer(re_tag, html_str):