crawler: improve html iter code
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
Ignores tags without attributes. Avoids bug with unclosed tags.
master
parent
b65272daab
commit
e1ed33f320
|
@@ -368,7 +368,7 @@ class BrowserlyHeaderHandler(BaseHandler):
|
||||||
def iter_html_tag(html_str, tag_name):
|
def iter_html_tag(html_str, tag_name):
|
||||||
" To avoid parsing whole pages when looking for a simple tag "
|
" To avoid parsing whole pages when looking for a simple tag "
|
||||||
|
|
||||||
re_tag = r'<%s(\s*[^>])*>' % tag_name
|
re_tag = r'<%s\s+[^>]+>' % tag_name
|
||||||
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
|
||||||
|
|
||||||
for tag_match in re.finditer(re_tag, html_str):
|
for tag_match in re.finditer(re_tag, html_str):
|
||||||
|
|
Loading…
Reference in New Issue