@@ -22,14 +22,25 @@ def http_content_type_encoding(content_type):
2222
2323# regexp for parsing HTTP meta tags
2424_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
25+ _SKIP_ATTRS = '''(?x)(?:\\s+
26+ [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
27+ (?:\\s*=\\s*
28+ (?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
29+ '[^']*' # attr in '
30+ |
31+ "[^"]*" # attr in "
32+ |
33+ [^'"\\s]+ # attr having no ' nor "
34+ ))?
35+ )*?'''
2536_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
2637_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
2738_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
2839_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
2940
3041# check for meta tags, or xml decl. and stop search if a body tag is encountered
31- _BODY_ENCODING_PATTERN = r'<\s*(?:meta(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % \
32- ( _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
42+ _BODY_ENCODING_PATTERN = r'<\s*(?:meta%s (?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
43+ _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
3344_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
3445_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)
3546
0 commit comments