diff --git a/LICENSE b/LICENSE
index 7b8d18c..f5b1fe7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,5 @@
 Copyright (c) 2010, Daniel Ryan
+Copyright (c) 2017, Glyph
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..5e40900
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[wheel]
+universal = 1
diff --git a/setup.py b/setup.py
index fcdabb2..d745284 100644
--- a/setup.py
+++ b/setup.py
@@ -1,20 +1,23 @@
 from setuptools import setup, find_packages
- 
+
 setup(
-    name='twitter-text-py',
-    version='2.0.2',
-    description='A library for auto-converting URLs, mentions, hashtags, lists, etc. in Twitter text. Also does tweet validation and search term highlighting.',
-    author='Daniel Ryan',
-    author_email='dryan@dryan.com',
-    url='http://github.com/dryan/twitter-text-py',
+    name='twitter-text',
+    version='3.0',
+    description='A library for auto-converting URLs, mentions, hashtags, lists, etc. in Twitter text. Also does tweet validation and search term highlighting. Fork of twitter-text-py that supports Python 3. Originally by Daniel Ryan; Py3 port by Glyph.',
+    author='Glyph',
+    author_email='twitter-text-587601@glyph.im',
+    url='http://github.com/glyph/twitter-text-py',
     packages=find_packages(),
     classifiers=[
         'Development Status :: 5 - Production/Stable',
         'Environment :: Web Environment',
+        'Environment :: Console',
         'Intended Audience :: Developers',
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 3',
         'Framework :: Django',
     ],
     include_package_data=True,
diff --git a/tests.py b/tests.py
index 891b35e..d73a49a 100644
--- a/tests.py
+++ b/tests.py
@@ -2,6 +2,10 @@
 import twitter_text, sys, os, json, argparse, re
 from twitter_text.unicode import force_unicode
+try:
+    unichr
+except NameError:
+    unichr = chr
 
 narrow_build = True
 try:
@@ -34,25 +38,35 @@ def construct_yaml_str(self, node):
     raise Exception('You need to install BeautifulSoup to run the tests')
 
 def success(text):
-    return (u'\033[92m%s\033[0m\n' % text).encode('utf-8')
+    return (u'\033[92m%s\033[0m\n' % text)
 
 def error(text):
-    return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
+    return (u'\033[91m%s\033[0m\n' % text)
 
 attempted = 0
+passed = 0
+failed = 0
 
 def assert_equal_without_attribute_order(result, test, failure_message = None):
     global attempted
     attempted += 1
     # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
-    assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description')))
+    actual = BeautifulSoup(result)
+    expected = BeautifulSoup(test.get('expected'))
+    assert actual == expected, error(u'Test %d Failed: %s (%s != %s)' % (attempted, test.get('description'),
+                                                                        actual, expected))
     sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
     sys.stdout.flush()
 
 def assert_equal(result, test):
     global attempted
     attempted += 1
-    assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else ''))
+    expected = test.get('expected')
+    assert result == expected, error(u'\nTest %d Failed: %s%s (%s != %s)' % (
+        attempted, test.get('description'),
+        u'\n%s' % test.get('hits') if test.get('hits') else '',
+        result, expected
+    ))
     sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
     sys.stdout.flush()
@@ -72,27 +86,33 @@ def assert_equal(result, test):
             sys.stdout.write('Skipping: %s\n' % test.get('description'))
             sys.stdout.flush()
             continue
-        extractor = twitter_text.extractor.Extractor(test.get('text'))
-        if section == 'mentions':
-            assert_equal(extractor.extract_mentioned_screen_names(), test)
-        elif section == 'mentions_with_indices':
-            assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test)
-        elif section == 'mentions_or_lists_with_indices':
-            assert_equal(extractor.extract_mentions_or_lists_with_indices(), test)
-        elif section == 'replies':
-            assert_equal(extractor.extract_reply_screen_name(), test)
-        elif section == 'urls':
-            assert_equal(extractor.extract_urls(), test)
-        elif section == 'urls_with_indices':
-            assert_equal(extractor.extract_urls_with_indices(), test)
-        elif section == 'hashtags':
-            assert_equal(extractor.extract_hashtags(), test)
-        elif section == 'cashtags':
-            assert_equal(extractor.extract_cashtags(), test)
-        elif section == 'hashtags_with_indices':
-            assert_equal(extractor.extract_hashtags_with_indices(), test)
-        elif section == 'cashtags_with_indices':
-            assert_equal(extractor.extract_cashtags_with_indices(), test)
+        try:
+            extractor = twitter_text.extractor.Extractor(test.get('text'))
+            if section == 'mentions':
+                assert_equal(extractor.extract_mentioned_screen_names(), test)
+            elif section == 'mentions_with_indices':
+                assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test)
+            elif section == 'mentions_or_lists_with_indices':
+                assert_equal(extractor.extract_mentions_or_lists_with_indices(), test)
+            elif section == 'replies':
+                assert_equal(extractor.extract_reply_screen_name(), test)
+            elif section == 'urls':
+                assert_equal(extractor.extract_urls(), test)
+            elif section == 'urls_with_indices':
+                assert_equal(extractor.extract_urls_with_indices(), test)
+            elif section == 'hashtags':
+                assert_equal(extractor.extract_hashtags(), test)
+            elif section == 'cashtags':
+                assert_equal(extractor.extract_cashtags(), test)
+            elif section == 'hashtags_with_indices':
+                assert_equal(extractor.extract_hashtags_with_indices(), test)
+            elif section == 'cashtags_with_indices':
+                assert_equal(extractor.extract_cashtags_with_indices(), test)
+        except AssertionError as ae:
+            print(ae.args[0])
+            failed += 1
+        else:
+            passed += 1
 
 # autolink section
 autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r')
@@ -112,20 +132,26 @@ def assert_equal(result, test):
         sys.stdout.flush()
         continue
     autolink = twitter_text.autolink.Autolink(test.get('text'))
-    if section == 'usernames':
-        assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
-    elif section == 'cashtags':
-        assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test)
-    elif section == 'urls':
-        assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test)
-    elif section == 'hashtags':
-        assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test)
-    elif section == 'all':
-        assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test)
-    elif section == 'lists':
-        assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
-    elif section == 'json':
-        assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)
+    try:
+        if section == 'usernames':
+            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
+        elif section == 'cashtags':
+            assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test)
+        elif section == 'urls':
+            assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test)
+        elif section == 'hashtags':
+            assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test)
+        elif section == 'all':
+            assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test)
+        elif section == 'lists':
+            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
+        elif section == 'json':
+            assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)
+    except AssertionError as ae:
+        print(ae.args[0])
+        failed += 1
+    else:
+        passed += 1
 
 # hit_highlighting section
 hit_highlighting_file = open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r')
@@ -150,7 +176,10 @@ def assert_equal(result, test):
 try:
     validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r')
     validate_file_contents = validate_file.read()
-    validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape')))
+    validate_tests = yaml.load(
+        re.sub(u'\\\\n', '\n',
+               validate_file_contents.encode('unicode-escape').decode("ascii"))
+    )
     validate_file.close()
 except ValueError:
     sys.stdout.write('\nValidation tests were skipped because of wide character issues\n')
@@ -164,17 +193,24 @@ def assert_equal(result, test):
     sys.stdout.write('\nTesting Validation: %s\n' % section)
     for test in validate_tests.get('tests').get(section):
         validator = twitter_text.validation.Validation(test.get('text'))
-        if section == 'tweets':
-            assert_equal(not validator.tweet_invalid(), test)
-        elif section == 'usernames':
-            assert_equal(validator.valid_username(), test)
-        elif section == 'lists':
-            assert_equal(validator.valid_list(), test)
-        elif section == 'hashtags':
-            assert_equal(validator.valid_hashtag(), test)
-        elif section == 'urls':
-            assert_equal(validator.valid_url(), test)
-
-sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
+        try:
+            if section == 'tweets':
+                assert_equal(not validator.tweet_invalid(), test)
+            elif section == 'usernames':
+                assert_equal(validator.valid_username(), test)
+            elif section == 'lists':
+                assert_equal(validator.valid_list(), test)
+            elif section == 'hashtags':
+                assert_equal(validator.valid_hashtag(), test)
+            elif section == 'urls':
+                assert_equal(validator.valid_url(), test)
+        except AssertionError as ae:
+            print(ae.args[0])
+            failed += 1
+        else:
+            passed += 1
+
+sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed%s.\033[0m\n' %
+                 (passed, (error(", %d failed" % failed) if failed else "")))
 sys.stdout.flush()
-sys.exit(os.EX_OK)
\ No newline at end of file
+sys.exit(os.EX_OK if not failed else os.EX_SOFTWARE)
diff --git a/twitter-text-conformance b/twitter-text-conformance
index 9b58c44..c17c243 160000
--- a/twitter-text-conformance
+++ b/twitter-text-conformance
@@ -1 +1 @@
-Subproject commit 9b58c44302c4ab5bab261f6cfaf6ca89b5a6cf35
+Subproject commit c17c243ca8a78953e2c2d4bf681ae9557925b0c6
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 821d042..fb5b2eb 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -1,6 +1,6 @@
 # encoding=utf-8
 
-import re, cgi
+import re
 
 from twitter_text.regex import REGEXEN
 from twitter_text.unicode import force_unicode
@@ -113,7 +113,7 @@ def auto_link_entities(self, entities = [], options = {}):
             return self.text
 
         # NOTE deprecate these attributes not options keys in options hash, then use html_attrs
-        options = dict(DEFAULT_OPTIONS.items() + options.items())
+        options = dict(list(DEFAULT_OPTIONS.items()) + list(options.items()))
         options['html_attrs'] = self._extract_html_attrs_from_options(options)
         if not options.get('suppress_no_follow', False):
             options['html_attrs']['rel'] = "nofollow"
@@ -302,16 +302,16 @@ def _link_url_with_entity(self, entity, options = {}):
         For those URLs, display_url is not a substring of expanded_url, so we don't
         do anything special to render the elided parts. For a pic.twitter.com URL, the
         only elided part will be the "https://", so this is fine.
         """
-        display_url = entity.get('display_url').decode('utf-8')
+        display_url = entity.get('display_url')
         expanded_url = entity.get('expanded_url')
         invisible_tag_attrs = options.get('invisible_tag_attrs', DEFAULT_INVISIBLE_TAG_ATTRS)
 
-        display_url_sans_ellipses = re.sub(ur'…', u'', display_url)
+        display_url_sans_ellipses = re.sub(u'…', u'', display_url)
 
         if expanded_url.find(display_url_sans_ellipses) > -1:
             before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2)
-            preceding_ellipsis = re.search(ur'\A…', display_url)
-            following_ellipsis = re.search(ur'…\z', display_url)
+            preceding_ellipsis = re.search(u'\\A…', display_url)
+            following_ellipsis = re.search(u'…\\Z', display_url)
 
             if preceding_ellipsis is not None:
                 preceding_ellipsis = preceding_ellipsis.group()
             else:
@@ -431,4 +431,4 @@ def _tag_attrs(self, attributes = {}):
                 value = u' '.join(value)
             attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))
 
-        return u' '.join(attrs)
\ No newline at end of file
+        return u' '.join(attrs)
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index ec128ca..b5e1c90 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -1,9 +1,11 @@
 # encoding=utf-8
 
 import re
-from HTMLParser import HTMLParser
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
 
-from twitter_text.regex import UNICODE_SPACES
 from twitter_text.unicode import force_unicode
 
 DEFAULT_HIGHLIGHT_TAG = 'em'
@@ -34,7 +36,7 @@ def hit_highlight(self, hits = [], **kwargs):
 
         if not hits and kwargs.get('query'):
             stripped_text = strip_tags(self.text)
-            for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text):
+            for match in re.finditer(u'%s' % kwargs.get('query'), stripped_text):
                 hits.append(match.span())
 
         if hits and not type(hits) == list:
@@ -58,7 +60,7 @@ def hit_highlight(self, hits = [], **kwargs):
             if index % 2:
                 # we're inside a <tag>
                 continue
-            chunk_start = len(u''.join(text_chunks[0:index / 2]))
+            chunk_start = len(u''.join(text_chunks[0:index // 2]))
             chunk_end = chunk_start + len(chunk)
             if hit_start >= chunk_start and hit_start < chunk_end:
                 chunk = chunk[:hit_start - chunk_start] + tags[0] + chunk[hit_start - chunk_start:]
@@ -80,4 +82,4 @@ def hit_highlight(self, hits = [], **kwargs):
             else:
                 result.append(chunk)
         self.text = u''.join(result)
-        return self.text
\ No newline at end of file
+        return self.text
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index c136f80..b4adcc2 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -4,7 +4,12 @@
 # list is frozen at load time to ensure immutability. These reular expressions are
 # used throughout the Twitter classes. Special care has been taken to make
 # sure these reular expressions work with Tweets in all languages.
-import re, string
+import re
+try:
+    unichr
+except NameError:
+    unichr = chr
+from functools import reduce
 
 REGEXEN = {} # :nodoc:
@@ -19,13 +24,13 @@ def regex_range(start, end = None):
 # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
 UNICODE_SPACES = []
 for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
-    range(0x0009, 0x000D), # White_Space # Cc [5] <control-0009>..<control-000D>
+    list(range(0x0009, 0x000D)), # White_Space # Cc [5] <control-0009>..<control-000D>
     0x0020, # White_Space # Zs SPACE
     0x0085, # White_Space # Cc <control-0085>
     0x00A0, # White_Space # Zs NO-BREAK SPACE
     0x1680, # White_Space # Zs OGHAM SPACE MARK
     0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
-    range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
+    list(range(0x2000, 0x200A)), # White_Space # Zs [11] EN QUAD..HAIR SPACE
     0x2028, # White_Space # Zl LINE SEPARATOR
     0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
     0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
@@ -33,7 +38,7 @@ def regex_range(start, end = None):
     0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
 ]):
     UNICODE_SPACES.append(unichr(space))
-REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
+REGEXEN['spaces'] = re.compile(u''.join(UNICODE_SPACES))
 
 # Characters not allowed in Tweets
 INVALID_CHARACTERS = [
@@ -43,7 +48,7 @@ def regex_range(start, end = None):
 ]
 REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
 
-REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
+REGEXEN['list_name'] = re.compile(u'^[a-zA-Z][a-zA-Z0-9_\\-\\u0080-\\u00ff]{0,24}$')
 
 # Latin accented characters
 # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
@@ -67,7 +72,7 @@ def regex_range(start, end = None):
     regex_range(0x0300, 0x036f),
     regex_range(0x1e00, 0x1eff),
 ]
-REGEXEN['latin_accents'] = re.compile(ur''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
+REGEXEN['latin_accents'] = re.compile(u''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
 LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
 
 RTL_CHARACTERS = ''.join([
@@ -147,69 +152,69 @@ def regex_range(start, end = None):
     # this is a narrow python build so these extended Kanji characters won't work
     pass
 
-PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
-SPACE_CHARS = ur" \t\n\x0B\f\r"
-CTRL_CHARS = ur"\x00-\x1F\x7F"
+PUNCTUATION_CHARS = u'!"#$%&\'()*+,-./:;<=>?@\\[\\]^_\\`{|}~'
+SPACE_CHARS = u" \\t\\n\\x0B\\f\\r"
+CTRL_CHARS = u"\\x00-\\x1F\\x7F"
 
 # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHA = u'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHANUMERIC = u'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_BOUNDARY = u'\\A|\\Z|\\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
 
-HASHTAG = re.compile(ur'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
+HASHTAG = re.compile(u'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
 
 REGEXEN['valid_hashtag'] = HASHTAG
-REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)
-REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
+REGEXEN['end_hashtag_match'] = re.compile(u'\\A(?:[#＃]|:\\/\\/)', re.IGNORECASE | re.UNICODE)
+REGEXEN['numeric_only'] = re.compile(u'^[\d]+$')
 
-REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)')
-REGEXEN['at_signs'] = re.compile(ur'[@＠]')
+REGEXEN['valid_mention_preceding_chars'] = re.compile(u'(?:[^a-zA-Z0-9_!#\\$%&*@＠]|^|RT:?)')
+REGEXEN['at_signs'] = re.compile(u'[@＠]')
 REGEXEN['valid_mention_or_list'] = re.compile(
-    ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character
-    ur'(%s)' % REGEXEN['at_signs'].pattern + # at mark
-    ur'([a-zA-Z0-9_]{1,20})' + # screen name
-    ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
+    u'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern + # preceding character
+    u'(%s)' % REGEXEN['at_signs'].pattern + # at mark
+    u'([a-zA-Z0-9_]{1,20})' + # screen name
+    u'(\\/[a-zA-Z][a-zA-Z0-9_\\-]{0,24})?' # list (optional)
 )
-REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_reply'] = re.compile(u'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
 # Used in Extractor for final filtering
-REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['end_mention_match'] = re.compile(u'\\A(?:%s|[%s]|:\\/\\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
 
 # URL related hash regex collection
-REGEXEN['valid_url_preceding_chars'] = re.compile(ur'(?:[^A-Z0-9@＠$#＃%s]|^)' % ur''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
-REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
-DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
-REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_preceding_chars'] = re.compile(u'(?:[^A-Z0-9@＠$#＃%s]|^)' % u''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(u'[-_.\\/]$')
+DOMAIN_VALID_CHARS = u'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, u''.join(REGEXEN['invalid_control_characters']), u''.join(UNICODE_SPACES))
+REGEXEN['valid_subdomain'] = re.compile(u'(?:(?:%s(?:[_-]|%s)*)?%s\\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain_name'] = re.compile(u'(?:(?:%s(?:[-]|%s)*)?%s\\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_gTLD'] = re.compile(u'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ccTLD'] = re.compile(u'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_punycode'] = re.compile(u'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain'] = re.compile(u'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor
-REGEXEN['valid_ascii_domain'] = re.compile(ur'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ascii_domain'] = re.compile(u'(?:(?:[A-Za-z0-9\\-_]|[%s])+\\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor for stricter t.co URL extraction
-REGEXEN['valid_tco_url'] = re.compile(ur'^https?:\/\/t\.co\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_tco_url'] = re.compile(u'^https?:\\/\\/t\\.co\\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor to filter out unwanted URLs.
-REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_short_domain'] = re.compile(u'\\A%s%s\\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
+REGEXEN['valid_port_number'] = re.compile(u'[0-9]+')
 
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_general_url_path_chars'] = re.compile(u"[a-z0-9!\\*';:=\\+\\,\\.\\$\\/%%#\\[\\]\\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
 # Allow URL paths to contain balanced parens
 # 1. Used in Wikipedia URLs like /Primer_(film)
 # 2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_balanced_parens'] = re.compile(u'\\(%s+\\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
 # Valid end-of-path chracters (so /foo. does not gobble the period).
 # 1. Allow =&# for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(u'[a-z0-9=_#\\/\\+\\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path'] = re.compile(u'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url'] = re.compile(ur'((%s)((https?:\/\/)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
+REGEXEN['valid_url_query_chars'] = re.compile(u"[a-z0-9!?\\*'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~|@]", re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_query_ending_chars'] = re.compile(u'[a-z0-9_&=#\\/]', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url'] = re.compile(u'((%s)((https?:\\/\\/)?(%s)(?::(%s))?(/%s*)?(\\?%s*%s)?))' % (
     REGEXEN['valid_url_preceding_chars'].pattern,
     REGEXEN['valid_domain'].pattern,
     REGEXEN['valid_port_number'].pattern,
@@ -227,54 +232,54 @@ def regex_range(start, end = None):
 # $7 URL Path and anchor
 # $8 Query String
 
-REGEXEN['cashtag'] = re.compile(ur'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
-REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
+REGEXEN['cashtag'] = re.compile(u'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
+REGEXEN['valid_cashtag'] = re.compile(u'(^|[%s])(\\$|＄|﹩)(%s)(?=$|\\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
 
 # These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(u'[a-z0-9\\-._~]', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pct_encoded'] = re.compile(u'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_sub_delims'] = re.compile(u"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pchar'] = re.compile(u'(?:%s|%s|%s|[:\\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_scheme'] = re.compile(ur'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_userinfo'] = re.compile(ur'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_scheme'] = re.compile(u'(?:[a-z][a-z0-9+\\-.]*)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_userinfo'] = re.compile(u'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_dec_octet'] = re.compile(ur'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_ipv4'] = re.compile(ur'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_dec_octet'] = re.compile(u'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv4'] = re.compile(u'(?:%s(?:\\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
 
 # Punting on real IPv6 validation for now
-REGEXEN['validate_url_ipv6'] = re.compile(ur'(?:\[[a-f0-9:\.]+\])', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv6'] = re.compile(u'(?:\\[[a-f0-9:\\.]+\\])', re.IGNORECASE | re.UNICODE)
 
 # Also punting on IPvFuture for now
-REGEXEN['validate_url_ip'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ip'] = re.compile(u'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is more strict than the rfc specifies
-REGEXEN['validate_url_subdomain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_tld'] = re.compile(ur'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_subdomain_segment'] = re.compile(u'(?:[a-z0-9](?:[a-z0-9_\\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_segment'] = re.compile(u'(?:[a-z0-9](?:[a-z0-9\\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_tld'] = re.compile(u'(?:[a-z](?:[a-z0-9\\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain'] = re.compile(u'(?:(?:%s\\.)*(?:%s\\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_host'] = re.compile(u'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
 
 # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
-REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_tld'] = re.compile(ur'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(u'(?:(?:[a-z0-9]|[^\\x00-\\x7f])(?:(?:[a-z0-9_\\-]|[^\\x00-\\x7f])*(?:[a-z0-9]|[^\\x00-\\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_segment'] = re.compile(u'(?:(?:[a-z0-9]|[^\\x00-\\x7f])(?:(?:[a-z0-9\\-]|[^\\x00-\\x7f])*(?:[a-z0-9]|[^\\x00-\\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_tld'] = re.compile(u'(?:(?:[a-z]|[^\\x00-\\x7f])(?:(?:[a-z0-9\\-]|[^\\x00-\\x7f])*(?:[a-z0-9]|[^\\x00-\\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain'] = re.compile(u'(?:(?:%s\\.)*(?:%s\\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_unicode_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_host'] = re.compile(u'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_port'] = re.compile(ur'[0-9]{1,5}')
+REGEXEN['validate_url_port'] = re.compile(u'[0-9]{1,5}')
 
-REGEXEN['validate_url_unicode_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_authority'] = re.compile(u'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_authority'] = re.compile(u'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_path'] = re.compile(ur'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_query'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_fragment'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_path'] = re.compile(u'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_query'] = re.compile(u'(%s|/|\\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_fragment'] = re.compile(u'(%s|/|\\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
 
 # Modified version of RFC 3986 Appendix B
-REGEXEN['validate_url_unencoded'] = re.compile(ur'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:\#(.*))?\Z', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unencoded'] = re.compile(u'\\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\\?([^#]*))?(?:\\#(.*))?\\Z', re.IGNORECASE | re.UNICODE)
 
-REGEXEN['rtl_chars'] = re.compile(ur'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
+REGEXEN['rtl_chars'] = re.compile(u'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..aa52f0d 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,5 +1,11 @@
 import types, datetime
 from decimal import Decimal
+try:
+    long
+except NameError:
+    long = int
+    unicode = str
+    basestring = (bytes, str)
 
 # borrowed from django.utils.encoding
 class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
@@ -57,7 +63,7 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
             # errors), so that if s is a SafeString, it ends up being a
             # SafeUnicode at the end.
             s = s.decode(encoding, errors)
-    except UnicodeDecodeError, e:
+    except UnicodeDecodeError as e:
         if not isinstance(s, Exception):
             raise TwitterTextUnicodeDecodeError(s, *e.args)
         else:
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..ee6af4b 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -75,7 +75,7 @@ def tweet_invalid(self):
         if self.tweet_length() > MAX_LENGTH:
             valid, validation_error = False, 'Too long'
 
-        if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
+        if re.search(u''.join(REGEXEN['invalid_control_characters']), self.text):
             valid, validation_error = False, 'Invalid characters'
 
         if self.parent and hasattr(self.parent, 'tweet_is_valid'):
@@ -97,7 +97,7 @@ def valid_username(self):
         return len(extracted) == 1 and extracted[0] == self.text[1:]
 
     def valid_list(self):
-        match = re.compile(ur'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
+        match = re.compile(u'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
         return bool(match is not None and match.groups()[0] == "" and match.groups()[3])
 
     def valid_hashtag(self):
@@ -124,7 +124,7 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
                 not require_protocol
                 or (
                     self._valid_match(scheme, REGEXEN['validate_url_scheme'])
-                    and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+                    and re.compile(u'^https?$', re.IGNORECASE).match(scheme)
                 )
             )
             and (