From 45191e43d11fd95876157a4432109cc558253202 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Tue, 1 Mar 2016 16:59:12 -0800 Subject: [PATCH 01/30] PEP8 cleanup --- setup.py | 4 +- tests.py | 42 +++++--- twitter_text/__init__.py | 34 ++++--- twitter_text/autolink.py | 72 +++++++------- twitter_text/extractor.py | 59 ++++++------ twitter_text/highlighter.py | 14 ++- twitter_text/regex.py | 123 +++++++++++++----------- twitter_text/templatetags/twitterize.py | 11 ++- twitter_text/unicode.py | 16 +-- twitter_text/validation.py | 65 ++++++------- 10 files changed, 241 insertions(+), 199 deletions(-) diff --git a/setup.py b/setup.py index fcdabb2..bb27c76 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ from setuptools import setup, find_packages - + setup( name='twitter-text-py', version='2.0.2', @@ -19,5 +19,5 @@ ], include_package_data=True, install_requires=['setuptools'], - license = "BSD" + license="BSD" ) diff --git a/tests.py b/tests.py index 891b35e..dd896c5 100644 --- a/tests.py +++ b/tests.py @@ -1,8 +1,22 @@ # encoding=utf-8 -import twitter_text, sys, os, json, argparse, re +import argparse +import json +import os +import re +import sys +import twitter_text + from twitter_text.unicode import force_unicode +try: + import yaml +except ImportError: + raise Exception('You need to install pyaml to run the tests') +# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects +from yaml import Loader, SafeLoader + + narrow_build = True try: unichr(0x20000) @@ -10,21 +24,18 @@ except: pass -parser = argparse.ArgumentParser(description = u'Run the integration tests for twitter_text') -parser.add_argument('--ignore-narrow-errors', '-i', help = u'Ignore errors caused by narrow builds', default = False, action = 'store_true') + +parser = argparse.ArgumentParser(description=u'Run the integration tests for twitter_text') +parser.add_argument('--ignore-narrow-errors', '-i', help=u'Ignore errors caused by narrow builds', default=False, action='store_true') args = parser.parse_args() -try: - import yaml -except ImportError: - raise Exception('You need to install pyaml to run the tests') -# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects -from yaml import Loader, SafeLoader + def construct_yaml_str(self, node): return self.construct_scalar(node) Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) + try: from bs4 import BeautifulSoup except ImportError: @@ -33,15 +44,19 @@ def construct_yaml_str(self, node): except ImportError: raise Exception('You need to install BeautifulSoup to run the tests') + def success(text): return (u'\033[92m%s\033[0m\n' % text).encode('utf-8') + def error(text): return (u'\033[91m%s\033[0m\n' % text).encode('utf-8') + attempted = 0 -def assert_equal_without_attribute_order(result, test, failure_message = None): + +def assert_equal_without_attribute_order(result, test, failure_message=None): global attempted attempted += 1 # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through @@ -49,6 +64,7 @@ def assert_equal_without_attribute_order(result, test, failure_message = None): sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description')))) sys.stdout.flush() + def assert_equal(result, test): global attempted attempted += 1 @@ -140,9 +156,9 @@ def assert_equal(result, test): for test in hit_highlighting_tests.get('tests').get(section): hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text')) if section == 'plain_text': - assert_equal(hit_highlighter.hit_highlight(hits = test.get('hits')), test) + assert_equal(hit_highlighter.hit_highlight(hits=test.get('hits')), test) elif section == 'with_links': - assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits = test.get('hits')), test) + assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits=test.get('hits')), test) # validation section validation_tested = False @@ -177,4 +193,4 @@ def assert_equal(result, test): sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted) sys.stdout.flush() -sys.exit(os.EX_OK) \ No newline at end of file +sys.exit(os.EX_OK) diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py index bb06120..6f17ac2 100644 --- a/twitter_text/__init__.py +++ b/twitter_text/__init__.py @@ -6,33 +6,39 @@ from twitter_text.validation import Validation from twitter_text.unicode import force_unicode + class TwitterText(object): def __init__(self, text): - self.text = force_unicode(text) # this will get modified by some functions - self.original_text = self.text # this never changes; use it as a fallback or for comparison + # this will get modified by some functions + self.text = force_unicode(text) + # this never changes; use it as a fallback or for comparison + self.original_text = self.text self.has_been_linked = False - self.tweet_length = None # gets changed by validation method - self.tweet_is_valid = None # gets changed by validation method - self.validation_error = None # gets changed by validation method - + # gets changed by validation method + self.tweet_length = None + # gets changed by validation method + self.tweet_is_valid = None + # gets changed by validation method + self.validation_error = None + def __unicode__(self): return self.text - + def __repr__(self): return self.__unicode__() - + @property def autolink(self): - return Autolink(self.text, parent = self) - + return Autolink(self.text, parent=self) + @property def extractor(self): return Extractor(self.text) - + @property def highlighter(self): - return HitHighlighter(self.text, parent = self) - + return HitHighlighter(self.text, parent=self) + @property def validation(self): - return Validation(self.text, parent = self) \ No newline at end of file + return Validation(self.text, parent=self) diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py index 821d042..85e21aa 100644 --- a/twitter_text/autolink.py +++ b/twitter_text/autolink.py @@ -1,6 +1,6 @@ # encoding=utf-8 -import re, cgi +import re from twitter_text.regex import REGEXEN from twitter_text.unicode import force_unicode @@ -28,17 +28,17 @@ DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'" DEFAULT_OPTIONS = { - 'list_class': DEFAULT_LIST_CLASS, - 'username_class': DEFAULT_USERNAME_CLASS, - 'hashtag_class': DEFAULT_HASHTAG_CLASS, - 'cashtag_class': DEFAULT_CASHTAG_CLASS, + 'list_class': DEFAULT_LIST_CLASS, + 'username_class': DEFAULT_USERNAME_CLASS, + 'hashtag_class': DEFAULT_HASHTAG_CLASS, + 'cashtag_class': DEFAULT_CASHTAG_CLASS, - 'username_url_base': DEFAULT_USERNAME_URL_BASE, - 'list_url_base': DEFAULT_LIST_URL_BASE, - 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE, - 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE, + 'username_url_base': DEFAULT_USERNAME_URL_BASE, + 'list_url_base': DEFAULT_LIST_URL_BASE, + 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE, + 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE, - 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS, + 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS, } OPTIONS_NOT_ATTRIBUTES = ( @@ -69,30 +69,32 @@ ) HTML_ENTITIES = { - '&': '&', - '>': '>', - '<': '<', - '"': '"', - "'": ''', + '&': '&', + '>': '>', + '<': '<', + '"': '"', + "'": ''', } BOOLEAN_ATTRIBUTES = ( - 'disabled', + 'disabled', 'readonly', 'multiple', 'checked', ) + def default_transform(entity, text): return text + class Autolink(object): def __init__(self, text, **kwargs): self.text = force_unicode(text) self.parent = kwargs.get('parent', False) self.extractor = Extractor(self.text) - def auto_link_with_json(self, json_obj, options = {}): + def auto_link_with_json(self, json_obj, options={}): # concantenate entities entities = [] if 'entities' in json_obj: @@ -108,7 +110,7 @@ def auto_link_with_json(self, json_obj, options = {}): return self.auto_link_entities(entities, options) - def auto_link_entities(self, entities = [], options = {}): + def auto_link_entities(self, entities=[], options={}): if not self.text: return self.text @@ -118,7 +120,7 @@ def auto_link_entities(self, entities = [], options = {}): if not options.get('suppress_no_follow', False): options['html_attrs']['rel'] = "nofollow" - entities.sort(key = lambda entity: entity['indices'][0], reverse = True) + entities.sort(key=lambda entity: entity['indices'][0], reverse=True) chars = self.text for entity in entities: @@ -133,7 +135,7 @@ def auto_link_entities(self, entities = [], options = {}): return chars - def auto_link(self, options = {}): + def auto_link(self, options={}): """ Add tags around the usernames, lists, hashtags and URLs in the provided text. The tags can be controlled with the following entries in the options hash. @@ -161,7 +163,7 @@ def auto_link(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_entities_with_indices({'extract_url_without_protocol': False}), options) - def auto_link_usernames_or_lists(self, options = {}): + def auto_link_usernames_or_lists(self, options={}): """ Add tags around the usernames and lists in the provided text. The tags can be controlled with the following entries in the options hash. @@ -182,7 +184,7 @@ def auto_link_usernames_or_lists(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_mentions_or_lists_with_indices(), options) - def auto_link_hashtags(self, options = {}): + def auto_link_hashtags(self, options={}): """ Add tags around the hashtags in the provided text. The tags can be controlled with the following entries in the options hash. @@ -199,7 +201,7 @@ def auto_link_hashtags(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_hashtags_with_indices(), options) - def auto_link_cashtags(self, options = {}): + def auto_link_cashtags(self, options={}): """ Add tags around the cashtags in the provided text. The tags can be controlled with the following entries in the options hash. @@ -216,7 +218,7 @@ def auto_link_cashtags(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_cashtags_with_indices(), options) - def auto_link_urls(self, options = {}): + def auto_link_urls(self, options={}): """ Add tags around the URLs in the provided text. The tags can be controlled with the following entries in the options hash. @@ -240,13 +242,13 @@ def _html_escape(self, text): text = text.replace(char, HTML_ENTITIES[char]) return text - def _extract_html_attrs_from_options(self, options = {}): + def _extract_html_attrs_from_options(self, options={}): html_attrs = options.get('html_attrs', {}) options = options.copy() if 'html_attrs' in options: del(options['html_attrs']) for option in options.keys(): - if not option in OPTIONS_NOT_ATTRIBUTES: + if option not in OPTIONS_NOT_ATTRIBUTES: html_attrs[option] = options[option] return html_attrs @@ -256,7 +258,7 @@ def _url_entities_hash(self, url_entities): entities[entity.get('url')] = entity return entities - def _link_to_url(self, entity, chars, options = {}): + def _link_to_url(self, entity, chars, options={}): url = entity.get('url') href = options.get('link_url_transform', lambda x: x)(url) @@ -284,7 +286,7 @@ def _link_to_url(self, entity, chars, options = {}): link = self._link_to_text(entity, link_text, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_url_with_entity(self, entity, options = {}): + def _link_url_with_entity(self, entity, options={}): """ Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste should contain the full original URL (expanded_url), not the display URL. @@ -348,7 +350,7 @@ def _link_url_with_entity(self, entity, options = {}): else: return self._html_escape(display_url) - def _link_to_hashtag(self, entity, chars, options = {}): + def _link_to_hashtag(self, entity, chars, options={}): hashchar = chars[entity['indices'][0]] hashtag = entity['hashtag'] hashtag_class = options.get('hashtag_class') @@ -368,7 +370,7 @@ def _link_to_hashtag(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_cashtag(self, entity, chars, options = {}): + def _link_to_cashtag(self, entity, chars, options={}): dollar = chars[entity['indices'][0]] cashtag = entity['cashtag'] @@ -383,7 +385,7 @@ def _link_to_cashtag(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_screen_name(self, entity, chars, options = {}): + def _link_to_screen_name(self, entity, chars, options={}): name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '') chunk = options.get('link_text_transform', default_transform)(entity, name) name = name.lower() @@ -404,7 +406,7 @@ def _link_to_screen_name(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}): + def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes={}, options={}): tagged_symbol = u'<%s>%s' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol text = self._html_escape(text) tagged_text = u'<%s>%s' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text @@ -413,14 +415,14 @@ def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, else: return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options)) - def _link_to_text(self, entity, text, href, attributes = {}, options = {}): + def _link_to_text(self, entity, text, href, attributes={}, options={}): attributes['href'] = href if options.get('link_attribute_transform'): attributes = options.get('link_attribute_transform')(entity, attributes) text = options.get('link_text_transform', default_transform)(entity, text) return u'%s' % (self._tag_attrs(attributes), text) - def _tag_attrs(self, attributes = {}): + def _tag_attrs(self, attributes={}): attrs = [] for key in sorted(attributes.keys()): value = attributes[key] @@ -431,4 +433,4 @@ def _tag_attrs(self, attributes = {}): value = u' '.join(value) attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value))) - return u' '.join(attrs) \ No newline at end of file + return u' '.join(attrs) diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py index 1015b8c..0898370 100644 --- a/twitter_text/extractor.py +++ b/twitter_text/extractor.py @@ -3,12 +3,13 @@ from twitter_text.regex import REGEXEN from twitter_text.unicode import force_unicode + class Extractor(object): """ A module for including Tweet parsing in a class. This module provides function for the extraction and processing of usernames, lists, URLs and hashtags. """ - + def __init__(self, text): self.text = force_unicode(text) @@ -19,18 +20,18 @@ def _remove_overlapping_entities(self, entities): """ # sort by start index - entities.sort(key = lambda entity: entity['indices'][0]) + entities.sort(key=lambda entity: entity['indices'][0]) # remove duplicates - prev = None + prev = None for entity in [e for e in entities]: if prev and prev['indices'][1] > entity['indices'][0]: entities.remove(entity) else: - prev = entity + prev = entity return entities - def extract_entities_with_indices(self, options = {}, transform = lambda x: x): + def extract_entities_with_indices(self, options={}, transform=lambda x: x): """ Extracts all usernames, lists, hashtags and URLs in the Tweet text along with the indices for where the entity ocurred @@ -43,19 +44,21 @@ def extract_entities_with_indices(self, options = {}, transform = lambda x: x): return [] # extract all entities - entities = self.extract_urls_with_indices(options) + \ - self.extract_hashtags_with_indices({'check_url_overlap': False}) + \ - self.extract_mentions_or_lists_with_indices() + \ - self.extract_cashtags_with_indices() + entities = ( + self.extract_urls_with_indices(options) + + self.extract_hashtags_with_indices({'check_url_overlap': False}) + + self.extract_mentions_or_lists_with_indices() + + self.extract_cashtags_with_indices() + ) - entities = self._remove_overlapping_entities(entities) + entities = self._remove_overlapping_entities(entities) for entity in entities: - entity = transform(entity) + entity = transform(entity) return entities - def extract_mentioned_screen_names(self, transform = lambda x: x): + def extract_mentioned_screen_names(self, transform=lambda x: x): """ Extracts a list of all usernames mentioned in the Tweet text. If the text is None or contains no username mentions an empty list @@ -65,7 +68,7 @@ def extract_mentioned_screen_names(self, transform = lambda x: x): """ return [transform(mention['screen_name']) for mention in self.extract_mentioned_screen_names_with_indices()] - def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x): + def extract_mentioned_screen_names_with_indices(self, transform=lambda x: x): """ Extracts a list of all usernames mentioned in the Tweet text along with the indices for where the mention ocurred. If the @@ -87,7 +90,7 @@ def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x): }) return possible_screen_names - def extract_mentions_or_lists_with_indices(self, transform = lambda x: x): + def extract_mentions_or_lists_with_indices(self, transform=lambda x: x): """ Extracts a list of all usernames or lists mentioned in the Tweet text along with the indices for where the mention ocurred. If the @@ -101,7 +104,7 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x): if not REGEXEN['at_signs'].search(self.text): return [] - possible_entries = [] + possible_entries = [] for match in REGEXEN['valid_mention_or_list'].finditer(self.text): try: after = self.text[match.end()] @@ -117,8 +120,8 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x): }) return possible_entries - - def extract_reply_screen_name(self, transform = lambda x: x): + + def extract_reply_screen_name(self, transform=lambda x: x): """ Extracts the username username replied to in the Tweet text. If the text is None or is not a reply None will be returned. @@ -135,8 +138,8 @@ def extract_reply_screen_name(self, transform = lambda x: x): else: possible_screen_name = transform(possible_screen_name.group(1)) return possible_screen_name - - def extract_urls(self, transform = lambda x: x): + + def extract_urls(self, transform=lambda x: x): """ Extracts a list of all URLs included in the Tweet text. If the text is None or contains no URLs an empty list @@ -145,8 +148,8 @@ def extract_urls(self, transform = lambda x: x): If a transform is given then it will be called for each URL. """ return [transform(url['url']) for url in self.extract_urls_with_indices()] - - def extract_urls_with_indices(self, options = {'extract_url_without_protocol': True}): + + def extract_urls_with_indices(self, options={'extract_url_without_protocol': True}): """ Extracts a list of all URLs included in the Tweet text along with the indices. If the text is None or contains no @@ -192,8 +195,8 @@ def extract_urls_with_indices(self, options = {'extract_url_without_protocol': T 'indices': [start_position, end_position] }) return urls - - def extract_hashtags(self, transform = lambda x: x): + + def extract_hashtags(self, transform=lambda x: x): """ Extracts a list of all hashtags included in the Tweet text. If the text is None or contains no hashtags an empty list @@ -203,8 +206,8 @@ def extract_hashtags(self, transform = lambda x: x): If a block is given then it will be called for each hashtag. """ return [transform(hashtag['hashtag']) for hashtag in self.extract_hashtags_with_indices()] - - def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, transform = lambda x: x): + + def extract_hashtags_with_indices(self, options={'check_url_overlap': True}, transform=lambda x: x): """ Extracts a list of all hashtags included in the Tweet text. If the text is None or contains no hashtags an empty list @@ -234,7 +237,7 @@ def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, t return tags - def extract_cashtags(self, transform = lambda x: x): + def extract_cashtags(self, transform=lambda x: x): """ Extracts a list of all cashtags included in the Tweet text. If the text is None or contains no cashtags an empty list @@ -245,7 +248,7 @@ def extract_cashtags(self, transform = lambda x: x): """ return [cashtag['cashtag'] for cashtag in self.extract_cashtags_with_indices()] - def extract_cashtags_with_indices(self, transform = lambda x: x): + def extract_cashtags_with_indices(self, transform=lambda x: x): """ Extracts a list of all cashtags included in the Tweet text. If the text is None or contains no cashtags an empty list @@ -267,4 +270,4 @@ def extract_cashtags_with_indices(self, transform = lambda x: x): 'indices': [start_position, end_position] }) - return tags \ No newline at end of file + return tags diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py index ec128ca..90bbdfd 100644 --- a/twitter_text/highlighter.py +++ b/twitter_text/highlighter.py @@ -3,37 +3,41 @@ import re from HTMLParser import HTMLParser -from twitter_text.regex import UNICODE_SPACES from twitter_text.unicode import force_unicode DEFAULT_HIGHLIGHT_TAG = 'em' + # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python class MLStripper(HTMLParser): def __init__(self): self.reset() self.fed = [] + def handle_data(self, d): self.fed.append(d) + def get_data(self): return ''.join(self.fed) + def strip_tags(html): s = MLStripper() s.feed(html) return s.get_data() + class HitHighlighter(object): def __init__(self, text, **kwargs): self.text = force_unicode(text) self.parent = kwargs.get('parent', False) - def hit_highlight(self, hits = [], **kwargs): + def hit_highlight(self, hits=[], **kwargs): if not hits and not kwargs.get('query'): return self.text if not hits and kwargs.get('query'): - stripped_text = strip_tags(self.text) + stripped_text = strip_tags(self.text) for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text): hits.append(match.span()) @@ -49,7 +53,7 @@ def hit_highlight(self, hits = [], **kwargs): for index, chunk in enumerate(chunks): if not index % 2: text_chunks.append(chunk) - for hit in sorted(hits, key = lambda chunk: chunk[1], reverse = True): + for hit in sorted(hits, key=lambda chunk: chunk[1], reverse=True): hit_start, hit_end = hit placed = 0 for index, chunk in enumerate(chunks): @@ -80,4 +84,4 @@ def hit_highlight(self, hits = [], **kwargs): else: result.append(chunk) self.text = u''.join(result) - return self.text \ No newline at end of file + return self.text diff --git a/twitter_text/regex.py b/twitter_text/regex.py index c136f80..fffebde 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -4,44 +4,44 @@ # list is frozen at load time to ensure immutability. These reular expressions are # used throughout the Twitter classes. Special care has been taken to make # sure these reular expressions work with Tweets in all languages. -import re, string +import re -REGEXEN = {} # :nodoc: +REGEXEN = {} # :nodoc: -def regex_range(start, end = None): + +def regex_range(start, end=None): if end: return u'%s-%s' % (unichr(start), unichr(end)) else: return u'%s' % unichr(start) + # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand # to access both the list of characters and a pattern suitible for use with String#split # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE -UNICODE_SPACES = [] -for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [ - range(0x0009, 0x000D), # White_Space # Cc [5] .. - 0x0020, # White_Space # Zs SPACE - 0x0085, # White_Space # Cc - 0x00A0, # White_Space # Zs NO-BREAK SPACE - 0x1680, # White_Space # Zs OGHAM SPACE MARK - 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR - range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE - 0x2028, # White_Space # Zl LINE SEPARATOR - 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR - 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE - 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE - 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE - ]): - UNICODE_SPACES.append(unichr(space)) +UNICODE_SPACES = [unichr(space) for space in reduce(lambda x, y: x + y if type(y) == list else x + [y], [ + range(0x0009, 0x000D), # White_Space # Cc [5] .. + 0x0020, # White_Space # Zs SPACE + 0x0085, # White_Space # Cc + 0x00A0, # White_Space # Zs NO-BREAK SPACE + 0x1680, # White_Space # Zs OGHAM SPACE MARK + 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR + range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE + 0x2028, # White_Space # Zl LINE SEPARATOR + 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR + 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE + 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE + 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE +])] REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES)) # Characters not allowed in Tweets -INVALID_CHARACTERS = [ - 0xFFFE, 0xFEFF, # BOM - 0xFFFF, # Special - 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change +INVALID_CHARACTERS = [ + 0xFFFE, 0xFEFF, # BOM + 0xFFFF, # Special + 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change ] -REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS] +REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS] REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$') @@ -71,32 +71,32 @@ def regex_range(start, end = None): LATIN_ACCENTS = u''.join(LATIN_ACCENTS) RTL_CHARACTERS = ''.join([ - regex_range(0x0600,0x06FF), - regex_range(0x0750,0x077F), - regex_range(0x0590,0x05FF), - regex_range(0xFE70,0xFEFF) + regex_range(0x0600, 0x06FF), + regex_range(0x0750, 0x077F), + regex_range(0x0590, 0x05FF), + regex_range(0xFE70, 0xFEFF) ]) NON_LATIN_HASHTAG_CHARS = ''.join([ # Cyrillic (Russian, Ukrainian, etc.) - regex_range(0x0400, 0x04ff), # Cyrillic - regex_range(0x0500, 0x0527), # Cyrillic Supplement - regex_range(0x2de0, 0x2dff), # Cyrillic Extended A - regex_range(0xa640, 0xa69f), # Cyrillic Extended B - regex_range(0x0591, 0x05bf), # Hebrew + regex_range(0x0400, 0x04ff), # Cyrillic + regex_range(0x0500, 0x0527), # Cyrillic Supplement + regex_range(0x2de0, 0x2dff), # Cyrillic Extended A + regex_range(0xa640, 0xa69f), # Cyrillic Extended B + regex_range(0x0591, 0x05bf), # Hebrew regex_range(0x05c1, 0x05c2), regex_range(0x05c4, 0x05c5), regex_range(0x05c7), regex_range(0x05d0, 0x05ea), regex_range(0x05f0, 0x05f4), - regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms + regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms regex_range(0xfb2a, 0xfb36), regex_range(0xfb38, 0xfb3c), regex_range(0xfb3e), regex_range(0xfb40, 0xfb41), regex_range(0xfb43, 0xfb44), regex_range(0xfb46, 0xfb4f), - regex_range(0x0610, 0x061a), # Arabic + regex_range(0x0610, 0x061a), # Arabic regex_range(0x0620, 0x065f), regex_range(0x066e, 0x06d3), regex_range(0x06d5, 0x06dc), @@ -104,44 +104,51 @@ def regex_range(start, end = None): regex_range(0x06ea, 0x06ef), regex_range(0x06fa, 0x06fc), regex_range(0x06ff), - regex_range(0x0750, 0x077f), # Arabic Supplement + regex_range(0x0750, 0x077f), # Arabic Supplement regex_range(0x08a0), # Arabic Extended A regex_range(0x08a2, 0x08ac), regex_range(0x08e4, 0x08fe), - regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A + regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A regex_range(0xfbd3, 0xfd3d), regex_range(0xfd50, 0xfd8f), regex_range(0xfd92, 0xfdc7), regex_range(0xfdf0, 0xfdfb), - regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B + regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B regex_range(0xfe76, 0xfefc), - regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner - regex_range(0x0e01, 0x0e3a), # Thai - regex_range(0x0e40, 0x0e4e), # Hangul (Korean) - regex_range(0x1100, 0x11ff), # Hangul Jamo - regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo - regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A - regex_range(0xAC00, 0xD7AF), # Hangul Syllables - regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B - regex_range(0xFFA1, 0xFFDC) # Half-width Hangul + regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner + regex_range(0x0e01, 0x0e3a), # Thai + regex_range(0x0e40, 0x0e4e), # Hangul (Korean) + regex_range(0x1100, 0x11ff), # Hangul Jamo + regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo + regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A + regex_range(0xAC00, 0xD7AF), # Hangul Syllables + regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B + regex_range(0xFFA1, 0xFFDC), # Half-width Hangul ]) CJ_HASHTAG_CHARACTERS = ''.join([ - regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width) - regex_range(0xFF66, 0xFF9F), # Katakana (half-width) - regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width) - regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana - regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) - regex_range(0x4E00, 0x9FFF), # Kanji (Unified) + regex_range(0x30A1, 0x30FA), # Katakana (full-width) + regex_range(0x30FC, 0x30FE), # Katakana (full-width) + regex_range(0xFF66, 0xFF9F), # Katakana (half-width) + regex_range(0xFF10, 0xFF19), # Latin (full-width) + regex_range(0xFF21, 0xFF3A), # Latin (full-width) + regex_range(0xFF41, 0xFF5A), # Latin (full-width) + regex_range(0x3041, 0x3096), # Hiragana + regex_range(0x3099, 0x309E), # Hiragana + regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) + regex_range(0x4E00, 0x9FFF), # Kanji (Unified) ]) try: CJ_HASHTAG_CHARACTERS = ''.join([ CJ_HASHTAG_CHARACTERS, - regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B) - regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) - regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) - regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement) + regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B) + regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) + regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) + regex_range(0x2F800, 0x2FA1F), # Kanji (CJK supplement) + regex_range(0x3003), # Kanji (CJK supplement) + regex_range(0x3005), # Kanji (CJK supplement) + regex_range(0x303B), # Kanji (CJK supplement) ]) except ValueError: # this is a narrow python build so these extended Kanji characters won't work @@ -171,7 +178,7 @@ def regex_range(start, end = None): ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional) ) REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE) - # Used in Extractor for final filtering +# Used in Extractor for final filtering REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE) # URL related hash regex collection diff --git a/twitter_text/templatetags/twitterize.py b/twitter_text/templatetags/twitterize.py index 01db63d..b58779a 100644 --- a/twitter_text/templatetags/twitterize.py +++ b/twitter_text/templatetags/twitterize.py @@ -8,15 +8,16 @@ register = Library() -@register.filter(name = 'twitter_text') + +@register.filter(name='twitter_text') @stringfilter -def twitter_text(text, search_query = False): +def twitter_text(text, search_query=False): """ Parses a text string through the TwitterText auto_link method and if search_query is passed, through the hit_highlight method. """ tt = TwitterText(text) if search_query: - tt.text = tt.highlighter.hit_highlight(query = search_query) - tt.text = tt.autolink.auto_link() + tt.text = tt.highlighter.hit_highlight(query=search_query) + tt.text = tt.autolink.auto_link() return tt.text -twitter_text.is_safe = True \ No newline at end of file +twitter_text.is_safe = True diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py index 4e17267..e67238c 100644 --- a/twitter_text/unicode.py +++ b/twitter_text/unicode.py @@ -1,6 +1,8 @@ -import types, datetime +import datetime +import types from decimal import Decimal + # borrowed from django.utils.encoding class TwitterTextUnicodeDecodeError(UnicodeDecodeError): def __init__(self, obj, *args): @@ -10,7 +12,8 @@ def __init__(self, obj, *args): def __str__(self): original = UnicodeDecodeError.__str__(self) return '%s. You passed in %r (%s)' % (original, self.obj, - type(self.obj)) + type(self.obj)) + def is_protected_type(obj): """Determine if the object instance is of a protected type. @@ -25,6 +28,7 @@ def is_protected_type(obj): float, Decimal) ) + def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): """ Similar to smart_unicode, except that lazy instances are resolved to @@ -50,8 +54,8 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # without raising a further exception. We do an # approximation to what the Exception's standard str() # output should be. - s = ' '.join([force_unicode(arg, encoding, strings_only, - errors) for arg in s]) + s = ' '.join([force_unicode(arg, encoding, strings_only, errors) + for arg in s]) elif not isinstance(s, unicode): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a @@ -66,6 +70,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # working unicode method. Try to handle this without raising a # further exception by individually forcing the exception args # to unicode. - s = ' '.join([force_unicode(arg, encoding, strings_only, - errors) for arg in s]) + s = ' '.join([force_unicode(arg, encoding, strings_only, errors) + for arg in s]) return s diff --git a/twitter_text/validation.py b/twitter_text/validation.py index 6dea5f9..eabd955 100644 --- a/twitter_text/validation.py +++ b/twitter_text/validation.py @@ -9,17 +9,18 @@ MAX_LENGTH = 140 DEFAULT_TCO_URL_LENGTHS = { - 'short_url_length': 22, - 'short_url_length_https': 23, - 'characters_reserved_per_media': 22, + 'short_url_length': 22, + 'short_url_length_https': 23, + 'characters_reserved_per_media': 22, } + class Validation(object): def __init__(self, text, **kwargs): self.text = force_unicode(text) self.parent = kwargs.get('parent', False) - - def tweet_length(self, options = {}): + + def tweet_length(self, options={}): """ Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a @@ -34,10 +35,10 @@ def tweet_length(self, options = {}): The string could also contain U+00E9 already, in which case the canonicalization will not change the value. """ - assert (not self.parent or not getattr(self.parent, 'has_been_linked', False) ), 'The validator should only be run on text before it has been modified.' + assert (not self.parent or not getattr(self.parent, 'has_been_linked', False)), 'The validator should only be run on text before it has been modified.' for key in DEFAULT_TCO_URL_LENGTHS: - if not key in options: + if key not in options: options[key] = DEFAULT_TCO_URL_LENGTHS[key] length = len(self.text) @@ -52,21 +53,22 @@ def tweet_length(self, options = {}): if self.parent and hasattr(self.parent, 'tweet_length'): self.parent.tweet_length = length return length - + def tweet_invalid(self): """ Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation will allow quicker feedback. - + Returns false if this text is valid. Otherwise one of the following Symbols will be returned: - + "Too long":: if the text is too long "Empty text":: if the text is empty "Invalid characters":: if the text contains non-Unicode or any of the disallowed Unicode characters """ - valid = True # optimism + # optimism + valid = True validation_error = None if not self.tweet_length(): @@ -77,7 +79,7 @@ def tweet_invalid(self): if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text): valid, validation_error = False, 'Invalid characters' - + if self.parent and hasattr(self.parent, 'tweet_is_valid'): self.parent.tweet_is_valid = valid if self.parent and hasattr(self.parent, 'tweet_validation_error'): @@ -108,7 +110,7 @@ def valid_hashtag(self): return len(extracted) == 1 and extracted[0] == self.text[1:] - def valid_url(self, unicode_domains = True, require_protocol = True): + def valid_url(self, unicode_domains=True, require_protocol=True): if not self.text: return False @@ -121,35 +123,32 @@ def valid_url(self, unicode_domains = True, require_protocol = True): if not ( ( - not require_protocol - or ( - self._valid_match(scheme, REGEXEN['validate_url_scheme']) - and re.compile(ur'^https?$', re.IGNORECASE).match(scheme) + not require_protocol or ( + self._valid_match(scheme, REGEXEN['validate_url_scheme']) and + re.compile(ur'^https?$', re.IGNORECASE).match(scheme) ) - ) - and ( - path == '' - or self._valid_match(path, REGEXEN['validate_url_path']) - ) - and self._valid_match(query, REGEXEN['validate_url_query'], True) - and self._valid_match(fragment, REGEXEN['validate_url_fragment'], True) + ) and ( + path == '' or + self._valid_match(path, REGEXEN['validate_url_path']) + ) and + self._valid_match(query, REGEXEN['validate_url_query'], True) and + self._valid_match(fragment, REGEXEN['validate_url_fragment'], True) ): return False return bool( ( - unicode_domains - and self._valid_match(authority, REGEXEN['validate_url_unicode_authority']) - and REGEXEN['validate_url_unicode_authority'].match(authority).string == authority - ) - or ( - not unicode_domains - and self._valid_match(authority, REGEXEN['validate_url_authority']) - and REGEXEN['validate_url_authority'].match(authority).string == authority + unicode_domains and + self._valid_match(authority, REGEXEN['validate_url_unicode_authority']) and + REGEXEN['validate_url_unicode_authority'].match(authority).string == authority + ) or ( + not unicode_domains and + self._valid_match(authority, REGEXEN['validate_url_authority']) and + REGEXEN['validate_url_authority'].match(authority).string == authority ) ) - def _valid_match(self, string, re_obj, optional = False): + def _valid_match(self, string, re_obj, optional=False): if optional and string is None: return True match = re_obj.match(string) From 07585c5058462251be47133ac3ac958f054e085a Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 13:54:59 -0700 Subject: [PATCH 02/30] Updated travis badge. [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb6d3cd..1c0537d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ A port of the Ruby gem [twitter-text-rb](https://github.com/twitter/twitter-text-rb) to Python. -[![Build Status](https://travis-ci.org/dryan/twitter-text-py.png?branch=master)](https://travis-ci.org/dryan/twitter-text-py) +[![Build Status](https://travis-ci.org/muckrack/twitter-text-py.svg?branch=master)](https://travis-ci.org/muckrack/twitter-text-py) # Changes in 2.0 From ee1b68e78fe0949b5355572314af1684c9537afc Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Tue, 1 Mar 2016 16:34:17 -0800 Subject: [PATCH 03/30] Updated twitter-text-conformance submodule to latest master. --- twitter-text-conformance | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twitter-text-conformance b/twitter-text-conformance index 9b58c44..a39ec58 160000 --- a/twitter-text-conformance +++ b/twitter-text-conformance @@ -1 +1 @@ -Subproject commit 9b58c44302c4ab5bab261f6cfaf6ca89b5a6cf35 +Subproject commit a39ec5875528aaf0a874f384536fcd8e904d9fd8 From 4644183b8bcad7f062d8638b134ac8ac6b8ddc63 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Tue, 1 Mar 2016 17:43:43 -0800 Subject: [PATCH 04/30] Started transition to py.test testing This allows tests to run completely instead of halting at the first error, and removes some boilerplate. --- conftest.py | 134 ++++++++++++++++++++ requirements.txt | 2 + tests.py | 10 +- twitter_text/__init__.py | 4 +- twitter_text/autolink.py | 4 +- twitter_text/encoding.py | 239 ++++++++++++++++++++++++++++++++++++ twitter_text/extractor.py | 4 +- twitter_text/highlighter.py | 4 +- twitter_text/validation.py | 6 +- 9 files changed, 392 insertions(+), 15 deletions(-) create mode 100644 conftest.py create mode 100644 twitter_text/encoding.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..fb5d4b2 --- /dev/null +++ b/conftest.py @@ -0,0 +1,134 @@ +# encoding=utf-8 + +from __future__ import unicode_literals + +import argparse +import json +import os +import re +import sys + +import pytest + +import twitter_text +from twitter_text.encoding import force_text, smart_bytes + +try: + import yaml +except ImportError: + raise Exception('You need to install pyaml to run the tests') +# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects +from yaml import Loader, SafeLoader + + +narrow_build = True +try: + unichr(0x20000) + narrow_build = False +except: + pass + + +def construct_yaml_str(self, node): + return self.construct_scalar(node) +Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) +SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) + + +try: + from bs4 import BeautifulSoup +except ImportError: + try: + from BeautifulSoup import BeautifulSoup + except ImportError: + raise Exception('You need to install BeautifulSoup to run the tests') + + +def assert_equal_without_attribute_order(result, test, failure_message=None): + # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through + assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')) + + +def assert_equal(result, test): + assert result == test.get('expected') + + +def pytest_collect_file(parent, path): + if path.ext == '.yml': + return YamlFile(path, parent) + + +class YamlException(Exception): + """ custom exception for error reporting. """ + + +class YamlFile(pytest.File): + def collect(self): + raw = yaml.safe_load(force_text(self.fspath.open().read())) + if 'tests' not in raw: + return + filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0] + for section, specs in raw['tests'].items(): + for spec in specs: + yield YamlItem(self, filename, section, spec) + + +TEST_MAP = { + 'autolink': { + 'cls': None, + 'methods': { + }, + }, + 'extract': { + 'cls': twitter_text.extractor.Extractor, + 'methods': { + 'mentions': 'extract_mentioned_screen_names', + 'mentions_with_indices': 'extract_mentioned_screen_names_with_indices', + 'mentions_or_lists_with_indices': 'extract_mentions_or_lists_with_indices', + 'replies': 'extract_reply_screen_name', + 'urls': 'extract_urls', + 'urls_with_indices': 'extract_urls_with_indices', + 'hashtags': 'extract_hashtags', + 'cashtags': 'extract_cashtags', + 'hashtags_with_indices': 'extract_hashtags_with_indices', + 'cashtags_with_indices': 'extract_cashtags_with_indices', + }, + }, + 'hit_highlighting': { + 'cls': None, + 'methods': { + }, + } +} + + +class YamlItem(pytest.Item): + def __init__(self, parent, filename, section, spec): + self.section = section + self.filename = filename + self.spec = spec + name = "{}:{}:{}".format(filename, section, spec['description']) + super(YamlItem, self).__init__(name, parent) + + def runtest(self): + if self.filename not in TEST_MAP: + raise YamlException("{} file not supported".format(self.section)) + if self.section not in TEST_MAP[self.filename]['methods']: + raise YamlException("{}:{} section not supported".format(self.section)) + cls = TEST_MAP[self.section]['cls'] + method_name = TEST_MAP[self.section]['methods'] + instance = cls(self.spec['text']) + result = getattr(instance, method_name)() + if result != self.spec['expected']: + raise YamlException("{} != {}".format(result, self.spec['expected'])) + + def repr_failure(self, excinfo): + """ called when self.runtest() raises an exception. """ + if isinstance(excinfo.value, YamlException): + return smart_bytes("\n".join([ + "usecase execution failed", + " {}".format(*excinfo.value.args) + ])) + + def reportinfo(self): + return self.fspath, 0, smart_bytes("usecase: %s" % self.name) diff --git a/requirements.txt b/requirements.txt index 0ac3552..d001e1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ argparse==1.2.1 PyYAML==3.10 beautifulsoup4==4.2.0 +pytest==2.87 +py==1.4.29 diff --git a/tests.py b/tests.py index dd896c5..97da4c0 100644 --- a/tests.py +++ b/tests.py @@ -53,6 +53,8 @@ def error(text): return (u'\033[91m%s\033[0m\n' % text).encode('utf-8') +CURRENT_DIR = os.path.dirname(__file__) +CONFORMANCE_DIR = os.path.join(CURRENT_DIR, 'twitter-text-conformance/conformance') attempted = 0 @@ -73,7 +75,7 @@ def assert_equal(result, test): sys.stdout.flush() # extractor section -extractor_file = open(os.path.join('twitter-text-conformance', 'extract.yml'), 'r') +extractor_file = open(os.path.join(CONFORMANCE_DIR, 'extract.yml'), 'r') extractor_tests = yaml.load(force_unicode(extractor_file.read())) extractor_file.close() @@ -111,7 +113,7 @@ def assert_equal(result, test): assert_equal(extractor.extract_cashtags_with_indices(), test) # autolink section -autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r') +autolink_file = open(os.path.join(CONFORMANCE_DIR, 'autolink.yml'), 'r') autolink_tests = yaml.load(force_unicode(autolink_file.read())) autolink_file.close() @@ -144,7 +146,7 @@ def assert_equal(result, test): assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test) # hit_highlighting section -hit_highlighting_file = open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r') +hit_highlighting_file = open(os.path.join(CONFORMANCE_DIR, 'hit_highlighting.yml'), 'r') hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read())) hit_highlighting_file.close() @@ -164,7 +166,7 @@ def assert_equal(result, test): validation_tested = False validate_tests = None try: - validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r') + validate_file = open(os.path.join(CONFORMANCE_DIR, 'validate.yml'), 'r') validate_file_contents = validate_file.read() validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape'))) validate_file.close() diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py index 6f17ac2..e267dac 100644 --- a/twitter_text/__init__.py +++ b/twitter_text/__init__.py @@ -4,13 +4,13 @@ from twitter_text.extractor import Extractor from twitter_text.highlighter import HitHighlighter from twitter_text.validation import Validation -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text class TwitterText(object): def __init__(self, text): # this will get modified by some functions - self.text = force_unicode(text) + self.text = force_text(text) # this never changes; use it as a fallback or for comparison self.original_text = self.text self.has_been_linked = False diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py index 85e21aa..8e40227 100644 --- a/twitter_text/autolink.py +++ b/twitter_text/autolink.py @@ -3,7 +3,7 @@ import re from twitter_text.regex import REGEXEN -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text from twitter_text.extractor import Extractor # Default CSS class for auto-linked lists @@ -90,7 +90,7 @@ def default_transform(entity, text): class Autolink(object): def __init__(self, text, **kwargs): - self.text = force_unicode(text) + self.text = force_text(text) self.parent = kwargs.get('parent', False) self.extractor = Extractor(self.text) diff --git a/twitter_text/encoding.py b/twitter_text/encoding.py new file mode 100644 index 0000000..bde3ce7 --- /dev/null +++ b/twitter_text/encoding.py @@ -0,0 +1,239 @@ +# flake8: noqa +# Taken from django.utils.encoding +from __future__ import unicode_literals + +import codecs +import datetime +from decimal import Decimal +import locale + +from django.utils.functional import Promise +from django.utils import six +from django.utils.six.moves.urllib.parse import quote + + +class DjangoUnicodeDecodeError(UnicodeDecodeError): + def __init__(self, obj, *args): + self.obj = obj + UnicodeDecodeError.__init__(self, *args) + + def __str__(self): + original = UnicodeDecodeError.__str__(self) + return '%s. You passed in %r (%s)' % (original, self.obj, + type(self.obj)) + + +def python_2_unicode_compatible(klass): + """ + A decorator that defines __unicode__ and __str__ methods under Python 2. + Under Python 3 it does nothing. + + To support Python 2 and 3 with a single code base, define a __str__ method + returning text and apply this decorator to the class. + """ + if six.PY2: + if '__str__' not in klass.__dict__: + raise ValueError("@python_2_unicode_compatible cannot be applied " + "to %s because it doesn't define __str__()." % + klass.__name__) + klass.__unicode__ = klass.__str__ + klass.__str__ = lambda self: self.__unicode__().encode('utf-8') + return klass + + +def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Returns a text object representing 's' -- unicode on Python 2 and str on + Python 3. Treats bytestrings using the 'encoding' codec. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + if isinstance(s, Promise): + # The input is the result of a gettext_lazy() call. + return s + return force_text(s, encoding, strings_only, errors) + + +def is_protected_type(obj): + """Determine if the object instance is of a protected type. + + Objects of protected types are preserved as-is when passed to + force_text(strings_only=True). + """ + return isinstance(obj, six.integer_types + (type(None), float, Decimal, + datetime.datetime, datetime.date, datetime.time)) + + +def force_text(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_text, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + # Handle the common case first for performance reasons. + if isinstance(s, six.text_type): + return s + if strings_only and is_protected_type(s): + return s + try: + if not isinstance(s, six.string_types): + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + elif hasattr(s, '__unicode__'): + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + # Note: We use .decode() here, instead of six.text_type(s, encoding, + # errors), so that if s is a SafeBytes, it ends up being a + # SafeText at the end. + s = s.decode(encoding, errors) + except UnicodeDecodeError as e: + if not isinstance(s, Exception): + raise DjangoUnicodeDecodeError(s, *e.args) + else: + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII bytestring data without a + # working unicode method. Try to handle this without raising a + # further exception by individually forcing the exception args + # to unicode. + s = ' '.join([force_text(arg, encoding, strings_only, + errors) for arg in s]) + return s + + +def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Returns a bytestring version of 's', encoded as specified in 'encoding'. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + if isinstance(s, Promise): + # The input is the result of a gettext_lazy() call. + return s + return force_bytes(s, encoding, strings_only, errors) + + +def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_bytes, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + # Handle the common case first for performance reasons. + if isinstance(s, bytes): + if encoding == 'utf-8': + return s + else: + return s.decode('utf-8', errors).encode(encoding, errors) + if strings_only and is_protected_type(s): + return s + if isinstance(s, six.memoryview): + return bytes(s) + if isinstance(s, Promise): + return six.text_type(s).encode(encoding, errors) + if not isinstance(s, six.string_types): + try: + if six.PY3: + return six.text_type(s).encode(encoding) + else: + return bytes(s) + except UnicodeEncodeError: + if isinstance(s, Exception): + # An Exception subclass containing non-ASCII data that doesn't + # know how to print itself properly. We shouldn't raise a + # further exception. + return b' '.join([force_bytes(arg, encoding, strings_only, + errors) for arg in s]) + return six.text_type(s).encode(encoding, errors) + else: + return s.encode(encoding, errors) + +if six.PY3: + smart_str = smart_text + force_str = force_text +else: + smart_str = smart_bytes + force_str = force_bytes + # backwards compatibility for Python 2 + smart_unicode = smart_text + force_unicode = force_text + +smart_str.__doc__ = """ +Apply smart_text in Python 3 and smart_bytes in Python 2. + +This is suitable for writing to sys.stdout (for instance). +""" + +force_str.__doc__ = """ +Apply force_text in Python 3 and force_bytes in Python 2. +""" + + +def iri_to_uri(iri): + """ + Convert an Internationalized Resource Identifier (IRI) portion to a URI + portion that is suitable for inclusion in a URL. + + This is the algorithm from section 3.1 of RFC 3987. However, since we are + assuming input is either UTF-8 or unicode already, we can simplify things a + little from the full method. + + Returns an ASCII string containing the encoded result. + """ + # The list of safe characters here is constructed from the "reserved" and + # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: + # reserved = gen-delims / sub-delims + # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + # / "*" / "+" / "," / ";" / "=" + # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + # Of the unreserved characters, urllib.quote already considers all but + # the ~ safe. + # The % character is also added to the list of safe characters here, as the + # end of section 3.1 of RFC 3987 specifically mentions that % must not be + # converted. + if iri is None: + return iri + return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") + + +def filepath_to_uri(path): + """Convert a file system path to a URI portion that is suitable for + inclusion in a URL. + + We are assuming input is either UTF-8 or unicode already. + + This method will encode certain chars that would normally be recognized as + special chars for URIs. Note that this method does not encode the ' + character, as it is a valid character within URIs. See + encodeURIComponent() JavaScript function for more details. + + Returns an ASCII string containing the encoded result. + """ + if path is None: + return path + # I know about `os.sep` and `os.altsep` but I want to leave + # some flexibility for hardcoding separators. + return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'") + + +def get_system_encoding(): + """ + The encoding of the default system locale but falls back to the given + fallback encoding if the encoding is unsupported by python or could + not be determined. See tickets #10335 and #5846 + """ + try: + encoding = locale.getdefaultlocale()[1] or 'ascii' + codecs.lookup(encoding) + except Exception: + encoding = 'ascii' + return encoding + +DEFAULT_LOCALE_ENCODING = get_system_encoding() diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py index 0898370..87245b8 100644 --- a/twitter_text/extractor.py +++ b/twitter_text/extractor.py @@ -1,7 +1,7 @@ # encoding=utf-8 from twitter_text.regex import REGEXEN -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text class Extractor(object): @@ -11,7 +11,7 @@ class Extractor(object): """ def __init__(self, text): - self.text = force_unicode(text) + self.text = force_text(text) def _remove_overlapping_entities(self, entities): """ diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py index 90bbdfd..3311c29 100644 --- a/twitter_text/highlighter.py +++ b/twitter_text/highlighter.py @@ -3,7 +3,7 @@ import re from HTMLParser import HTMLParser -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text DEFAULT_HIGHLIGHT_TAG = 'em' @@ -29,7 +29,7 @@ def strip_tags(html): class HitHighlighter(object): def __init__(self, text, **kwargs): - self.text = force_unicode(text) + self.text = force_text(text) self.parent = kwargs.get('parent', False) def hit_highlight(self, hits=[], **kwargs): diff --git a/twitter_text/validation.py b/twitter_text/validation.py index eabd955..25b4d3d 100644 --- a/twitter_text/validation.py +++ b/twitter_text/validation.py @@ -2,7 +2,7 @@ import re -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text from twitter_text.extractor import Extractor from twitter_text.regex import REGEXEN @@ -17,7 +17,7 @@ class Validation(object): def __init__(self, text, **kwargs): - self.text = force_unicode(text) + self.text = force_text(text) self.parent = kwargs.get('parent', False) def tweet_length(self, options={}): @@ -42,7 +42,7 @@ def tweet_length(self, options={}): options[key] = DEFAULT_TCO_URL_LENGTHS[key] length = len(self.text) - # thanks force_unicode for making this so much simpler than the ruby version + # thanks force_text for making this so much simpler than the ruby version for url in Extractor(self.text).extract_urls_with_indices(): # remove the link of the original URL From 433c48df30631ed252529a7d4cbe279048b86a38 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Tue, 1 Mar 2016 17:46:54 -0800 Subject: [PATCH 05/30] Removed old tests.py --- tests.py | 198 ------------------------------------------------------- 1 file changed, 198 deletions(-) delete mode 100644 tests.py diff --git a/tests.py b/tests.py deleted file mode 100644 index 97da4c0..0000000 --- a/tests.py +++ /dev/null @@ -1,198 +0,0 @@ -# encoding=utf-8 - -import argparse -import json -import os -import re -import sys -import twitter_text - -from twitter_text.unicode import force_unicode - -try: - import yaml -except ImportError: - raise Exception('You need to install pyaml to run the tests') -# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects -from yaml import Loader, SafeLoader - - -narrow_build = True -try: - unichr(0x20000) - narrow_build = False -except: - pass - - -parser = argparse.ArgumentParser(description=u'Run the integration tests for twitter_text') -parser.add_argument('--ignore-narrow-errors', '-i', help=u'Ignore errors caused by narrow builds', default=False, action='store_true') -args = parser.parse_args() - - -def construct_yaml_str(self, node): - return self.construct_scalar(node) -Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) -SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) - - -try: - from bs4 import BeautifulSoup -except ImportError: - try: - from BeautifulSoup import BeautifulSoup - except ImportError: - raise Exception('You need to install BeautifulSoup to run the tests') - - -def success(text): - return (u'\033[92m%s\033[0m\n' % text).encode('utf-8') - - -def error(text): - return (u'\033[91m%s\033[0m\n' % text).encode('utf-8') - - -CURRENT_DIR = os.path.dirname(__file__) -CONFORMANCE_DIR = os.path.join(CURRENT_DIR, 'twitter-text-conformance/conformance') -attempted = 0 - - -def assert_equal_without_attribute_order(result, test, failure_message=None): - global attempted - attempted += 1 - # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through - assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description'))) - sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description')))) - sys.stdout.flush() - - -def assert_equal(result, test): - global attempted - attempted += 1 - assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else '')) - sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description')))) - sys.stdout.flush() - -# extractor section -extractor_file = open(os.path.join(CONFORMANCE_DIR, 'extract.yml'), 'r') -extractor_tests = yaml.load(force_unicode(extractor_file.read())) -extractor_file.close() - -sys.stdout.write('Testing Extractor\n') -sys.stdout.flush() - -for section in extractor_tests.get('tests'): - sys.stdout.write('\nTesting Extractor: %s\n' % section) - sys.stdout.flush() - for test in extractor_tests.get('tests').get(section): - if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Hashtag with ideographic iteration mark']: - sys.stdout.write('Skipping: %s\n' % test.get('description')) - sys.stdout.flush() - continue - extractor = twitter_text.extractor.Extractor(test.get('text')) - if section == 'mentions': - assert_equal(extractor.extract_mentioned_screen_names(), test) - elif section == 'mentions_with_indices': - assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test) - elif section == 'mentions_or_lists_with_indices': - assert_equal(extractor.extract_mentions_or_lists_with_indices(), test) - elif section == 'replies': - assert_equal(extractor.extract_reply_screen_name(), test) - elif section == 'urls': - assert_equal(extractor.extract_urls(), test) - elif section == 'urls_with_indices': - assert_equal(extractor.extract_urls_with_indices(), test) - elif section == 'hashtags': - assert_equal(extractor.extract_hashtags(), test) - elif section == 'cashtags': - assert_equal(extractor.extract_cashtags(), test) - elif section == 'hashtags_with_indices': - assert_equal(extractor.extract_hashtags_with_indices(), test) - elif section == 'cashtags_with_indices': - assert_equal(extractor.extract_cashtags_with_indices(), test) - -# autolink section -autolink_file = open(os.path.join(CONFORMANCE_DIR, 'autolink.yml'), 'r') -autolink_tests = yaml.load(force_unicode(autolink_file.read())) -autolink_file.close() - -sys.stdout.write('\nTesting Autolink\n') -sys.stdout.flush() - -autolink_options = {'suppress_no_follow': True} - -for section in autolink_tests.get('tests'): - sys.stdout.write('\nTesting Autolink: %s\n' % section) - for test in autolink_tests.get('tests').get(section): - if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Autolink a hashtag containing ideographic iteration mark']: - sys.stdout.write('Skipping: %s\n' % test.get('description')) - sys.stdout.flush() - continue - autolink = twitter_text.autolink.Autolink(test.get('text')) - if section == 'usernames': - assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test) - elif section == 'cashtags': - assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test) - elif section == 'urls': - assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test) - elif section == 'hashtags': - assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test) - elif section == 'all': - assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test) - elif section == 'lists': - assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test) - elif section == 'json': - assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test) - -# hit_highlighting section -hit_highlighting_file = open(os.path.join(CONFORMANCE_DIR, 'hit_highlighting.yml'), 'r') -hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read())) -hit_highlighting_file.close() - -sys.stdout.write('\nTesting Hit Highlighting\n') -sys.stdout.flush() - -for section in hit_highlighting_tests.get('tests'): - sys.stdout.write('\nTesting Hit Highlighting: %s\n' % section) - for test in hit_highlighting_tests.get('tests').get(section): - hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text')) - if section == 'plain_text': - assert_equal(hit_highlighter.hit_highlight(hits=test.get('hits')), test) - elif section == 'with_links': - assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits=test.get('hits')), test) - -# validation section -validation_tested = False -validate_tests = None -try: - validate_file = open(os.path.join(CONFORMANCE_DIR, 'validate.yml'), 'r') - validate_file_contents = validate_file.read() - validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape'))) - validate_file.close() -except ValueError: - sys.stdout.write('\nValidation tests were skipped because of wide character issues\n') - sys.stdout.flush() - -if validate_tests: - sys.stdout.write('\nTesting Validation\n') - sys.stdout.flush() - - for section in validate_tests.get('tests'): - sys.stdout.write('\nTesting Validation: %s\n' % section) - for test in validate_tests.get('tests').get(section): - validator = twitter_text.validation.Validation(test.get('text')) - if section == 'tweets': - assert_equal(not validator.tweet_invalid(), test) - elif section == 'usernames': - assert_equal(validator.valid_username(), test) - elif section == 'lists': - assert_equal(validator.valid_list(), test) - elif section == 'hashtags': - assert_equal(validator.valid_hashtag(), test) - elif section == 'urls': - assert_equal(validator.valid_url(), test) - -sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted) -sys.stdout.flush() -sys.exit(os.EX_OK) From 40cbe7ad2ddbf2bd97702d66ba42e38eb45b613a Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Wed, 2 Mar 2016 13:43:58 -0800 Subject: [PATCH 06/30] Further work on getting tests to pass. --- conftest.py | 86 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/conftest.py b/conftest.py index fb5d4b2..7f22751 100644 --- a/conftest.py +++ b/conftest.py @@ -18,7 +18,7 @@ except ImportError: raise Exception('You need to install pyaml to run the tests') # from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects -from yaml import Loader, SafeLoader +#from yaml import Loader, SafeLoader narrow_build = True @@ -29,10 +29,10 @@ pass -def construct_yaml_str(self, node): - return self.construct_scalar(node) -Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) -SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) +#def construct_yaml_str(self, node): +# return self.construct_scalar(node) +#Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) +#SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) try: @@ -44,15 +44,6 @@ def construct_yaml_str(self, node): raise Exception('You need to install BeautifulSoup to run the tests') -def assert_equal_without_attribute_order(result, test, failure_message=None): - # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through - assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')) - - -def assert_equal(result, test): - assert result == test.get('expected') - - def pytest_collect_file(parent, path): if path.ext == '.yml': return YamlFile(path, parent) @@ -64,10 +55,16 @@ class YamlException(Exception): class YamlFile(pytest.File): def collect(self): + filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0] + if filename not in TEST_MAP: + print "Skipping {}; not supported".format(filename) + return + if TEST_MAP[filename].get('requires_wide_build') and narrow_build: + print "Skipping {} due to narrow build".format(filename) + return raw = yaml.safe_load(force_text(self.fspath.open().read())) if 'tests' not in raw: return - filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0] for section, specs in raw['tests'].items(): for spec in specs: yield YamlItem(self, filename, section, spec) @@ -75,9 +72,26 @@ def collect(self): TEST_MAP = { 'autolink': { - 'cls': None, + 'cls': twitter_text.autolink.Autolink, + 'options': {'suppress_no_follow': True}, 'methods': { + 'usernames': 'auto_link_usernames_or_lists', + 'cashtags': 'auto_link_cashtags', + 'urls': 'auto_link_urls', + 'hashtags': 'auto_link_hashtags', + 'all': 'auto_link', + 'lists': 'auto_link_usernames_or_lists', + 'json': 'auto_link_with_json', }, + 'ignore_attribute_order': set([ + 'usernames', + 'cashtags', + 'urls', + 'hashtags', + 'all', + 'lists', + 'json', + ]) }, 'extract': { 'cls': twitter_text.extractor.Extractor, @@ -95,8 +109,24 @@ def collect(self): }, }, 'hit_highlighting': { - 'cls': None, + 'cls': twitter_text.highlighter.HitHighlighter, + 'methods': { + 'plain_text': 'hit_highlight', + 'with_links': 'hit_highlight', + }, + 'ignore_attribute_order': set([ + 'with_links', + ]) + }, + 'validate': { + 'cls': twitter_text.validation.Validation, + 'requires_wide_build': True, 'methods': { + 'tweets': 'valid_tweet_text', + 'usernames': 'valid_username', + 'lists': 'valid_list', + 'hashtags': 'valid_hashtag', + 'urls': 'valid_url', }, } } @@ -110,16 +140,32 @@ def __init__(self, parent, filename, section, spec): name = "{}:{}:{}".format(filename, section, spec['description']) super(YamlItem, self).__init__(name, parent) + def _equal_without_attribute_order(result, expected): + # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through + return BeautifulSoup(result) == BeautifulSoup(expected) + def runtest(self): if self.filename not in TEST_MAP: raise YamlException("{} file not supported".format(self.section)) if self.section not in TEST_MAP[self.filename]['methods']: raise YamlException("{}:{} section not supported".format(self.section)) cls = TEST_MAP[self.section]['cls'] - method_name = TEST_MAP[self.section]['methods'] + method_name = TEST_MAP[self.filename]['methods'][self.section] instance = cls(self.spec['text']) - result = getattr(instance, method_name)() - if result != self.spec['expected']: + args = [] + kwargs = {} + if 'json' in self.spec: + args.append(json.loads(self.spec['json'])) + if 'options' in TEST_MAP[self.filename]: + args.append(TEST_MAP[self.filename]) + if 'hits' in self.spec: + kwargs['hits'] = self.spec['hits'] + result = getattr(instance, method_name)(*args, **kwargs) + if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()): + equal = self._equal_without_attribute_order(result, self.spec['expected']) + else: + equal = result == self.spect['expected'] + if not equal: raise YamlException("{} != {}".format(result, self.spec['expected'])) def repr_failure(self, excinfo): From 17f3c13cf78a930d6be19b5c5425b49714276237 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:02:23 -0700 Subject: [PATCH 07/30] Removed twitter-text-conformance submodule. --- twitter-text-conformance | 1 - 1 file changed, 1 deletion(-) delete mode 160000 twitter-text-conformance diff --git a/twitter-text-conformance b/twitter-text-conformance deleted file mode 160000 index a39ec58..0000000 --- a/twitter-text-conformance +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a39ec5875528aaf0a874f384536fcd8e904d9fd8 From 15d03c7d16b7e095211f31a28d03f72070712334 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:03:19 -0700 Subject: [PATCH 08/30] Added twitter-text mono repo submodule. --- .gitmodules | 6 +++--- twitter-text | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) create mode 160000 twitter-text diff --git a/.gitmodules b/.gitmodules index f051160..e5d7799 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "twitter-text-conformance"] - path = twitter-text-conformance - url = https://github.com/dryan/twitter-text-conformance.git +[submodule "twitter-text"] + path = twitter-text + url = https://github.com/twitter/twitter-text.git diff --git a/twitter-text b/twitter-text new file mode 160000 index 0000000..fb07f2e --- /dev/null +++ b/twitter-text @@ -0,0 +1 @@ +Subproject commit fb07f2e30c1d3d053cf2bb2ad6971a3bcfc9b568 From 9f2fbd40e62efac7098d4abdbd553c57f14c37b5 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:20:23 -0700 Subject: [PATCH 09/30] Corrected a couple errors re: test running. --- conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 7f22751..e40dcb6 100644 --- a/conftest.py +++ b/conftest.py @@ -140,7 +140,7 @@ def __init__(self, parent, filename, section, spec): name = "{}:{}:{}".format(filename, section, spec['description']) super(YamlItem, self).__init__(name, parent) - def _equal_without_attribute_order(result, expected): + def _equal_without_attribute_order(self, result, expected): # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through return BeautifulSoup(result) == BeautifulSoup(expected) @@ -149,7 +149,7 @@ def runtest(self): raise YamlException("{} file not supported".format(self.section)) if self.section not in TEST_MAP[self.filename]['methods']: raise YamlException("{}:{} section not supported".format(self.section)) - cls = TEST_MAP[self.section]['cls'] + cls = TEST_MAP[self.filename]['cls'] method_name = TEST_MAP[self.filename]['methods'][self.section] instance = cls(self.spec['text']) args = [] @@ -157,7 +157,7 @@ def runtest(self): if 'json' in self.spec: args.append(json.loads(self.spec['json'])) if 'options' in TEST_MAP[self.filename]: - args.append(TEST_MAP[self.filename]) + kwargs['options'] = TEST_MAP[self.filename]['options'] if 'hits' in self.spec: kwargs['hits'] = self.spec['hits'] result = getattr(instance, method_name)(*args, **kwargs) From a18a8a348f536ce1c7d43a509b848caa94901fa2 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:23:29 -0700 Subject: [PATCH 10/30] Tweaked travis to run py.test --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index b954140..3c79536 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,8 @@ python: - "2.7" install: - - "pip install . --use-mirrors" - - "pip install -r requirements.txt --use-mirrors" -script: "python ./tests.py" + - "pip install ." + - "pip install -r requirements.txt" +script: "py.test" notifications: email: false From 8ceafcfea8d5541a5d002637cf63cba5fa7e3e74 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:23:59 -0700 Subject: [PATCH 11/30] Added /.cache to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 17b1f10..20a1408 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc build* *.egg* -dist \ No newline at end of file +dist +/.cache From e567d392207f8abc7946a337b6b2349985d82758 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:28:44 -0700 Subject: [PATCH 12/30] Corrected case preservation in autolinked screen_names --- twitter_text/autolink.py | 1 - 1 file changed, 1 deletion(-) diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py index 8e40227..7b69f83 100644 --- a/twitter_text/autolink.py +++ b/twitter_text/autolink.py @@ -388,7 +388,6 @@ def _link_to_cashtag(self, entity, chars, options={}): def _link_to_screen_name(self, entity, chars, options={}): name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '') chunk = options.get('link_text_transform', default_transform)(entity, name) - name = name.lower() at = chars[entity['indices'][0]] From 6581a0a474415612603fdc284f4dd9ede8314b59 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:35:12 -0700 Subject: [PATCH 13/30] Required beautifulsoup4 for better unicode handling. --- conftest.py | 7 ++----- requirements.txt | 3 ++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index e40dcb6..0add157 100644 --- a/conftest.py +++ b/conftest.py @@ -38,10 +38,7 @@ try: from bs4 import BeautifulSoup except ImportError: - try: - from BeautifulSoup import BeautifulSoup - except ImportError: - raise Exception('You need to install BeautifulSoup to run the tests') + raise Exception('You need to install BeautifulSoup4 to run the tests') def pytest_collect_file(parent, path): @@ -142,7 +139,7 @@ def __init__(self, parent, filename, section, spec): def _equal_without_attribute_order(self, result, expected): # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through - return BeautifulSoup(result) == BeautifulSoup(expected) + return BeautifulSoup(result, "lxml") == BeautifulSoup(expected, "lxml") def runtest(self): if self.filename not in TEST_MAP: diff --git a/requirements.txt b/requirements.txt index d001e1b..1c8d134 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ argparse==1.2.1 PyYAML==3.10 -beautifulsoup4==4.2.0 +beautifulsoup4==4.4.1 +lxml==3.4.4 pytest==2.87 py==1.4.29 From 7dc0301589a438290e2656d2acf3d36b4ed5f44a Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:45:57 -0700 Subject: [PATCH 14/30] Corrected nested balanced paren handling. --- twitter_text/regex.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index fffebde..049bc04 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -208,7 +208,14 @@ def regex_range(start, end=None): # Allow URL paths to contain balanced parens # 1. Used in Wikipedia URLs like /Primer_(film) # 2. Used in IIS sessions like /S(dfd346)/ -REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE) +# Allow one nested level of balanced parentheses +REGEXEN['valid_url_balanced_parens'] = re.compile( + ur'\((?:%s+|(?:%s*\(%s+\)%s*))\)' % ( + REGEXEN['valid_general_url_path_chars'].pattern, + REGEXEN['valid_general_url_path_chars'].pattern, + REGEXEN['valid_general_url_path_chars'].pattern, + REGEXEN['valid_general_url_path_chars'].pattern, + ), re.IGNORECASE | re.UNICODE) # Valid end-of-path chracters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE) From e24ebdc9b0fdc3c108d2d30dea092b6e5c9c6a10 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 14:49:21 -0700 Subject: [PATCH 15/30] Bumped pytest version. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1c8d134..f3f1ba6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,5 @@ argparse==1.2.1 PyYAML==3.10 beautifulsoup4==4.4.1 lxml==3.4.4 -pytest==2.87 +pytest==2.9.1 py==1.4.29 From 91423dbc9216534d4bbbb86e3542e4c3ee5e4fd1 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 15:11:58 -0700 Subject: [PATCH 16/30] Improved non-latin regex support. --- requirements.txt | 1 + twitter_text/regex.py | 37 +++++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index f3f1ba6..334171d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ beautifulsoup4==4.4.1 lxml==3.4.4 pytest==2.9.1 py==1.4.29 +regex==2016.04.25 diff --git a/twitter_text/regex.py b/twitter_text/regex.py index 049bc04..0eef03e 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -3,8 +3,9 @@ # A collection of regular expressions for parsing Tweet text. The regular expression # list is frozen at load time to ensure immutability. These reular expressions are # used throughout the Twitter classes. Special care has been taken to make -# sure these reular expressions work with Tweets in all languages. -import re +# sure these regular expressions work with Tweets in all languages. +from __future__ import absolute_import +import regex as re REGEXEN = {} # :nodoc: @@ -79,10 +80,7 @@ def regex_range(start, end=None): NON_LATIN_HASHTAG_CHARS = ''.join([ # Cyrillic (Russian, Ukrainian, etc.) - regex_range(0x0400, 0x04ff), # Cyrillic - regex_range(0x0500, 0x0527), # Cyrillic Supplement - regex_range(0x2de0, 0x2dff), # Cyrillic Extended A - regex_range(0xa640, 0xa69f), # Cyrillic Extended B + '\p{Cyrillic}', # Cyrillic regex_range(0x0591, 0x05bf), # Hebrew regex_range(0x05c1, 0x05c2), regex_range(0x05c4, 0x05c5), @@ -204,7 +202,11 @@ def regex_range(start, end=None): REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+') -REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE) +REGEXEN['valid_general_url_path_chars'] = re.compile( + ur"[a-z%s0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % ( + NON_LATIN_HASHTAG_CHARS, + LATIN_ACCENTS, + ), re.IGNORECASE | re.UNICODE) # Allow URL paths to contain balanced parens # 1. Used in Wikipedia URLs like /Primer_(film) # 2. Used in IIS sessions like /S(dfd346)/ @@ -218,8 +220,20 @@ def regex_range(start, end=None): ), re.IGNORECASE | re.UNICODE) # Valid end-of-path chracters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts -REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE) -REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_url_path_ending_chars'] = re.compile( + ur'[a-z%s0-9=_#\/\+\-%s]|(?:%s)' % ( + NON_LATIN_HASHTAG_CHARS, + LATIN_ACCENTS, + REGEXEN['valid_url_balanced_parens'].pattern + ), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_url_path'] = re.compile( + ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % ( + REGEXEN['valid_general_url_path_chars'].pattern, + REGEXEN['valid_url_balanced_parens'].pattern, + REGEXEN['valid_general_url_path_chars'].pattern, + REGEXEN['valid_url_path_ending_chars'].pattern, + REGEXEN['valid_general_url_path_chars'].pattern + ), re.IGNORECASE | re.UNICODE) REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE) REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE) @@ -245,7 +259,10 @@ def regex_range(start, end=None): REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|$|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE) # These URL validation pattern strings are based on the ABNF from RFC 3986 -REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE) +REGEXEN['validate_url_unreserved'] = re.compile( + ur'[a-z%s0-9\-._~]' % ( + NON_LATIN_HASHTAG_CHARS, + ), re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE) From f012873a26fee9875e49b825bd44371746b38445 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 15:18:54 -0700 Subject: [PATCH 17/30] Improved unicode hashcode support. Autolink 100% passing. --- twitter_text/regex.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index 0eef03e..60c1334 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -157,11 +157,16 @@ def regex_range(start, end=None): CTRL_CHARS = ur"\x00-\x1F\x7F" # A hashtag must contain latin characters, numbers and underscores, but not all numbers. -HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) +HASHTAG_ALPHA = ur'[\p{L}\p{M}]' HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) -HASHTAG = re.compile(ur'(%s)(#|#)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE) +HASHTAG = re.compile(ur'(%s)(#|#)(?!\ufe0f|\u20e3)(%s*%s%s*)' % ( + HASHTAG_BOUNDARY, + HASHTAG_ALPHANUMERIC, + HASHTAG_ALPHA, + HASHTAG_ALPHANUMERIC, +), re.IGNORECASE) REGEXEN['valid_hashtag'] = HASHTAG REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[##]|:\/\/)', re.IGNORECASE | re.UNICODE) From 9a9bfe813cb9c776b6d05958aa32f2a65f81d161 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 15:26:06 -0700 Subject: [PATCH 18/30] Corrected typo affecting tests that don't ignore attribute order. --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 0add157..4a2da8e 100644 --- a/conftest.py +++ b/conftest.py @@ -161,7 +161,7 @@ def runtest(self): if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()): equal = self._equal_without_attribute_order(result, self.spec['expected']) else: - equal = result == self.spect['expected'] + equal = result == self.spec['expected'] if not equal: raise YamlException("{} != {}".format(result, self.spec['expected'])) From 7ad41afd8971ad60add77fdb7d8f1244ef893f95 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 15:31:16 -0700 Subject: [PATCH 19/30] Limited regex to Cyrillic in same places where other conformant libs do. --- twitter_text/regex.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index 60c1334..f839bfb 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -208,8 +208,7 @@ def regex_range(start, end=None): REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+') REGEXEN['valid_general_url_path_chars'] = re.compile( - ur"[a-z%s0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % ( - NON_LATIN_HASHTAG_CHARS, + ur"[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % ( LATIN_ACCENTS, ), re.IGNORECASE | re.UNICODE) # Allow URL paths to contain balanced parens @@ -226,8 +225,7 @@ def regex_range(start, end=None): # Valid end-of-path chracters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts REGEXEN['valid_url_path_ending_chars'] = re.compile( - ur'[a-z%s0-9=_#\/\+\-%s]|(?:%s)' % ( - NON_LATIN_HASHTAG_CHARS, + ur'[a-z\p{Cyrillic}0-9=_#\/\+\-%s]|(?:%s)' % ( LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern ), re.IGNORECASE | re.UNICODE) @@ -264,10 +262,7 @@ def regex_range(start, end=None): REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|$|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE) # These URL validation pattern strings are based on the ABNF from RFC 3986 -REGEXEN['validate_url_unreserved'] = re.compile( - ur'[a-z%s0-9\-._~]' % ( - NON_LATIN_HASHTAG_CHARS, - ), re.IGNORECASE | re.UNICODE) +REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z\p{Cyrillic}0-9\-._~]', re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE) REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE) From e6e8e6a3e56e0f6a7cb621d4f5dc0da100c1cad5 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 15:59:49 -0700 Subject: [PATCH 20/30] Improved & updated TLD detection Started pulling list of tlds directly from conformance core. --- conftest.py | 8 +------- twitter_text/extractor.py | 8 +++++--- twitter_text/regex.py | 34 +++++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/conftest.py b/conftest.py index 4a2da8e..93c5a6c 100644 --- a/conftest.py +++ b/conftest.py @@ -2,21 +2,15 @@ from __future__ import unicode_literals -import argparse import json import os -import re -import sys import pytest +import yaml import twitter_text from twitter_text.encoding import force_text, smart_bytes -try: - import yaml -except ImportError: - raise Exception('You need to install pyaml to run the tests') # from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects #from yaml import Loader, SafeLoader diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py index 87245b8..1f51fce 100644 --- a/twitter_text/extractor.py +++ b/twitter_text/extractor.py @@ -173,10 +173,12 @@ def extract_urls_with_indices(self, options={'extract_url_without_protocol': Tru ascii_domain = ascii_domain.group() last_url = { 'url': ascii_domain, - 'indices': [start_position - len(before or '') + complete.find(ascii_domain), start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)] + 'indices': [start_position - len(before or '') + complete.find(ascii_domain), + start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)] } - last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None - if not last_url_invalid_match: + if (path or + REGEXEN['valid_special_short_domain'].search(ascii_domain) or + not REGEXEN['invalid_short_domain'].search(ascii_domain)): urls.append(last_url) # no ASCII-only domain found. Skip the entire URL if not last_url: diff --git a/twitter_text/regex.py b/twitter_text/regex.py index f839bfb..12ca9b8 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -5,7 +5,12 @@ # used throughout the Twitter classes. Special care has been taken to make # sure these regular expressions work with Tweets in all languages. from __future__ import absolute_import +import os + import regex as re +import yaml + +from twitter_text.encoding import force_text REGEXEN = {} # :nodoc: @@ -16,6 +21,15 @@ def regex_range(start, end=None): else: return u'%s' % unichr(start) +TLDS = yaml.safe_load(force_text( + open(os.path.join( + os.path.dirname(os.path.dirname(__file__)), + 'twitter-text', + 'conformance', + 'tld_lib.yml' + )).read() +)) + # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand # to access both the list of characters and a pattern suitible for use with String#split @@ -189,11 +203,24 @@ def regex_range(start, end=None): REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$') DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES)) REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE) -REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE) -REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公>益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE) -REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中加坡|湾|台灣|新香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE) +REGEXEN['valid_domain_name'] = re.compile( + ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % ( + DOMAIN_VALID_CHARS, + DOMAIN_VALID_CHARS, + DOMAIN_VALID_CHARS + ), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_gTLD'] = re.compile( + ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % ( + '|'.join(TLDS['generic']), + ), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_ccTLD'] = re.compile( + ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % ( + '|'.join(TLDS['country']), + ), re.IGNORECASE | re.UNICODE) REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE) +REGEXEN['valid_special_cctld'] = re.compile(ur'(?:(?:co|tv)(?=[^0-9a-z@]|$))') + REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE) # This is used in Extractor @@ -204,6 +231,7 @@ def regex_range(start, end=None): # This is used in Extractor to filter out unwanted URLs. REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_special_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_special_cctld'].pattern)) REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+') From 263b5cf8832ebe19bfd0379963a2d8a814d61513 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:20:07 -0700 Subject: [PATCH 21/30] Corrected list of CJ hashtag chars that work in narrow build --- twitter_text/regex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index 12ca9b8..d3a7d7b 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -149,6 +149,9 @@ def regex_range(start, end=None): regex_range(0x3099, 0x309E), # Hiragana regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) regex_range(0x4E00, 0x9FFF), # Kanji (Unified) + regex_range(0x3003), # Kanji (CJK supplement) + regex_range(0x3005), # Kanji (CJK supplement) + regex_range(0x303B), # Kanji (CJK supplement) ]) try: @@ -158,9 +161,6 @@ def regex_range(start, end=None): regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) regex_range(0x2F800, 0x2FA1F), # Kanji (CJK supplement) - regex_range(0x3003), # Kanji (CJK supplement) - regex_range(0x3005), # Kanji (CJK supplement) - regex_range(0x303B), # Kanji (CJK supplement) ]) except ValueError: # this is a narrow python build so these extended Kanji characters won't work From 35467a1d2b800b79a8a71555a60b8e24a06f5577 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:27:54 -0700 Subject: [PATCH 22/30] Matched hashtag regex exactly to ruby version --- twitter_text/regex.py | 78 ++----------------------------------------- 1 file changed, 2 insertions(+), 76 deletions(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index d3a7d7b..8bbdd38 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -92,88 +92,14 @@ def regex_range(start, end=None): regex_range(0xFE70, 0xFEFF) ]) -NON_LATIN_HASHTAG_CHARS = ''.join([ - # Cyrillic (Russian, Ukrainian, etc.) - '\p{Cyrillic}', # Cyrillic - regex_range(0x0591, 0x05bf), # Hebrew - regex_range(0x05c1, 0x05c2), - regex_range(0x05c4, 0x05c5), - regex_range(0x05c7), - regex_range(0x05d0, 0x05ea), - regex_range(0x05f0, 0x05f4), - regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms - regex_range(0xfb2a, 0xfb36), - regex_range(0xfb38, 0xfb3c), - regex_range(0xfb3e), - regex_range(0xfb40, 0xfb41), - regex_range(0xfb43, 0xfb44), - regex_range(0xfb46, 0xfb4f), - regex_range(0x0610, 0x061a), # Arabic - regex_range(0x0620, 0x065f), - regex_range(0x066e, 0x06d3), - regex_range(0x06d5, 0x06dc), - regex_range(0x06de, 0x06e8), - regex_range(0x06ea, 0x06ef), - regex_range(0x06fa, 0x06fc), - regex_range(0x06ff), - regex_range(0x0750, 0x077f), # Arabic Supplement - regex_range(0x08a0), # Arabic Extended A - regex_range(0x08a2, 0x08ac), - regex_range(0x08e4, 0x08fe), - regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A - regex_range(0xfbd3, 0xfd3d), - regex_range(0xfd50, 0xfd8f), - regex_range(0xfd92, 0xfdc7), - regex_range(0xfdf0, 0xfdfb), - regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B - regex_range(0xfe76, 0xfefc), - regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner - regex_range(0x0e01, 0x0e3a), # Thai - regex_range(0x0e40, 0x0e4e), # Hangul (Korean) - regex_range(0x1100, 0x11ff), # Hangul Jamo - regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo - regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A - regex_range(0xAC00, 0xD7AF), # Hangul Syllables - regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B - regex_range(0xFFA1, 0xFFDC), # Half-width Hangul -]) - -CJ_HASHTAG_CHARACTERS = ''.join([ - regex_range(0x30A1, 0x30FA), # Katakana (full-width) - regex_range(0x30FC, 0x30FE), # Katakana (full-width) - regex_range(0xFF66, 0xFF9F), # Katakana (half-width) - regex_range(0xFF10, 0xFF19), # Latin (full-width) - regex_range(0xFF21, 0xFF3A), # Latin (full-width) - regex_range(0xFF41, 0xFF5A), # Latin (full-width) - regex_range(0x3041, 0x3096), # Hiragana - regex_range(0x3099, 0x309E), # Hiragana - regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) - regex_range(0x4E00, 0x9FFF), # Kanji (Unified) - regex_range(0x3003), # Kanji (CJK supplement) - regex_range(0x3005), # Kanji (CJK supplement) - regex_range(0x303B), # Kanji (CJK supplement) -]) - -try: - CJ_HASHTAG_CHARACTERS = ''.join([ - CJ_HASHTAG_CHARACTERS, - regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B) - regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) - regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) - regex_range(0x2F800, 0x2FA1F), # Kanji (CJK supplement) - ]) -except ValueError: - # this is a narrow python build so these extended Kanji characters won't work - pass - PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~' SPACE_CHARS = ur" \t\n\x0B\f\r" CTRL_CHARS = ur"\x00-\x1F\x7F" # A hashtag must contain latin characters, numbers and underscores, but not all numbers. HASHTAG_ALPHA = ur'[\p{L}\p{M}]' -HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) -HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) +HASHTAG_ALPHANUMERIC = ur'[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]' +HASHTAG_BOUNDARY = ur'\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]' HASHTAG = re.compile(ur'(%s)(#|#)(?!\ufe0f|\u20e3)(%s*%s%s*)' % ( HASHTAG_BOUNDARY, From d88d2e53f9278b9a3d03ffad84d2d9e15e4b3c6b Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:31:30 -0700 Subject: [PATCH 23/30] Updated mention preceding characters regex. All running tests passing. --- twitter_text/regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twitter_text/regex.py b/twitter_text/regex.py index 8bbdd38..3129e51 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -112,7 +112,7 @@ def regex_range(start, end=None): REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[##]|:\/\/)', re.IGNORECASE | re.UNICODE) REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$') -REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)') +REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)') REGEXEN['at_signs'] = re.compile(ur'[@@]') REGEXEN['valid_mention_or_list'] = re.compile( ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character From 00b9f201cd926ed9e934ee90d1b6e9ead9eadf94 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:35:40 -0700 Subject: [PATCH 24/30] Added django to test requirements. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 334171d..3cfae74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ argparse==1.2.1 PyYAML==3.10 beautifulsoup4==4.4.1 +Django==1.9.6 lxml==3.4.4 pytest==2.9.1 py==1.4.29 From 86e84d2d278c4a339e7feaf102aa1fecb65559e5 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:47:48 -0700 Subject: [PATCH 25/30] Added sudo:false to travis.yml See https://docs.travis-ci.com/user/workers/container-based-infrastructure/ --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 3c79536..57be34a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,4 @@ +sudo: false language: python python: - "2.6" From 284192dfa6b38f91f627f7219abcc07dc87cb692 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 16:48:59 -0700 Subject: [PATCH 26/30] Removed python 2.6 test run. --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 57be34a..b29bfb2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ sudo: false language: python python: - - "2.6" - "2.7" install: From 74691912871c7ecc21b9b441abca2db4d43d5adb Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 17:03:58 -0700 Subject: [PATCH 27/30] Corrected Validation._valid_match logic. --- conftest.py | 2 ++ twitter_text/validation.py | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 93c5a6c..a2218f0 100644 --- a/conftest.py +++ b/conftest.py @@ -112,12 +112,14 @@ def collect(self): 'validate': { 'cls': twitter_text.validation.Validation, 'requires_wide_build': True, + 'options': {'require_protocol': False}, 'methods': { 'tweets': 'valid_tweet_text', 'usernames': 'valid_username', 'lists': 'valid_list', 'hashtags': 'valid_hashtag', 'urls': 'valid_url', + 'urls_without_protocol': 'valid_url', }, } } diff --git a/twitter_text/validation.py b/twitter_text/validation.py index 25b4d3d..efd8bd9 100644 --- a/twitter_text/validation.py +++ b/twitter_text/validation.py @@ -149,9 +149,8 @@ def valid_url(self, unicode_domains=True, require_protocol=True): ) def _valid_match(self, string, re_obj, optional=False): - if optional and string is None: - return True - match = re_obj.match(string) + if string: + match = re_obj.match(string) if optional: return not (string and (match is None or not match.string[match.span()[0]:match.span()[1]] == string)) else: From ccbd4cf75941b1c5e95fb56269f8c6672cb7079e Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 17:04:14 -0700 Subject: [PATCH 28/30] Added handling of tweet_length tests. --- conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/conftest.py b/conftest.py index a2218f0..b076a15 100644 --- a/conftest.py +++ b/conftest.py @@ -120,6 +120,7 @@ def collect(self): 'hashtags': 'valid_hashtag', 'urls': 'valid_url', 'urls_without_protocol': 'valid_url', + 'lengths': 'tweet_length', }, } } From 801274c6ecee777ff078701a33e0d23d11805957 Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 17:35:48 -0700 Subject: [PATCH 29/30] Corrected running of url without protocol tests --- conftest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index b076a15..221c885 100644 --- a/conftest.py +++ b/conftest.py @@ -112,14 +112,13 @@ def collect(self): 'validate': { 'cls': twitter_text.validation.Validation, 'requires_wide_build': True, - 'options': {'require_protocol': False}, 'methods': { 'tweets': 'valid_tweet_text', 'usernames': 'valid_username', 'lists': 'valid_list', 'hashtags': 'valid_hashtag', 'urls': 'valid_url', - 'urls_without_protocol': 'valid_url', + 'urls_without_protocol': ('valid_url', {'require_protocol': False}), 'lengths': 'tweet_length', }, } @@ -144,10 +143,14 @@ def runtest(self): if self.section not in TEST_MAP[self.filename]['methods']: raise YamlException("{}:{} section not supported".format(self.section)) cls = TEST_MAP[self.filename]['cls'] - method_name = TEST_MAP[self.filename]['methods'][self.section] instance = cls(self.spec['text']) args = [] - kwargs = {} + try: + method_name, kwargs = TEST_MAP[self.filename]['methods'][self.section] + kwargs = kwargs.copy() + except ValueError: + kwargs = {} + method_name = TEST_MAP[self.filename]['methods'][self.section] if 'json' in self.spec: args.append(json.loads(self.spec['json'])) if 'options' in TEST_MAP[self.filename]: From 09a504339babf3911674c1855051ce90bffb158d Mon Sep 17 00:00:00 2001 From: Stephen Burrows Date: Thu, 12 May 2016 17:36:06 -0700 Subject: [PATCH 30/30] Updated short url length --- twitter_text/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twitter_text/validation.py b/twitter_text/validation.py index efd8bd9..3990ddf 100644 --- a/twitter_text/validation.py +++ b/twitter_text/validation.py @@ -9,7 +9,7 @@ MAX_LENGTH = 140 DEFAULT_TCO_URL_LENGTHS = { - 'short_url_length': 22, + 'short_url_length': 23, 'short_url_length_https': 23, 'characters_reserved_per_media': 22, }