From 45191e43d11fd95876157a4432109cc558253202 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Tue, 1 Mar 2016 16:59:12 -0800
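Subject: [PATCH 01/30] PEP8 cleanup

Bring the code base in line with PEP 8. No behaviour change is intended.

- Put each import on its own line and drop imports that this patch
  treats as unused (cgi in autolink.py, string in regex.py,
  UNICODE_SPACES in highlighter.py).
- Move the yaml import in tests.py up to the other imports.
- Remove spaces around "=" in keyword arguments and defaults, and
  collapse alignment padding around assignments.
- Use two blank lines around top-level definitions, strip trailing
  whitespace and end every file with a newline.
- Indent dict literals by four spaces and put two spaces before inline
  comments; move long trailing comments onto their own line.
- Replace backslash continuations with parentheses and "not x in y"
  with "x not in y".
- Build UNICODE_SPACES with a list comprehension instead of a loop.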
---
 setup.py                                |   4 +-
 tests.py                                |  42 +++++---
 twitter_text/__init__.py                |  34 ++++---
 twitter_text/autolink.py                |  72 +++++++-------
 twitter_text/extractor.py               |  59 ++++++------
 twitter_text/highlighter.py             |  14 ++-
 twitter_text/regex.py                   | 123 +++++++++++++-----------
 twitter_text/templatetags/twitterize.py |  11 ++-
 twitter_text/unicode.py                 |  16 +--
 twitter_text/validation.py              |  65 ++++++-------
 10 files changed, 241 insertions(+), 199 deletions(-)

diff --git a/setup.py b/setup.py
index fcdabb2..bb27c76 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 from setuptools import setup, find_packages
- 
+
 setup(
     name='twitter-text-py',
     version='2.0.2',
@@ -19,5 +19,5 @@
     ],
     include_package_data=True,
     install_requires=['setuptools'],
-    license = "BSD"
+    license="BSD"
 )
diff --git a/tests.py b/tests.py
index 891b35e..dd896c5 100644
--- a/tests.py
+++ b/tests.py
@@ -1,8 +1,22 @@
 # encoding=utf-8
 
-import twitter_text, sys, os, json, argparse, re
+import argparse
+import json
+import os
+import re
+import sys
+import twitter_text
+
 from twitter_text.unicode import force_unicode
 
+try:
+    import yaml
+except ImportError:
+    raise Exception('You need to install pyaml to run the tests')
+# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
+from yaml import Loader, SafeLoader
+
+
 narrow_build = True
 try:
     unichr(0x20000)
@@ -10,21 +24,18 @@
 except:
     pass
 
-parser = argparse.ArgumentParser(description = u'Run the integration tests for twitter_text')
-parser.add_argument('--ignore-narrow-errors', '-i', help = u'Ignore errors caused by narrow builds', default = False, action = 'store_true')
+
+parser = argparse.ArgumentParser(description=u'Run the integration tests for twitter_text')
+parser.add_argument('--ignore-narrow-errors', '-i', help=u'Ignore errors caused by narrow builds', default=False, action='store_true')
 args = parser.parse_args()
 
-try:
-    import yaml
-except ImportError:
-    raise Exception('You need to install pyaml to run the tests')
-# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
-from yaml import Loader, SafeLoader
+
 def construct_yaml_str(self, node):
     return self.construct_scalar(node)
 Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
 SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
 
+
 try:
     from bs4 import BeautifulSoup
 except ImportError:
@@ -33,15 +44,19 @@ def construct_yaml_str(self, node):
     except ImportError:
         raise Exception('You need to install BeautifulSoup to run the tests')
 
+
 def success(text):
     return (u'\033[92m%s\033[0m\n' % text).encode('utf-8')
 
+
 def error(text):
     return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
 
+
 attempted = 0
 
-def assert_equal_without_attribute_order(result, test, failure_message = None):
+
+def assert_equal_without_attribute_order(result, test, failure_message=None):
     global attempted
     attempted += 1
     # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
@@ -49,6 +64,7 @@ def assert_equal_without_attribute_order(result, test, failure_message = None):
     sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
     sys.stdout.flush()
 
+
 def assert_equal(result, test):
     global attempted
     attempted += 1
@@ -140,9 +156,9 @@ def assert_equal(result, test):
     for test in hit_highlighting_tests.get('tests').get(section):
         hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text'))
         if section == 'plain_text':
-            assert_equal(hit_highlighter.hit_highlight(hits = test.get('hits')), test)
+            assert_equal(hit_highlighter.hit_highlight(hits=test.get('hits')), test)
         elif section == 'with_links':
-            assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits = test.get('hits')), test)
+            assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits=test.get('hits')), test)
 
 # validation section
 validation_tested = False
@@ -177,4 +193,4 @@ def assert_equal(result, test):
 
 sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
 sys.stdout.flush()
-sys.exit(os.EX_OK)
\ No newline at end of file
+sys.exit(os.EX_OK)
diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py
index bb06120..6f17ac2 100644
--- a/twitter_text/__init__.py
+++ b/twitter_text/__init__.py
@@ -6,33 +6,39 @@
 from twitter_text.validation import Validation
 from twitter_text.unicode import force_unicode
 
+
 class TwitterText(object):
     def __init__(self, text):
-        self.text = force_unicode(text) # this will get modified by some functions
-        self.original_text = self.text # this never changes; use it as a fallback or for comparison
+        # this will get modified by some functions
+        self.text = force_unicode(text)
+        # this never changes; use it as a fallback or for comparison
+        self.original_text = self.text
         self.has_been_linked = False
-        self.tweet_length = None # gets changed by validation method
-        self.tweet_is_valid = None # gets changed by validation method
-        self.validation_error = None # gets changed by validation method
-        
+        # gets changed by validation method
+        self.tweet_length = None
+        # gets changed by validation method
+        self.tweet_is_valid = None
+        # gets changed by validation method
+        self.validation_error = None
+
     def __unicode__(self):
         return self.text
-        
+
     def __repr__(self):
         return self.__unicode__()
-    
+
     @property
     def autolink(self):
-        return Autolink(self.text, parent = self)
-        
+        return Autolink(self.text, parent=self)
+
     @property
     def extractor(self):
         return Extractor(self.text)
-        
+
     @property
     def highlighter(self):
-        return HitHighlighter(self.text, parent = self)
-        
+        return HitHighlighter(self.text, parent=self)
+
     @property
     def validation(self):
-        return Validation(self.text, parent = self)
\ No newline at end of file
+        return Validation(self.text, parent=self)
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 821d042..85e21aa 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -1,6 +1,6 @@
 # encoding=utf-8
 
-import re, cgi
+import re
 
 from twitter_text.regex import REGEXEN
 from twitter_text.unicode import force_unicode
@@ -28,17 +28,17 @@
 DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'"
 
 DEFAULT_OPTIONS = {
-  'list_class':             DEFAULT_LIST_CLASS,
-  'username_class':         DEFAULT_USERNAME_CLASS,
-  'hashtag_class':          DEFAULT_HASHTAG_CLASS,
-  'cashtag_class':          DEFAULT_CASHTAG_CLASS,
+    'list_class':             DEFAULT_LIST_CLASS,
+    'username_class':         DEFAULT_USERNAME_CLASS,
+    'hashtag_class':          DEFAULT_HASHTAG_CLASS,
+    'cashtag_class':          DEFAULT_CASHTAG_CLASS,
 
-  'username_url_base':      DEFAULT_USERNAME_URL_BASE,
-  'list_url_base':          DEFAULT_LIST_URL_BASE,
-  'hashtag_url_base':       DEFAULT_HASHTAG_URL_BASE,
-  'cashtag_url_base':       DEFAULT_CASHTAG_URL_BASE,
+    'username_url_base':      DEFAULT_USERNAME_URL_BASE,
+    'list_url_base':          DEFAULT_LIST_URL_BASE,
+    'hashtag_url_base':       DEFAULT_HASHTAG_URL_BASE,
+    'cashtag_url_base':       DEFAULT_CASHTAG_URL_BASE,
 
-  'invisible_tag_attrs':    DEFAULT_INVISIBLE_TAG_ATTRS,
+    'invisible_tag_attrs':    DEFAULT_INVISIBLE_TAG_ATTRS,
 }
 
 OPTIONS_NOT_ATTRIBUTES = (
@@ -69,30 +69,32 @@
 )
 
 HTML_ENTITIES = {
-  '&': '&amp;',
-  '>': '&gt;',
-  '<': '&lt;',
-  '"': '&quot;',
-  "'": '&#39;',
+    '&': '&amp;',
+    '>': '&gt;',
+    '<': '&lt;',
+    '"': '&quot;',
+    "'": '&#39;',
 }
 
 BOOLEAN_ATTRIBUTES = (
-    'disabled', 
+    'disabled',
     'readonly',
     'multiple',
     'checked',
 )
 
+
 def default_transform(entity, text):
     return text
 
+
 class Autolink(object):
     def __init__(self, text, **kwargs):
         self.text = force_unicode(text)
         self.parent = kwargs.get('parent', False)
         self.extractor = Extractor(self.text)
 
-    def auto_link_with_json(self, json_obj, options = {}):
+    def auto_link_with_json(self, json_obj, options={}):
         # concantenate entities
         entities = []
         if 'entities' in json_obj:
@@ -108,7 +110,7 @@ def auto_link_with_json(self, json_obj, options = {}):
 
         return self.auto_link_entities(entities, options)
 
-    def auto_link_entities(self, entities = [], options = {}):
+    def auto_link_entities(self, entities=[], options={}):
         if not self.text:
             return self.text
 
@@ -118,7 +120,7 @@ def auto_link_entities(self, entities = [], options = {}):
         if not options.get('suppress_no_follow', False):
             options['html_attrs']['rel'] = "nofollow"
 
-        entities.sort(key = lambda entity: entity['indices'][0], reverse = True)
+        entities.sort(key=lambda entity: entity['indices'][0], reverse=True)
         chars = self.text
 
         for entity in entities:
@@ -133,7 +135,7 @@ def auto_link_entities(self, entities = [], options = {}):
 
         return chars
 
-    def auto_link(self, options = {}):
+    def auto_link(self, options={}):
         """
         Add <a></a> tags around the usernames, lists, hashtags and URLs in the provided text.
         The <a> tags can be controlled with the following entries in the options hash.
@@ -161,7 +163,7 @@ def auto_link(self, options = {}):
         """
         return self.auto_link_entities(self.extractor.extract_entities_with_indices({'extract_url_without_protocol': False}), options)
 
-    def auto_link_usernames_or_lists(self, options = {}):
+    def auto_link_usernames_or_lists(self, options={}):
         """
         Add <a></a> tags around the usernames and lists in the provided text. The
         <a> tags can be controlled with the following entries in the options hash.
@@ -182,7 +184,7 @@ def auto_link_usernames_or_lists(self, options = {}):
         """
         return self.auto_link_entities(self.extractor.extract_mentions_or_lists_with_indices(), options)
 
-    def auto_link_hashtags(self, options = {}):
+    def auto_link_hashtags(self, options={}):
         """
         Add <a></a> tags around the hashtags in the provided text.
         The <a> tags can be controlled with the following entries in the options hash.
@@ -199,7 +201,7 @@ def auto_link_hashtags(self, options = {}):
         """
         return self.auto_link_entities(self.extractor.extract_hashtags_with_indices(), options)
 
-    def auto_link_cashtags(self, options = {}):
+    def auto_link_cashtags(self, options={}):
         """
         Add <a></a> tags around the cashtags in the provided text.
         The <a> tags can be controlled with the following entries in the options hash.
@@ -216,7 +218,7 @@ def auto_link_cashtags(self, options = {}):
         """
         return self.auto_link_entities(self.extractor.extract_cashtags_with_indices(), options)
 
-    def auto_link_urls(self, options = {}):
+    def auto_link_urls(self, options={}):
         """
         Add <a></a> tags around the URLs in the provided text.
         The <a> tags can be controlled with the following entries in the options hash.
@@ -240,13 +242,13 @@ def _html_escape(self, text):
             text = text.replace(char, HTML_ENTITIES[char])
         return text
 
-    def _extract_html_attrs_from_options(self, options = {}):
+    def _extract_html_attrs_from_options(self, options={}):
         html_attrs = options.get('html_attrs', {})
         options = options.copy()
         if 'html_attrs' in options:
             del(options['html_attrs'])
         for option in options.keys():
-            if not option in OPTIONS_NOT_ATTRIBUTES:
+            if option not in OPTIONS_NOT_ATTRIBUTES:
                 html_attrs[option] = options[option]
         return html_attrs
 
@@ -256,7 +258,7 @@ def _url_entities_hash(self, url_entities):
             entities[entity.get('url')] = entity
         return entities
 
-    def _link_to_url(self, entity, chars, options = {}):
+    def _link_to_url(self, entity, chars, options={}):
         url = entity.get('url')
 
         href = options.get('link_url_transform', lambda x: x)(url)
@@ -284,7 +286,7 @@ def _link_to_url(self, entity, chars, options = {}):
         link = self._link_to_text(entity, link_text, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
-    def _link_url_with_entity(self, entity, options = {}):
+    def _link_url_with_entity(self, entity, options={}):
         """
         Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste
         should contain the full original URL (expanded_url), not the display URL.
@@ -348,7 +350,7 @@ def _link_url_with_entity(self, entity, options = {}):
         else:
             return self._html_escape(display_url)
 
-    def _link_to_hashtag(self, entity, chars, options = {}):
+    def _link_to_hashtag(self, entity, chars, options={}):
         hashchar = chars[entity['indices'][0]]
         hashtag = entity['hashtag']
         hashtag_class = options.get('hashtag_class')
@@ -368,7 +370,7 @@ def _link_to_hashtag(self, entity, chars, options = {}):
         link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
-    def _link_to_cashtag(self, entity, chars, options = {}):
+    def _link_to_cashtag(self, entity, chars, options={}):
         dollar = chars[entity['indices'][0]]
         cashtag = entity['cashtag']
 
@@ -383,7 +385,7 @@ def _link_to_cashtag(self, entity, chars, options = {}):
         link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
-    def _link_to_screen_name(self, entity, chars, options = {}):
+    def _link_to_screen_name(self, entity, chars, options={}):
         name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
         chunk = options.get('link_text_transform', default_transform)(entity, name)
         name = name.lower()
@@ -404,7 +406,7 @@ def _link_to_screen_name(self, entity, chars, options = {}):
         link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
-    def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}):
+    def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes={}, options={}):
         tagged_symbol = u'<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
         text = self._html_escape(text)
         tagged_text = u'<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
@@ -413,14 +415,14 @@ def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {},
         else:
             return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
 
-    def _link_to_text(self, entity, text, href, attributes = {}, options = {}):
+    def _link_to_text(self, entity, text, href, attributes={}, options={}):
         attributes['href'] = href
         if options.get('link_attribute_transform'):
             attributes = options.get('link_attribute_transform')(entity, attributes)
         text = options.get('link_text_transform', default_transform)(entity, text)
         return u'<a %s>%s</a>' % (self._tag_attrs(attributes), text)
 
-    def _tag_attrs(self, attributes = {}):
+    def _tag_attrs(self, attributes={}):
         attrs = []
         for key in sorted(attributes.keys()):
             value = attributes[key]
@@ -431,4 +433,4 @@ def _tag_attrs(self, attributes = {}):
                 value = u' '.join(value)
             attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))
 
-        return u' '.join(attrs)
\ No newline at end of file
+        return u' '.join(attrs)
diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py
index 1015b8c..0898370 100644
--- a/twitter_text/extractor.py
+++ b/twitter_text/extractor.py
@@ -3,12 +3,13 @@
 from twitter_text.regex import REGEXEN
 from twitter_text.unicode import force_unicode
 
+
 class Extractor(object):
     """
     A module for including Tweet parsing in a class. This module provides function for the extraction and processing
     of usernames, lists, URLs and hashtags.
     """
-    
+
     def __init__(self, text):
         self.text = force_unicode(text)
 
@@ -19,18 +20,18 @@ def _remove_overlapping_entities(self, entities):
         """
 
         # sort by start index
-        entities.sort(key = lambda entity: entity['indices'][0])
+        entities.sort(key=lambda entity: entity['indices'][0])
 
         # remove duplicates
-        prev    =   None
+        prev = None
         for entity in [e for e in entities]:
             if prev and prev['indices'][1] > entity['indices'][0]:
                 entities.remove(entity)
             else:
-                prev    =   entity
+                prev = entity
         return entities
 
-    def extract_entities_with_indices(self, options = {}, transform = lambda x: x):
+    def extract_entities_with_indices(self, options={}, transform=lambda x: x):
         """
         Extracts all usernames, lists, hashtags and URLs  in the Tweet text
         along with the indices for where the entity ocurred
@@ -43,19 +44,21 @@ def extract_entities_with_indices(self, options = {}, transform = lambda x: x):
             return []
 
         # extract all entities
-        entities    =   self.extract_urls_with_indices(options) + \
-                        self.extract_hashtags_with_indices({'check_url_overlap': False}) + \
-                        self.extract_mentions_or_lists_with_indices() + \
-                        self.extract_cashtags_with_indices()
+        entities = (
+            self.extract_urls_with_indices(options) +
+            self.extract_hashtags_with_indices({'check_url_overlap': False}) +
+            self.extract_mentions_or_lists_with_indices() +
+            self.extract_cashtags_with_indices()
+        )
 
-        entities    =   self._remove_overlapping_entities(entities)
+        entities = self._remove_overlapping_entities(entities)
 
         for entity in entities:
-            entity  =   transform(entity)
+            entity = transform(entity)
 
         return entities
 
-    def extract_mentioned_screen_names(self, transform = lambda x: x):
+    def extract_mentioned_screen_names(self, transform=lambda x: x):
         """
         Extracts a list of all usernames mentioned in the Tweet text. If the
         text is None or contains no username mentions an empty list
@@ -65,7 +68,7 @@ def extract_mentioned_screen_names(self, transform = lambda x: x):
         """
         return [transform(mention['screen_name']) for mention in self.extract_mentioned_screen_names_with_indices()]
 
-    def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x):
+    def extract_mentioned_screen_names_with_indices(self, transform=lambda x: x):
         """
         Extracts a list of all usernames mentioned in the Tweet text
         along with the indices for where the mention ocurred.  If the
@@ -87,7 +90,7 @@ def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x):
                 })
         return possible_screen_names
 
-    def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
+    def extract_mentions_or_lists_with_indices(self, transform=lambda x: x):
         """
         Extracts a list of all usernames or lists mentioned in the Tweet text
         along with the indices for where the mention ocurred.  If the
@@ -101,7 +104,7 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
         if not REGEXEN['at_signs'].search(self.text):
             return []
 
-        possible_entries    =   []
+        possible_entries = []
         for match in REGEXEN['valid_mention_or_list'].finditer(self.text):
             try:
                 after = self.text[match.end()]
@@ -117,8 +120,8 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
             })
 
         return possible_entries
-        
-    def extract_reply_screen_name(self, transform = lambda x: x):
+
+    def extract_reply_screen_name(self, transform=lambda x: x):
         """
         Extracts the username username replied to in the Tweet text. If the
         text is None or is not a reply None will be returned.
@@ -135,8 +138,8 @@ def extract_reply_screen_name(self, transform = lambda x: x):
             else:
                 possible_screen_name = transform(possible_screen_name.group(1))
         return possible_screen_name
-        
-    def extract_urls(self, transform = lambda x: x):
+
+    def extract_urls(self, transform=lambda x: x):
         """
         Extracts a list of all URLs included in the Tweet text. If the
         text is None or contains no URLs an empty list
@@ -145,8 +148,8 @@ def extract_urls(self, transform = lambda x: x):
         If a transform is given then it will be called for each URL.
         """
         return [transform(url['url']) for url in self.extract_urls_with_indices()]
-        
-    def extract_urls_with_indices(self, options = {'extract_url_without_protocol': True}):
+
+    def extract_urls_with_indices(self, options={'extract_url_without_protocol': True}):
         """
         Extracts a list of all URLs included in the Tweet text along
         with the indices. If the text is None or contains no
@@ -192,8 +195,8 @@ def extract_urls_with_indices(self, options = {'extract_url_without_protocol': T
                     'indices':  [start_position, end_position]
                 })
         return urls
-        
-    def extract_hashtags(self, transform = lambda x: x):
+
+    def extract_hashtags(self, transform=lambda x: x):
         """
         Extracts a list of all hashtags included in the Tweet text. If the
         text is None or contains no hashtags an empty list
@@ -203,8 +206,8 @@ def extract_hashtags(self, transform = lambda x: x):
         If a block is given then it will be called for each hashtag.
         """
         return [transform(hashtag['hashtag']) for hashtag in self.extract_hashtags_with_indices()]
-        
-    def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, transform = lambda x: x):
+
+    def extract_hashtags_with_indices(self, options={'check_url_overlap': True}, transform=lambda x: x):
         """
         Extracts a list of all hashtags included in the Tweet text. If the
         text is None or contains no hashtags an empty list
@@ -234,7 +237,7 @@ def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, t
 
         return tags
 
-    def extract_cashtags(self, transform = lambda x: x):
+    def extract_cashtags(self, transform=lambda x: x):
         """
         Extracts a list of all cashtags included in the Tweet text. If the
         text is None or contains no cashtags an empty list
@@ -245,7 +248,7 @@ def extract_cashtags(self, transform = lambda x: x):
         """
         return [cashtag['cashtag'] for cashtag in self.extract_cashtags_with_indices()]
 
-    def extract_cashtags_with_indices(self, transform = lambda x: x):
+    def extract_cashtags_with_indices(self, transform=lambda x: x):
         """
         Extracts a list of all cashtags included in the Tweet text. If the
         text is None or contains no cashtags an empty list
@@ -267,4 +270,4 @@ def extract_cashtags_with_indices(self, transform = lambda x: x):
                 'indices':  [start_position, end_position]
             })
 
-        return tags
\ No newline at end of file
+        return tags
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index ec128ca..90bbdfd 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -3,37 +3,41 @@
 import re
 from HTMLParser import HTMLParser
 
-from twitter_text.regex import UNICODE_SPACES
 from twitter_text.unicode import force_unicode
 
 DEFAULT_HIGHLIGHT_TAG = 'em'
 
+
 # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
 class MLStripper(HTMLParser):
     def __init__(self):
         self.reset()
         self.fed = []
+
     def handle_data(self, d):
         self.fed.append(d)
+
     def get_data(self):
         return ''.join(self.fed)
 
+
 def strip_tags(html):
     s = MLStripper()
     s.feed(html)
     return s.get_data()
 
+
 class HitHighlighter(object):
     def __init__(self, text, **kwargs):
         self.text = force_unicode(text)
         self.parent = kwargs.get('parent', False)
 
-    def hit_highlight(self, hits = [], **kwargs):
+    def hit_highlight(self, hits=[], **kwargs):
         if not hits and not kwargs.get('query'):
             return self.text
 
         if not hits and kwargs.get('query'):
-            stripped_text   =   strip_tags(self.text)
+            stripped_text = strip_tags(self.text)
             for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text):
                 hits.append(match.span())
 
@@ -49,7 +53,7 @@ def hit_highlight(self, hits = [], **kwargs):
         for index, chunk in enumerate(chunks):
             if not index % 2:
                 text_chunks.append(chunk)
-        for hit in sorted(hits, key = lambda chunk: chunk[1], reverse = True):
+        for hit in sorted(hits, key=lambda chunk: chunk[1], reverse=True):
             hit_start, hit_end = hit
             placed = 0
             for index, chunk in enumerate(chunks):
@@ -80,4 +84,4 @@ def hit_highlight(self, hits = [], **kwargs):
             else:
                 result.append(chunk)
         self.text = u''.join(result)
-        return self.text
\ No newline at end of file
+        return self.text
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index c136f80..fffebde 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -4,44 +4,44 @@
 # list is frozen at load time to ensure immutability. These reular expressions are
 # used throughout the Twitter classes. Special care has been taken to make
 # sure these reular expressions work with Tweets in all languages.
-import re, string
+import re
 
-REGEXEN = {} # :nodoc:
+REGEXEN = {}  # :nodoc:
 
-def regex_range(start, end = None):
+
+def regex_range(start, end=None):
     if end:
         return u'%s-%s' % (unichr(start), unichr(end))
     else:
         return u'%s' % unichr(start)
 
+
 # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
 # to access both the list of characters and a pattern suitible for use with String#split
 #  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
-UNICODE_SPACES = []
-for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
-        range(0x0009, 0x000D),  # White_Space # Cc   [5] <control-0009>..<control-000D>
-        0x0020,                 # White_Space # Zs       SPACE
-        0x0085,                 # White_Space # Cc       <control-0085>
-        0x00A0,                 # White_Space # Zs       NO-BREAK SPACE
-        0x1680,                 # White_Space # Zs       OGHAM SPACE MARK
-        0x180E,                 # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-        range(0x2000, 0x200A),  # White_Space # Zs  [11] EN QUAD..HAIR SPACE
-        0x2028,                 # White_Space # Zl       LINE SEPARATOR
-        0x2029,                 # White_Space # Zp       PARAGRAPH SEPARATOR
-        0x202F,                 # White_Space # Zs       NARROW NO-BREAK SPACE
-        0x205F,                 # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
-        0x3000,                 # White_Space # Zs       IDEOGRAPHIC SPACE
-    ]):
-    UNICODE_SPACES.append(unichr(space))
+UNICODE_SPACES = [unichr(space) for space in reduce(lambda x, y: x + y if type(y) == list else x + [y], [
+    range(0x0009, 0x000D),  # White_Space # Cc   [5] <control-0009>..<control-000D>
+    0x0020,                 # White_Space # Zs       SPACE
+    0x0085,                 # White_Space # Cc       <control-0085>
+    0x00A0,                 # White_Space # Zs       NO-BREAK SPACE
+    0x1680,                 # White_Space # Zs       OGHAM SPACE MARK
+    0x180E,                 # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+    range(0x2000, 0x200A),  # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+    0x2028,                 # White_Space # Zl       LINE SEPARATOR
+    0x2029,                 # White_Space # Zp       PARAGRAPH SEPARATOR
+    0x202F,                 # White_Space # Zs       NARROW NO-BREAK SPACE
+    0x205F,                 # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+    0x3000,                 # White_Space # Zs       IDEOGRAPHIC SPACE
+])]
 REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
 
 # Characters not allowed in Tweets
-INVALID_CHARACTERS  =   [
-    0xFFFE, 0xFEFF,                         # BOM
-    0xFFFF,                                 # Special
-    0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
+INVALID_CHARACTERS = [
+    0xFFFE, 0xFEFF,                          # BOM
+    0xFFFF,                                  # Special
+    0x202A, 0x202B, 0x202C, 0x202D, 0x202E,  # Directional change
 ]
-REGEXEN['invalid_control_characters']   =   [unichr(x) for x in INVALID_CHARACTERS]
+REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
 
 REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
 
@@ -71,32 +71,32 @@ def regex_range(start, end = None):
 LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
 
 RTL_CHARACTERS = ''.join([
-    regex_range(0x0600,0x06FF),
-    regex_range(0x0750,0x077F),
-    regex_range(0x0590,0x05FF),
-    regex_range(0xFE70,0xFEFF)
+    regex_range(0x0600, 0x06FF),
+    regex_range(0x0750, 0x077F),
+    regex_range(0x0590, 0x05FF),
+    regex_range(0xFE70, 0xFEFF)
 ])
 
 NON_LATIN_HASHTAG_CHARS = ''.join([
     # Cyrillic (Russian, Ukrainian, etc.)
-    regex_range(0x0400, 0x04ff), # Cyrillic
-    regex_range(0x0500, 0x0527), # Cyrillic Supplement
-    regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
-    regex_range(0xa640, 0xa69f), # Cyrillic Extended B
-    regex_range(0x0591, 0x05bf), # Hebrew
+    regex_range(0x0400, 0x04ff),  # Cyrillic
+    regex_range(0x0500, 0x0527),  # Cyrillic Supplement
+    regex_range(0x2de0, 0x2dff),  # Cyrillic Extended A
+    regex_range(0xa640, 0xa69f),  # Cyrillic Extended B
+    regex_range(0x0591, 0x05bf),  # Hebrew
     regex_range(0x05c1, 0x05c2),
     regex_range(0x05c4, 0x05c5),
     regex_range(0x05c7),
     regex_range(0x05d0, 0x05ea),
     regex_range(0x05f0, 0x05f4),
-    regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
+    regex_range(0xfb12, 0xfb28),  # Hebrew Presentation Forms
     regex_range(0xfb2a, 0xfb36),
     regex_range(0xfb38, 0xfb3c),
     regex_range(0xfb3e),
     regex_range(0xfb40, 0xfb41),
     regex_range(0xfb43, 0xfb44),
     regex_range(0xfb46, 0xfb4f),
-    regex_range(0x0610, 0x061a), # Arabic
+    regex_range(0x0610, 0x061a),  # Arabic
     regex_range(0x0620, 0x065f),
     regex_range(0x066e, 0x06d3),
     regex_range(0x06d5, 0x06dc),
@@ -104,44 +104,51 @@ def regex_range(start, end = None):
     regex_range(0x06ea, 0x06ef),
     regex_range(0x06fa, 0x06fc),
     regex_range(0x06ff),
-    regex_range(0x0750, 0x077f), # Arabic Supplement
+    regex_range(0x0750, 0x077f),  # Arabic Supplement
     regex_range(0x08a0),         # Arabic Extended A
     regex_range(0x08a2, 0x08ac),
     regex_range(0x08e4, 0x08fe),
-    regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
+    regex_range(0xfb50, 0xfbb1),  # Arabic Pres. Forms A
     regex_range(0xfbd3, 0xfd3d),
     regex_range(0xfd50, 0xfd8f),
     regex_range(0xfd92, 0xfdc7),
     regex_range(0xfdf0, 0xfdfb),
-    regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
+    regex_range(0xfe70, 0xfe74),  # Arabic Pres. Forms B
     regex_range(0xfe76, 0xfefc),
-    regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
-    regex_range(0x0e01, 0x0e3a), # Thai
-    regex_range(0x0e40, 0x0e4e), # Hangul (Korean)
-    regex_range(0x1100, 0x11ff), # Hangul Jamo
-    regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
-    regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
-    regex_range(0xAC00, 0xD7AF), # Hangul Syllables
-    regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
-    regex_range(0xFFA1, 0xFFDC)  # Half-width Hangul
+    regex_range(0x200c, 0x200c),  # Zero-Width Non-Joiner
+    regex_range(0x0e01, 0x0e3a),  # Thai
+    regex_range(0x0e40, 0x0e4e),  # Hangul (Korean)
+    regex_range(0x1100, 0x11ff),  # Hangul Jamo
+    regex_range(0x3130, 0x3185),  # Hangul Compatibility Jamo
+    regex_range(0xA960, 0xA97F),  # Hangul Jamo Extended-A
+    regex_range(0xAC00, 0xD7AF),  # Hangul Syllables
+    regex_range(0xD7B0, 0xD7FF),  # Hangul Jamo Extended-B
+    regex_range(0xFFA1, 0xFFDC),  # Half-width Hangul
 ])
 
 CJ_HASHTAG_CHARACTERS = ''.join([
-    regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
-    regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
-    regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
-    regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
-    regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
-    regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
+    regex_range(0x30A1, 0x30FA),  # Katakana (full-width)
+    regex_range(0x30FC, 0x30FE),  # Katakana (full-width)
+    regex_range(0xFF66, 0xFF9F),  # Katakana (half-width)
+    regex_range(0xFF10, 0xFF19),  # Latin (full-width)
+    regex_range(0xFF21, 0xFF3A),  # Latin (full-width)
+    regex_range(0xFF41, 0xFF5A),  # Latin (full-width)
+    regex_range(0x3041, 0x3096),  # Hiragana
+    regex_range(0x3099, 0x309E),  # Hiragana
+    regex_range(0x3400, 0x4DBF),  # Kanji (CJK Extension A)
+    regex_range(0x4E00, 0x9FFF),  # Kanji (Unified)
 ])
 
 try:
     CJ_HASHTAG_CHARACTERS = ''.join([
         CJ_HASHTAG_CHARACTERS,
-        regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
-        regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
-        regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
-        regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
+        regex_range(0x20000, 0x2A6DF),  # Kanji (CJK Extension B)
+        regex_range(0x2A700, 0x2B73F),  # Kanji (CJK Extension C)
+        regex_range(0x2B740, 0x2B81F),  # Kanji (CJK Extension D)
+        regex_range(0x2F800, 0x2FA1F),  # Kanji (CJK supplement)
+        regex_range(0x3003),  # Kanji (CJK supplement)
+        regex_range(0x3005),  # Kanji (CJK supplement)
+        regex_range(0x303B),  # Kanji (CJK supplement)
     ])
 except ValueError:
     # this is a narrow python build so these extended Kanji characters won't work
@@ -171,7 +178,7 @@ def regex_range(start, end = None):
     ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?'                                           # list (optional)
 )
 REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
- # Used in Extractor for final filtering
+# Used in Extractor for final filtering
 REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
 
 # URL related hash regex collection
diff --git a/twitter_text/templatetags/twitterize.py b/twitter_text/templatetags/twitterize.py
index 01db63d..b58779a 100644
--- a/twitter_text/templatetags/twitterize.py
+++ b/twitter_text/templatetags/twitterize.py
@@ -8,15 +8,16 @@
 
 register = Library()
 
-@register.filter(name = 'twitter_text')
+
+@register.filter(name='twitter_text')
 @stringfilter
-def twitter_text(text, search_query = False):
+def twitter_text(text, search_query=False):
     """
     Parses a text string through the TwitterText auto_link method and if search_query is passed, through the hit_highlight method.
     """
     tt = TwitterText(text)
     if search_query:
-        tt.text     =   tt.highlighter.hit_highlight(query = search_query)
-    tt.text         =   tt.autolink.auto_link()
+        tt.text = tt.highlighter.hit_highlight(query=search_query)
+    tt.text = tt.autolink.auto_link()
     return tt.text
-twitter_text.is_safe = True
\ No newline at end of file
+twitter_text.is_safe = True
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..e67238c 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,6 +1,8 @@
-import types, datetime
+import datetime
+import types
 from decimal import Decimal
 
+
 # borrowed from django.utils.encoding
 class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self, obj, *args):
@@ -10,7 +12,8 @@ def __init__(self, obj, *args):
     def __str__(self):
         original = UnicodeDecodeError.__str__(self)
         return '%s. You passed in %r (%s)' % (original, self.obj,
-                type(self.obj))
+                                              type(self.obj))
+
 
 def is_protected_type(obj):
     """Determine if the object instance is of a protected type.
@@ -25,6 +28,7 @@ def is_protected_type(obj):
         float, Decimal)
     )
 
+
 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
     """
     Similar to smart_unicode, except that lazy instances are resolved to
@@ -50,8 +54,8 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
                     # without raising a further exception. We do an
                     # approximation to what the Exception's standard str()
                     # output should be.
-                    s = ' '.join([force_unicode(arg, encoding, strings_only,
-                            errors) for arg in s])
+                    s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+                                  for arg in s])
         elif not isinstance(s, unicode):
             # Note: We use .decode() here, instead of unicode(s, encoding,
             # errors), so that if s is a SafeString, it ends up being a
@@ -66,6 +70,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
             # working unicode method. Try to handle this without raising a
             # further exception by individually forcing the exception args
             # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                    errors) for arg in s])
+            s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+                          for arg in s])
     return s
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..eabd955 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -9,17 +9,18 @@
 MAX_LENGTH = 140
 
 DEFAULT_TCO_URL_LENGTHS = {
-  'short_url_length': 22,
-  'short_url_length_https': 23,
-  'characters_reserved_per_media': 22,
+    'short_url_length': 22,
+    'short_url_length_https': 23,
+    'characters_reserved_per_media': 22,
 }
 
+
 class Validation(object):
     def __init__(self, text, **kwargs):
         self.text = force_unicode(text)
         self.parent = kwargs.get('parent', False)
-        
-    def tweet_length(self, options = {}):
+
+    def tweet_length(self, options={}):
         """
         Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC
         (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
@@ -34,10 +35,10 @@ def tweet_length(self, options = {}):
          The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
         """
 
-        assert (not self.parent or not getattr(self.parent, 'has_been_linked', False) ), 'The validator should only be run on text before it has been modified.'
+        assert (not self.parent or not getattr(self.parent, 'has_been_linked', False)), 'The validator should only be run on text before it has been modified.'
 
         for key in DEFAULT_TCO_URL_LENGTHS:
-            if not key in options:
+            if key not in options:
                 options[key] = DEFAULT_TCO_URL_LENGTHS[key]
 
         length = len(self.text)
@@ -52,21 +53,22 @@ def tweet_length(self, options = {}):
         if self.parent and hasattr(self.parent, 'tweet_length'):
             self.parent.tweet_length = length
         return length
-    
+
     def tweet_invalid(self):
         """
         Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
         before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
         will allow quicker feedback.
-        
+
         Returns false if this text is valid. Otherwise one of the following Symbols will be returned:
-        
+
             "Too long":: if the text is too long
             "Empty text":: if the text is empty
             "Invalid characters":: if the text contains non-Unicode or any of the disallowed Unicode characters
         """
 
-        valid = True # optimism
+        # optimism
+        valid = True
         validation_error = None
 
         if not self.tweet_length():
@@ -77,7 +79,7 @@ def tweet_invalid(self):
 
         if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
             valid, validation_error = False, 'Invalid characters'
-            
+
         if self.parent and hasattr(self.parent, 'tweet_is_valid'):
             self.parent.tweet_is_valid = valid
         if self.parent and hasattr(self.parent, 'tweet_validation_error'):
@@ -108,7 +110,7 @@ def valid_hashtag(self):
 
         return len(extracted) == 1 and extracted[0] == self.text[1:]
 
-    def valid_url(self, unicode_domains = True, require_protocol = True):
+    def valid_url(self, unicode_domains=True, require_protocol=True):
         if not self.text:
             return False
 
@@ -121,35 +123,32 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
 
         if not (
             (
-                not require_protocol 
-                or (
-                    self._valid_match(scheme, REGEXEN['validate_url_scheme']) 
-                    and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+                not require_protocol or (
+                    self._valid_match(scheme, REGEXEN['validate_url_scheme']) and
+                    re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
                 )
-            )
-            and (
-                path == ''
-                or self._valid_match(path, REGEXEN['validate_url_path'])
-            )
-            and self._valid_match(query, REGEXEN['validate_url_query'], True)
-            and self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
+            ) and (
+                path == '' or
+                self._valid_match(path, REGEXEN['validate_url_path'])
+            ) and
+            self._valid_match(query, REGEXEN['validate_url_query'], True) and
+            self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
         ):
             return False
 
         return bool(
             (
-                unicode_domains 
-                and self._valid_match(authority, REGEXEN['validate_url_unicode_authority'])
-                and REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
-            )
-            or (
-                not unicode_domains
-                and self._valid_match(authority, REGEXEN['validate_url_authority'])
-                and REGEXEN['validate_url_authority'].match(authority).string == authority
+                unicode_domains and
+                self._valid_match(authority, REGEXEN['validate_url_unicode_authority']) and
+                REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
+            ) or (
+                not unicode_domains and
+                self._valid_match(authority, REGEXEN['validate_url_authority']) and
+                REGEXEN['validate_url_authority'].match(authority).string == authority
             )
         )
 
-    def _valid_match(self, string, re_obj, optional = False):
+    def _valid_match(self, string, re_obj, optional=False):
         if optional and string is None:
             return True
         match = re_obj.match(string)

From 07585c5058462251be47133ac3ac958f054e085a Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 13:54:59 -0700
Subject: [PATCH 02/30] Updated travis badge.

[skip ci]
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fb6d3cd..1c0537d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 A port of the Ruby gem [twitter-text-rb](https://github.com/twitter/twitter-text-rb) to Python.
 
-[![Build Status](https://travis-ci.org/dryan/twitter-text-py.png?branch=master)](https://travis-ci.org/dryan/twitter-text-py)
+[![Build Status](https://travis-ci.org/muckrack/twitter-text-py.svg?branch=master)](https://travis-ci.org/muckrack/twitter-text-py)
 
 # Changes in 2.0
 

From ee1b68e78fe0949b5355572314af1684c9537afc Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Tue, 1 Mar 2016 16:34:17 -0800
Subject: [PATCH 03/30] Updated twitter-text-conformance submodule to latest
 master.

---
 twitter-text-conformance | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/twitter-text-conformance b/twitter-text-conformance
index 9b58c44..a39ec58 160000
--- a/twitter-text-conformance
+++ b/twitter-text-conformance
@@ -1 +1 @@
-Subproject commit 9b58c44302c4ab5bab261f6cfaf6ca89b5a6cf35
+Subproject commit a39ec5875528aaf0a874f384536fcd8e904d9fd8

From 4644183b8bcad7f062d8638b134ac8ac6b8ddc63 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Tue, 1 Mar 2016 17:43:43 -0800
Subject: [PATCH 04/30] Started transition to py.test testing

This allows tests to run completely instead of halting at the first error,
and removes some boilerplate.
---
 conftest.py                 | 134 ++++++++++++++++++++
 requirements.txt            |   2 +
 tests.py                    |  10 +-
 twitter_text/__init__.py    |   4 +-
 twitter_text/autolink.py    |   4 +-
 twitter_text/encoding.py    | 239 ++++++++++++++++++++++++++++++++++++
 twitter_text/extractor.py   |   4 +-
 twitter_text/highlighter.py |   4 +-
 twitter_text/validation.py  |   6 +-
 9 files changed, 392 insertions(+), 15 deletions(-)
 create mode 100644 conftest.py
 create mode 100644 twitter_text/encoding.py

diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..fb5d4b2
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,134 @@
+# encoding=utf-8
+
+from __future__ import unicode_literals
+
+import argparse
+import json
+import os
+import re
+import sys
+
+import pytest
+
+import twitter_text
+from twitter_text.encoding import force_text, smart_bytes
+
+try:
+    import yaml
+except ImportError:
+    raise Exception('You need to install pyaml to run the tests')
+# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
+from yaml import Loader, SafeLoader
+
+
+narrow_build = True
+try:
+    unichr(0x20000)
+    narrow_build = False
+except:
+    pass
+
+
+def construct_yaml_str(self, node):
+    return self.construct_scalar(node)
+Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
+SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
+
+
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    try:
+        from BeautifulSoup import BeautifulSoup
+    except ImportError:
+        raise Exception('You need to install BeautifulSoup to run the tests')
+
+
+def assert_equal_without_attribute_order(result, test, failure_message=None):
+    # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
+    assert BeautifulSoup(result) == BeautifulSoup(test.get('expected'))
+
+
+def assert_equal(result, test):
+    assert result == test.get('expected')
+
+
+def pytest_collect_file(parent, path):
+    if path.ext == '.yml':
+        return YamlFile(path, parent)
+
+
+class YamlException(Exception):
+    """ custom exception for error reporting. """
+
+
+class YamlFile(pytest.File):
+    def collect(self):
+        raw = yaml.safe_load(force_text(self.fspath.open().read()))
+        if 'tests' not in raw:
+            return
+        filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0]
+        for section, specs in raw['tests'].items():
+            for spec in specs:
+                yield YamlItem(self, filename, section, spec)
+
+
+TEST_MAP = {
+    'autolink': {
+        'cls': None,
+        'methods': {
+        },
+    },
+    'extract': {
+        'cls': twitter_text.extractor.Extractor,
+        'methods': {
+            'mentions': 'extract_mentioned_screen_names',
+            'mentions_with_indices': 'extract_mentioned_screen_names_with_indices',
+            'mentions_or_lists_with_indices': 'extract_mentions_or_lists_with_indices',
+            'replies': 'extract_reply_screen_name',
+            'urls': 'extract_urls',
+            'urls_with_indices': 'extract_urls_with_indices',
+            'hashtags': 'extract_hashtags',
+            'cashtags': 'extract_cashtags',
+            'hashtags_with_indices': 'extract_hashtags_with_indices',
+            'cashtags_with_indices': 'extract_cashtags_with_indices',
+        },
+    },
+    'hit_highlighting': {
+        'cls': None,
+        'methods': {
+        },
+    }
+}
+
+
+class YamlItem(pytest.Item):
+    def __init__(self, parent, filename, section, spec):
+        self.section = section
+        self.filename = filename
+        self.spec = spec
+        name = "{}:{}:{}".format(filename, section, spec['description'])
+        super(YamlItem, self).__init__(name, parent)
+
+    def runtest(self):
+        """Run this spec against its mapped method; raise YamlException on mismatch or unsupported file/section."""
+        if self.filename not in TEST_MAP:
+            raise YamlException("{} file not supported".format(self.filename))
+        if self.section not in TEST_MAP[self.filename]['methods']:
+            raise YamlException("{}:{} section not supported".format(self.filename, self.section))
+        cls = TEST_MAP[self.filename]['cls']
+        method_name = TEST_MAP[self.filename]['methods'][self.section]
+        result = getattr(cls(self.spec['text']), method_name)()
+        if result != self.spec['expected']:
+            raise YamlException("{} != {}".format(result, self.spec['expected']))
+
+    def repr_failure(self, excinfo):
+        """ called when self.runtest() raises an exception. """
+        if isinstance(excinfo.value, YamlException):
+            return smart_bytes("\n".join([
+                "usecase execution failed",
+                "   {}".format(*excinfo.value.args)
+            ]))
+
+    def reportinfo(self):
+        return self.fspath, 0, smart_bytes("usecase: %s" % self.name)
diff --git a/requirements.txt b/requirements.txt
index 0ac3552..d001e1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 argparse==1.2.1
 PyYAML==3.10
 beautifulsoup4==4.2.0
+pytest==2.8.7
+py==1.4.29
diff --git a/tests.py b/tests.py
index dd896c5..97da4c0 100644
--- a/tests.py
+++ b/tests.py
@@ -53,6 +53,8 @@ def error(text):
     return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
 
 
+CURRENT_DIR = os.path.dirname(__file__)
+CONFORMANCE_DIR = os.path.join(CURRENT_DIR, 'twitter-text-conformance/conformance')
 attempted = 0
 
 
@@ -73,7 +75,7 @@ def assert_equal(result, test):
     sys.stdout.flush()
 
 # extractor section
-extractor_file = open(os.path.join('twitter-text-conformance', 'extract.yml'), 'r')
+extractor_file = open(os.path.join(CONFORMANCE_DIR, 'extract.yml'), 'r')
 extractor_tests = yaml.load(force_unicode(extractor_file.read()))
 extractor_file.close()
 
@@ -111,7 +113,7 @@ def assert_equal(result, test):
             assert_equal(extractor.extract_cashtags_with_indices(), test)
 
 # autolink section
-autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r')
+autolink_file = open(os.path.join(CONFORMANCE_DIR, 'autolink.yml'), 'r')
 autolink_tests = yaml.load(force_unicode(autolink_file.read()))
 autolink_file.close()
 
@@ -144,7 +146,7 @@ def assert_equal(result, test):
             assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)
 
 # hit_highlighting section
-hit_highlighting_file = open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r')
+hit_highlighting_file = open(os.path.join(CONFORMANCE_DIR, 'hit_highlighting.yml'), 'r')
 hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read()))
 hit_highlighting_file.close()
 
@@ -164,7 +166,7 @@ def assert_equal(result, test):
 validation_tested = False
 validate_tests = None
 try:
-    validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r')
+    validate_file = open(os.path.join(CONFORMANCE_DIR, 'validate.yml'), 'r')
     validate_file_contents = validate_file.read()
     validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape')))
     validate_file.close()
diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py
index 6f17ac2..e267dac 100644
--- a/twitter_text/__init__.py
+++ b/twitter_text/__init__.py
@@ -4,13 +4,13 @@
 from twitter_text.extractor import Extractor
 from twitter_text.highlighter import HitHighlighter
 from twitter_text.validation import Validation
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 
 
 class TwitterText(object):
     def __init__(self, text):
         # this will get modified by some functions
-        self.text = force_unicode(text)
+        self.text = force_text(text)
         # this never changes; use it as a fallback or for comparison
         self.original_text = self.text
         self.has_been_linked = False
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 85e21aa..8e40227 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -3,7 +3,7 @@
 import re
 
 from twitter_text.regex import REGEXEN
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 from twitter_text.extractor import Extractor
 
 # Default CSS class for auto-linked lists
@@ -90,7 +90,7 @@ def default_transform(entity, text):
 
 class Autolink(object):
     def __init__(self, text, **kwargs):
-        self.text = force_unicode(text)
+        self.text = force_text(text)
         self.parent = kwargs.get('parent', False)
         self.extractor = Extractor(self.text)
 
diff --git a/twitter_text/encoding.py b/twitter_text/encoding.py
new file mode 100644
index 0000000..bde3ce7
--- /dev/null
+++ b/twitter_text/encoding.py
@@ -0,0 +1,239 @@
+# flake8: noqa
+# Taken from django.utils.encoding. NOTE: still imports django (Promise, six) below, so Django must be importable.
+from __future__ import unicode_literals
+
+import codecs
+import datetime
+from decimal import Decimal
+import locale
+
+from django.utils.functional import Promise
+from django.utils import six
+from django.utils.six.moves.urllib.parse import quote
+
+
+class DjangoUnicodeDecodeError(UnicodeDecodeError):
+    def __init__(self, obj, *args):
+        self.obj = obj
+        UnicodeDecodeError.__init__(self, *args)
+
+    def __str__(self):
+        original = UnicodeDecodeError.__str__(self)
+        return '%s. You passed in %r (%s)' % (original, self.obj,
+                type(self.obj))
+
+
+def python_2_unicode_compatible(klass):
+    """
+    A decorator that defines __unicode__ and __str__ methods under Python 2.
+    Under Python 3 it does nothing.
+
+    To support Python 2 and 3 with a single code base, define a __str__ method
+    returning text and apply this decorator to the class.
+    """
+    if six.PY2:
+        if '__str__' not in klass.__dict__:
+            raise ValueError("@python_2_unicode_compatible cannot be applied "
+                             "to %s because it doesn't define __str__()." %
+                             klass.__name__)
+        klass.__unicode__ = klass.__str__
+        klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
+    return klass
+
+
+def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Returns a text object representing 's' -- unicode on Python 2 and str on
+    Python 3. Treats bytestrings using the 'encoding' codec.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    if isinstance(s, Promise):
+        # The input is the result of a gettext_lazy() call.
+        return s
+    return force_text(s, encoding, strings_only, errors)
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_text(strings_only=True).
+    """
+    return isinstance(obj, six.integer_types + (type(None), float, Decimal,
+        datetime.datetime, datetime.date, datetime.time))
+
+
+def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_text, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    # Handle the common case first for performance reasons.
+    if isinstance(s, six.text_type):
+        return s
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, six.string_types):
+            if six.PY3:
+                if isinstance(s, bytes):
+                    s = six.text_type(s, encoding, errors)
+                else:
+                    s = six.text_type(s)
+            elif hasattr(s, '__unicode__'):
+                s = six.text_type(s)
+            else:
+                s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s, encoding,
+            # errors), so that if s is a SafeBytes, it ends up being a
+            # SafeText at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError as e:
+        if not isinstance(s, Exception):
+            raise DjangoUnicodeDecodeError(s, *e.args)
+        else:
+            # If we get to here, the caller has passed in an Exception
+            # subclass populated with non-ASCII bytestring data without a
+            # working unicode method. Try to handle this without raising a
+            # further exception by individually forcing the exception args
+            # to unicode.
+            s = ' '.join([force_text(arg, encoding, strings_only,
+                    errors) for arg in s])
+    return s
+
+
+def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Returns a bytestring version of 's', encoded as specified in 'encoding'.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    if isinstance(s, Promise):
+        # The input is the result of a gettext_lazy() call.
+        return s
+    return force_bytes(s, encoding, strings_only, errors)
+
+
+def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_bytes, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    # Handle the common case first for performance reasons.
+    if isinstance(s, bytes):
+        if encoding == 'utf-8':
+            return s
+        else:
+            return s.decode('utf-8', errors).encode(encoding, errors)
+    if strings_only and is_protected_type(s):
+        return s
+    if isinstance(s, six.memoryview):
+        return bytes(s)
+    if isinstance(s, Promise):
+        return six.text_type(s).encode(encoding, errors)
+    if not isinstance(s, six.string_types):
+        try:
+            if six.PY3:
+                return six.text_type(s).encode(encoding)
+            else:
+                return bytes(s)
+        except UnicodeEncodeError:
+            if isinstance(s, Exception):
+                # An Exception subclass containing non-ASCII data that doesn't
+                # know how to print itself properly. We shouldn't raise a
+                # further exception.
+                return b' '.join([force_bytes(arg, encoding, strings_only,
+                        errors) for arg in s])
+            return six.text_type(s).encode(encoding, errors)
+    else:
+        return s.encode(encoding, errors)
+
+if six.PY3:
+    smart_str = smart_text
+    force_str = force_text
+else:
+    smart_str = smart_bytes
+    force_str = force_bytes
+    # backwards compatibility for Python 2
+    smart_unicode = smart_text
+    force_unicode = force_text
+
+smart_str.__doc__ = """
+Apply smart_text in Python 3 and smart_bytes in Python 2.
+
+This is suitable for writing to sys.stdout (for instance).
+"""
+
+force_str.__doc__ = """
+Apply force_text in Python 3 and force_bytes in Python 2.
+"""
+
+
+def iri_to_uri(iri):
+    """
+    Convert an Internationalized Resource Identifier (IRI) portion to a URI
+    portion that is suitable for inclusion in a URL.
+
+    This is the algorithm from section 3.1 of RFC 3987.  However, since we are
+    assuming input is either UTF-8 or unicode already, we can simplify things a
+    little from the full method.
+
+    Returns an ASCII string containing the encoded result.
+    """
+    # The list of safe characters here is constructed from the "reserved" and
+    # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
+    #     reserved    = gen-delims / sub-delims
+    #     gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    #     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
+    #                   / "*" / "+" / "," / ";" / "="
+    #     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    # Of the unreserved characters, urllib.quote already considers all but
+    # the ~ safe.
+    # The % character is also added to the list of safe characters here, as the
+    # end of section 3.1 of RFC 3987 specifically mentions that % must not be
+    # converted.
+    if iri is None:
+        return iri
+    return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
+
+
+def filepath_to_uri(path):
+    """Convert a file system path to a URI portion that is suitable for
+    inclusion in a URL.
+
+    We are assuming input is either UTF-8 or unicode already.
+
+    This method will encode certain chars that would normally be recognized as
+    special chars for URIs.  Note that this method does not encode the '
+    character, as it is a valid character within URIs.  See
+    encodeURIComponent() JavaScript function for more details.
+
+    Returns an ASCII string containing the encoded result.
+    """
+    if path is None:
+        return path
+    # I know about `os.sep` and `os.altsep` but I want to leave
+    # some flexibility for hardcoding separators.
+    return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'")
+
+
+def get_system_encoding():
+    """
+    The encoding of the default system locale but falls back to the given
+    fallback encoding if the encoding is unsupported by python or could
+    not be determined.  See tickets #10335 and #5846
+    """
+    try:
+        encoding = locale.getdefaultlocale()[1] or 'ascii'
+        codecs.lookup(encoding)
+    except Exception:
+        encoding = 'ascii'
+    return encoding
+
+DEFAULT_LOCALE_ENCODING = get_system_encoding()
diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py
index 0898370..87245b8 100644
--- a/twitter_text/extractor.py
+++ b/twitter_text/extractor.py
@@ -1,7 +1,7 @@
 # encoding=utf-8
 
 from twitter_text.regex import REGEXEN
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 
 
 class Extractor(object):
@@ -11,7 +11,7 @@ class Extractor(object):
     """
 
     def __init__(self, text):
-        self.text = force_unicode(text)
+        self.text = force_text(text)
 
     def _remove_overlapping_entities(self, entities):
         """
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index 90bbdfd..3311c29 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -3,7 +3,7 @@
 import re
 from HTMLParser import HTMLParser
 
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 
 DEFAULT_HIGHLIGHT_TAG = 'em'
 
@@ -29,7 +29,7 @@ def strip_tags(html):
 
 class HitHighlighter(object):
     def __init__(self, text, **kwargs):
-        self.text = force_unicode(text)
+        self.text = force_text(text)
         self.parent = kwargs.get('parent', False)
 
     def hit_highlight(self, hits=[], **kwargs):
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index eabd955..25b4d3d 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -2,7 +2,7 @@
 
 import re
 
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 from twitter_text.extractor import Extractor
 from twitter_text.regex import REGEXEN
 
@@ -17,7 +17,7 @@
 
 class Validation(object):
     def __init__(self, text, **kwargs):
-        self.text = force_unicode(text)
+        self.text = force_text(text)
         self.parent = kwargs.get('parent', False)
 
     def tweet_length(self, options={}):
@@ -42,7 +42,7 @@ def tweet_length(self, options={}):
                 options[key] = DEFAULT_TCO_URL_LENGTHS[key]
 
         length = len(self.text)
-        # thanks force_unicode for making this so much simpler than the ruby version
+        # thanks force_text for making this so much simpler than the ruby version
 
         for url in Extractor(self.text).extract_urls_with_indices():
             # remove the link of the original URL

From 433c48df30631ed252529a7d4cbe279048b86a38 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Tue, 1 Mar 2016 17:46:54 -0800
Subject: [PATCH 05/30] Removed old tests.py

---
 tests.py | 198 -------------------------------------------------------
 1 file changed, 198 deletions(-)
 delete mode 100644 tests.py

diff --git a/tests.py b/tests.py
deleted file mode 100644
index 97da4c0..0000000
--- a/tests.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# encoding=utf-8
-
-import argparse
-import json
-import os
-import re
-import sys
-import twitter_text
-
-from twitter_text.unicode import force_unicode
-
-try:
-    import yaml
-except ImportError:
-    raise Exception('You need to install pyaml to run the tests')
-# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
-from yaml import Loader, SafeLoader
-
-
-narrow_build = True
-try:
-    unichr(0x20000)
-    narrow_build = False
-except:
-    pass
-
-
-parser = argparse.ArgumentParser(description=u'Run the integration tests for twitter_text')
-parser.add_argument('--ignore-narrow-errors', '-i', help=u'Ignore errors caused by narrow builds', default=False, action='store_true')
-args = parser.parse_args()
-
-
-def construct_yaml_str(self, node):
-    return self.construct_scalar(node)
-Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
-SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
-
-
-try:
-    from bs4 import BeautifulSoup
-except ImportError:
-    try:
-        from BeautifulSoup import BeautifulSoup
-    except ImportError:
-        raise Exception('You need to install BeautifulSoup to run the tests')
-
-
-def success(text):
-    return (u'\033[92m%s\033[0m\n' % text).encode('utf-8')
-
-
-def error(text):
-    return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
-
-
-CURRENT_DIR = os.path.dirname(__file__)
-CONFORMANCE_DIR = os.path.join(CURRENT_DIR, 'twitter-text-conformance/conformance')
-attempted = 0
-
-
-def assert_equal_without_attribute_order(result, test, failure_message=None):
-    global attempted
-    attempted += 1
-    # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
-    assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description')))
-    sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
-    sys.stdout.flush()
-
-
-def assert_equal(result, test):
-    global attempted
-    attempted += 1
-    assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else ''))
-    sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
-    sys.stdout.flush()
-
-# extractor section
-extractor_file = open(os.path.join(CONFORMANCE_DIR, 'extract.yml'), 'r')
-extractor_tests = yaml.load(force_unicode(extractor_file.read()))
-extractor_file.close()
-
-sys.stdout.write('Testing Extractor\n')
-sys.stdout.flush()
-
-for section in extractor_tests.get('tests'):
-    sys.stdout.write('\nTesting Extractor: %s\n' % section)
-    sys.stdout.flush()
-    for test in extractor_tests.get('tests').get(section):
-        if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Hashtag with ideographic iteration mark']:
-            sys.stdout.write('Skipping: %s\n' % test.get('description'))
-            sys.stdout.flush()
-            continue
-        extractor = twitter_text.extractor.Extractor(test.get('text'))
-        if section == 'mentions':
-            assert_equal(extractor.extract_mentioned_screen_names(), test)
-        elif section == 'mentions_with_indices':
-            assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test)
-        elif section == 'mentions_or_lists_with_indices':
-            assert_equal(extractor.extract_mentions_or_lists_with_indices(), test)
-        elif section == 'replies':
-            assert_equal(extractor.extract_reply_screen_name(), test)
-        elif section == 'urls':
-            assert_equal(extractor.extract_urls(), test)
-        elif section == 'urls_with_indices':
-            assert_equal(extractor.extract_urls_with_indices(), test)
-        elif section == 'hashtags':
-            assert_equal(extractor.extract_hashtags(), test)
-        elif section == 'cashtags':
-            assert_equal(extractor.extract_cashtags(), test)
-        elif section == 'hashtags_with_indices':
-            assert_equal(extractor.extract_hashtags_with_indices(), test)
-        elif section == 'cashtags_with_indices':
-            assert_equal(extractor.extract_cashtags_with_indices(), test)
-
-# autolink section
-autolink_file = open(os.path.join(CONFORMANCE_DIR, 'autolink.yml'), 'r')
-autolink_tests = yaml.load(force_unicode(autolink_file.read()))
-autolink_file.close()
-
-sys.stdout.write('\nTesting Autolink\n')
-sys.stdout.flush()
-
-autolink_options = {'suppress_no_follow': True}
-
-for section in autolink_tests.get('tests'):
-    sys.stdout.write('\nTesting Autolink: %s\n' % section)
-    for test in autolink_tests.get('tests').get(section):
-        if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Autolink a hashtag containing ideographic iteration mark']:
-            sys.stdout.write('Skipping: %s\n' % test.get('description'))
-            sys.stdout.flush()
-            continue
-        autolink = twitter_text.autolink.Autolink(test.get('text'))
-        if section == 'usernames':
-            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
-        elif section == 'cashtags':
-            assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test)
-        elif section == 'urls':
-            assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test)
-        elif section == 'hashtags':
-            assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test)
-        elif section == 'all':
-            assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test)
-        elif section == 'lists':
-            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
-        elif section == 'json':
-            assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)
-
-# hit_highlighting section
-hit_highlighting_file = open(os.path.join(CONFORMANCE_DIR, 'hit_highlighting.yml'), 'r')
-hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read()))
-hit_highlighting_file.close()
-
-sys.stdout.write('\nTesting Hit Highlighting\n')
-sys.stdout.flush()
-
-for section in hit_highlighting_tests.get('tests'):
-    sys.stdout.write('\nTesting Hit Highlighting: %s\n' % section)
-    for test in hit_highlighting_tests.get('tests').get(section):
-        hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text'))
-        if section == 'plain_text':
-            assert_equal(hit_highlighter.hit_highlight(hits=test.get('hits')), test)
-        elif section == 'with_links':
-            assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits=test.get('hits')), test)
-
-# validation section
-validation_tested = False
-validate_tests = None
-try:
-    validate_file = open(os.path.join(CONFORMANCE_DIR, 'validate.yml'), 'r')
-    validate_file_contents = validate_file.read()
-    validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape')))
-    validate_file.close()
-except ValueError:
-    sys.stdout.write('\nValidation tests were skipped because of wide character issues\n')
-    sys.stdout.flush()
-
-if validate_tests:
-    sys.stdout.write('\nTesting Validation\n')
-    sys.stdout.flush()
-
-    for section in validate_tests.get('tests'):
-        sys.stdout.write('\nTesting Validation: %s\n' % section)
-        for test in validate_tests.get('tests').get(section):
-            validator = twitter_text.validation.Validation(test.get('text'))
-            if section == 'tweets':
-                assert_equal(not validator.tweet_invalid(), test)
-            elif section == 'usernames':
-                assert_equal(validator.valid_username(), test)
-            elif section == 'lists':
-                assert_equal(validator.valid_list(), test)
-            elif section == 'hashtags':
-                assert_equal(validator.valid_hashtag(), test)
-            elif section == 'urls':
-                assert_equal(validator.valid_url(), test)
-
-sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
-sys.stdout.flush()
-sys.exit(os.EX_OK)

From 40cbe7ad2ddbf2bd97702d66ba42e38eb45b613a Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Wed, 2 Mar 2016 13:43:58 -0800
Subject: [PATCH 06/30] Further work on getting tests to pass.

---
 conftest.py | 86 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 66 insertions(+), 20 deletions(-)

diff --git a/conftest.py b/conftest.py
index fb5d4b2..7f22751 100644
--- a/conftest.py
+++ b/conftest.py
@@ -18,7 +18,7 @@
 except ImportError:
     raise Exception('You need to install pyaml to run the tests')
 # from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
-from yaml import Loader, SafeLoader
+#from yaml import Loader, SafeLoader
 
 
 narrow_build = True
@@ -29,10 +29,10 @@
     pass
 
 
-def construct_yaml_str(self, node):
-    return self.construct_scalar(node)
-Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
-SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
+#def construct_yaml_str(self, node):
+#    return self.construct_scalar(node)
+#Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
+#SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
 
 
 try:
@@ -44,15 +44,6 @@ def construct_yaml_str(self, node):
         raise Exception('You need to install BeautifulSoup to run the tests')
 
 
-def assert_equal_without_attribute_order(result, test, failure_message=None):
-    # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
-    assert BeautifulSoup(result) == BeautifulSoup(test.get('expected'))
-
-
-def assert_equal(result, test):
-    assert result == test.get('expected')
-
-
 def pytest_collect_file(parent, path):
     if path.ext == '.yml':
         return YamlFile(path, parent)
@@ -64,10 +55,16 @@ class YamlException(Exception):
 
 class YamlFile(pytest.File):
     def collect(self):
+        filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0]
+        if filename not in TEST_MAP:
+            print "Skipping {}; not supported".format(filename)
+            return
+        if TEST_MAP[filename].get('requires_wide_build') and narrow_build:
+            print "Skipping {} due to narrow build".format(filename)
+            return
         raw = yaml.safe_load(force_text(self.fspath.open().read()))
         if 'tests' not in raw:
             return
-        filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0]
         for section, specs in raw['tests'].items():
             for spec in specs:
                 yield YamlItem(self, filename, section, spec)
@@ -75,9 +72,26 @@ def collect(self):
 
 TEST_MAP = {
     'autolink': {
-        'cls': None,
+        'cls': twitter_text.autolink.Autolink,
+        'options': {'suppress_no_follow': True},
         'methods': {
+            'usernames': 'auto_link_usernames_or_lists',
+            'cashtags': 'auto_link_cashtags',
+            'urls': 'auto_link_urls',
+            'hashtags': 'auto_link_hashtags',
+            'all': 'auto_link',
+            'lists': 'auto_link_usernames_or_lists',
+            'json': 'auto_link_with_json',
         },
+        'ignore_attribute_order': set([
+            'usernames',
+            'cashtags',
+            'urls',
+            'hashtags',
+            'all',
+            'lists',
+            'json',
+        ])
     },
     'extract': {
         'cls': twitter_text.extractor.Extractor,
@@ -95,8 +109,24 @@ def collect(self):
         },
     },
     'hit_highlighting': {
-        'cls': None,
+        'cls': twitter_text.highlighter.HitHighlighter,
+        'methods': {
+            'plain_text': 'hit_highlight',
+            'with_links': 'hit_highlight',
+        },
+        'ignore_attribute_order': set([
+            'with_links',
+        ])
+    },
+    'validate': {
+        'cls': twitter_text.validation.Validation,
+        'requires_wide_build': True,
         'methods': {
+            'tweets': 'valid_tweet_text',
+            'usernames': 'valid_username',
+            'lists': 'valid_list',
+            'hashtags': 'valid_hashtag',
+            'urls': 'valid_url',
         },
     }
 }
@@ -110,16 +140,32 @@ def __init__(self, parent, filename, section, spec):
         name = "{}:{}:{}".format(filename, section, spec['description'])
         super(YamlItem, self).__init__(name, parent)
 
+    def _equal_without_attribute_order(result, expected):
+        # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
+        return BeautifulSoup(result) == BeautifulSoup(expected)
+
     def runtest(self):
         if self.filename not in TEST_MAP:
             raise YamlException("{} file not supported".format(self.section))
         if self.section not in TEST_MAP[self.filename]['methods']:
             raise YamlException("{}:{} section not supported".format(self.section))
         cls = TEST_MAP[self.section]['cls']
-        method_name = TEST_MAP[self.section]['methods']
+        method_name = TEST_MAP[self.filename]['methods'][self.section]
         instance = cls(self.spec['text'])
-        result = getattr(instance, method_name)()
-        if result != self.spec['expected']:
+        args = []
+        kwargs = {}
+        if 'json' in self.spec:
+            args.append(json.loads(self.spec['json']))
+        if 'options' in TEST_MAP[self.filename]:
+            args.append(TEST_MAP[self.filename])
+        if 'hits' in self.spec:
+            kwargs['hits'] = self.spec['hits']
+        result = getattr(instance, method_name)(*args, **kwargs)
+        if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()):
+            equal = self._equal_without_attribute_order(result, self.spec['expected'])
+        else:
+            equal = result == self.spect['expected']
+        if not equal:
             raise YamlException("{} != {}".format(result, self.spec['expected']))
 
     def repr_failure(self, excinfo):

From 17f3c13cf78a930d6be19b5c5425b49714276237 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:02:23 -0700
Subject: [PATCH 07/30] Removed twitter-text-conformance submodule.

---
 twitter-text-conformance | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 twitter-text-conformance

diff --git a/twitter-text-conformance b/twitter-text-conformance
deleted file mode 160000
index a39ec58..0000000
--- a/twitter-text-conformance
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a39ec5875528aaf0a874f384536fcd8e904d9fd8

From 15d03c7d16b7e095211f31a28d03f72070712334 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:03:19 -0700
Subject: [PATCH 08/30] Added twitter-text mono repo submodule.

---
 .gitmodules  | 6 +++---
 twitter-text | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)
 create mode 160000 twitter-text

diff --git a/.gitmodules b/.gitmodules
index f051160..e5d7799 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "twitter-text-conformance"]
-	path = twitter-text-conformance
-	url = https://github.com/dryan/twitter-text-conformance.git
+[submodule "twitter-text"]
+	path = twitter-text
+	url = https://github.com/twitter/twitter-text.git
diff --git a/twitter-text b/twitter-text
new file mode 160000
index 0000000..fb07f2e
--- /dev/null
+++ b/twitter-text
@@ -0,0 +1 @@
+Subproject commit fb07f2e30c1d3d053cf2bb2ad6971a3bcfc9b568

From 9f2fbd40e62efac7098d4abdbd553c57f14c37b5 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:20:23 -0700
Subject: [PATCH 09/30] Corrected a couple errors re: test running.

---
 conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conftest.py b/conftest.py
index 7f22751..e40dcb6 100644
--- a/conftest.py
+++ b/conftest.py
@@ -140,7 +140,7 @@ def __init__(self, parent, filename, section, spec):
         name = "{}:{}:{}".format(filename, section, spec['description'])
         super(YamlItem, self).__init__(name, parent)
 
-    def _equal_without_attribute_order(result, expected):
+    def _equal_without_attribute_order(self, result, expected):
         # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
         return BeautifulSoup(result) == BeautifulSoup(expected)
 
@@ -149,7 +149,7 @@ def runtest(self):
             raise YamlException("{} file not supported".format(self.section))
         if self.section not in TEST_MAP[self.filename]['methods']:
             raise YamlException("{}:{} section not supported".format(self.section))
-        cls = TEST_MAP[self.section]['cls']
+        cls = TEST_MAP[self.filename]['cls']
         method_name = TEST_MAP[self.filename]['methods'][self.section]
         instance = cls(self.spec['text'])
         args = []
@@ -157,7 +157,7 @@ def runtest(self):
         if 'json' in self.spec:
             args.append(json.loads(self.spec['json']))
         if 'options' in TEST_MAP[self.filename]:
-            args.append(TEST_MAP[self.filename])
+            kwargs['options'] = TEST_MAP[self.filename]['options']
         if 'hits' in self.spec:
             kwargs['hits'] = self.spec['hits']
         result = getattr(instance, method_name)(*args, **kwargs)

From a18a8a348f536ce1c7d43a509b848caa94901fa2 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:23:29 -0700
Subject: [PATCH 10/30] Tweaked travis to run py.test

---
 .travis.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b954140..3c79536 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,8 +4,8 @@ python:
   - "2.7"
 
 install: 
-  - "pip install . --use-mirrors"
-  - "pip install -r requirements.txt --use-mirrors"
-script: "python ./tests.py"
+  - "pip install ."
+  - "pip install -r requirements.txt"
+script: "py.test"
 notifications:
   email: false

From 8ceafcfea8d5541a5d002637cf63cba5fa7e3e74 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:23:59 -0700
Subject: [PATCH 11/30] Added /.cache to gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 17b1f10..20a1408 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.pyc
 build*
 *.egg*
-dist
\ No newline at end of file
+dist
+/.cache

From e567d392207f8abc7946a337b6b2349985d82758 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:28:44 -0700
Subject: [PATCH 12/30] Corrected case preservation in autolinked screen_names

---
 twitter_text/autolink.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 8e40227..7b69f83 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -388,7 +388,6 @@ def _link_to_cashtag(self, entity, chars, options={}):
     def _link_to_screen_name(self, entity, chars, options={}):
         name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
         chunk = options.get('link_text_transform', default_transform)(entity, name)
-        name = name.lower()
 
         at = chars[entity['indices'][0]]
 

From 6581a0a474415612603fdc284f4dd9ede8314b59 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:35:12 -0700
Subject: [PATCH 13/30] Required beautifulsoup4 for better unicode handling.

---
 conftest.py      | 7 ++-----
 requirements.txt | 3 ++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/conftest.py b/conftest.py
index e40dcb6..0add157 100644
--- a/conftest.py
+++ b/conftest.py
@@ -38,10 +38,7 @@
 try:
     from bs4 import BeautifulSoup
 except ImportError:
-    try:
-        from BeautifulSoup import BeautifulSoup
-    except ImportError:
-        raise Exception('You need to install BeautifulSoup to run the tests')
+    raise Exception('You need to install BeautifulSoup4 to run the tests')
 
 
 def pytest_collect_file(parent, path):
@@ -142,7 +139,7 @@ def __init__(self, parent, filename, section, spec):
 
     def _equal_without_attribute_order(self, result, expected):
         # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
-        return BeautifulSoup(result) == BeautifulSoup(expected)
+        return BeautifulSoup(result, "lxml") == BeautifulSoup(expected, "lxml")
 
     def runtest(self):
         if self.filename not in TEST_MAP:
diff --git a/requirements.txt b/requirements.txt
index d001e1b..1c8d134 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 argparse==1.2.1
 PyYAML==3.10
-beautifulsoup4==4.2.0
+beautifulsoup4==4.4.1
+lxml==3.4.4
 pytest==2.87
 py==1.4.29

From 7dc0301589a438290e2656d2acf3d36b4ed5f44a Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:45:57 -0700
Subject: [PATCH 14/30] Corrected nested balanced paren handling.

---
 twitter_text/regex.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index fffebde..049bc04 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -208,7 +208,14 @@ def regex_range(start, end=None):
 # Allow URL paths to contain balanced parens
 #  1. Used in Wikipedia URLs like /Primer_(film)
 #  2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+# Allow one nested level of balanced parentheses
+REGEXEN['valid_url_balanced_parens'] = re.compile(
+    ur'\((?:%s+|(?:%s*\(%s+\)%s*))\)' % (
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+    ), re.IGNORECASE | re.UNICODE)
 # Valid end-of-path chracters (so /foo. does not gobble the period).
 #   1. Allow =&# for empty URL parameters and other URL-join artifacts
 REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)

From e24ebdc9b0fdc3c108d2d30dea092b6e5c9c6a10 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 14:49:21 -0700
Subject: [PATCH 15/30] Bumped pytest version.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1c8d134..f3f1ba6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,5 @@ argparse==1.2.1
 PyYAML==3.10
 beautifulsoup4==4.4.1
 lxml==3.4.4
-pytest==2.87
+pytest==2.9.1
 py==1.4.29

From 91423dbc9216534d4bbbb86e3542e4c3ee5e4fd1 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 15:11:58 -0700
Subject: [PATCH 16/30] Improved non-latin regex support.

---
 requirements.txt      |  1 +
 twitter_text/regex.py | 37 +++++++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f3f1ba6..334171d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ beautifulsoup4==4.4.1
 lxml==3.4.4
 pytest==2.9.1
 py==1.4.29
+regex==2016.04.25
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index 049bc04..0eef03e 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -3,8 +3,9 @@
 # A collection of regular expressions for parsing Tweet text. The regular expression
 # list is frozen at load time to ensure immutability. These reular expressions are
 # used throughout the Twitter classes. Special care has been taken to make
-# sure these reular expressions work with Tweets in all languages.
-import re
+# sure these regular expressions work with Tweets in all languages.
+from __future__ import absolute_import
+import regex as re
 
 REGEXEN = {}  # :nodoc:
 
@@ -79,10 +80,7 @@ def regex_range(start, end=None):
 
 NON_LATIN_HASHTAG_CHARS = ''.join([
     # Cyrillic (Russian, Ukrainian, etc.)
-    regex_range(0x0400, 0x04ff),  # Cyrillic
-    regex_range(0x0500, 0x0527),  # Cyrillic Supplement
-    regex_range(0x2de0, 0x2dff),  # Cyrillic Extended A
-    regex_range(0xa640, 0xa69f),  # Cyrillic Extended B
+    '\p{Cyrillic}',  # Cyrillic
     regex_range(0x0591, 0x05bf),  # Hebrew
     regex_range(0x05c1, 0x05c2),
     regex_range(0x05c4, 0x05c5),
@@ -204,7 +202,11 @@ def regex_range(start, end=None):
 
 REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
 
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_general_url_path_chars'] = re.compile(
+    ur"[a-z%s0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % (
+        NON_LATIN_HASHTAG_CHARS,
+        LATIN_ACCENTS,
+    ), re.IGNORECASE | re.UNICODE)
 # Allow URL paths to contain balanced parens
 #  1. Used in Wikipedia URLs like /Primer_(film)
 #  2. Used in IIS sessions like /S(dfd346)/
@@ -218,8 +220,20 @@ def regex_range(start, end=None):
     ), re.IGNORECASE | re.UNICODE)
 # Valid end-of-path chracters (so /foo. does not gobble the period).
 #   1. Allow =&# for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(
+    ur'[a-z%s0-9=_#\/\+\-%s]|(?:%s)' % (
+        NON_LATIN_HASHTAG_CHARS,
+        LATIN_ACCENTS,
+        REGEXEN['valid_url_balanced_parens'].pattern
+    ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path'] = re.compile(
+    ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_balanced_parens'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_path_ending_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern
+    ), re.IGNORECASE | re.UNICODE)
 
 REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
 REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
@@ -245,7 +259,10 @@ def regex_range(start, end=None):
 REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
 
 # These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(
+    ur'[a-z%s0-9\-._~]' % (
+        NON_LATIN_HASHTAG_CHARS,
+    ), re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)

From f012873a26fee9875e49b825bd44371746b38445 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 15:18:54 -0700
Subject: [PATCH 17/30] Improved unicode hashtag support.

Autolink 100% passing.
---
 twitter_text/regex.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index 0eef03e..60c1334 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -157,11 +157,16 @@ def regex_range(start, end=None):
 CTRL_CHARS = ur"\x00-\x1F\x7F"
 
 # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHA = ur'[\p{L}\p{M}]'
 HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
 HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
 
-HASHTAG = re.compile(ur'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
+HASHTAG = re.compile(ur'(%s)(#|＃)(?!\ufe0f|\u20e3)(%s*%s%s*)' % (
+    HASHTAG_BOUNDARY,
+    HASHTAG_ALPHANUMERIC,
+    HASHTAG_ALPHA,
+    HASHTAG_ALPHANUMERIC,
+), re.IGNORECASE)
 
 REGEXEN['valid_hashtag'] = HASHTAG
 REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)

From 9a9bfe813cb9c776b6d05958aa32f2a65f81d161 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 15:26:06 -0700
Subject: [PATCH 18/30] Corrected typo affecting tests that don't ignore
 attribute order.

---
 conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conftest.py b/conftest.py
index 0add157..4a2da8e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -161,7 +161,7 @@ def runtest(self):
         if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()):
             equal = self._equal_without_attribute_order(result, self.spec['expected'])
         else:
-            equal = result == self.spect['expected']
+            equal = result == self.spec['expected']
         if not equal:
             raise YamlException("{} != {}".format(result, self.spec['expected']))
 

From 7ad41afd8971ad60add77fdb7d8f1244ef893f95 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 15:31:16 -0700
Subject: [PATCH 19/30] Limited regex to Cyrillic in same places where other
 conformant libs do.

---
 twitter_text/regex.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index 60c1334..f839bfb 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -208,8 +208,7 @@ def regex_range(start, end=None):
 REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
 
 REGEXEN['valid_general_url_path_chars'] = re.compile(
-    ur"[a-z%s0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % (
-        NON_LATIN_HASHTAG_CHARS,
+    ur"[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % (
         LATIN_ACCENTS,
     ), re.IGNORECASE | re.UNICODE)
 # Allow URL paths to contain balanced parens
@@ -226,8 +225,7 @@ def regex_range(start, end=None):
 # Valid end-of-path chracters (so /foo. does not gobble the period).
 #   1. Allow =&# for empty URL parameters and other URL-join artifacts
 REGEXEN['valid_url_path_ending_chars'] = re.compile(
-    ur'[a-z%s0-9=_#\/\+\-%s]|(?:%s)' % (
-        NON_LATIN_HASHTAG_CHARS,
+    ur'[a-z\p{Cyrillic}0-9=_#\/\+\-%s]|(?:%s)' % (
         LATIN_ACCENTS,
         REGEXEN['valid_url_balanced_parens'].pattern
     ), re.IGNORECASE | re.UNICODE)
@@ -264,10 +262,7 @@ def regex_range(start, end=None):
 REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
 
 # These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(
-    ur'[a-z%s0-9\-._~]' % (
-        NON_LATIN_HASHTAG_CHARS,
-    ), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z\p{Cyrillic}0-9\-._~]', re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)

From e6e8e6a3e56e0f6a7cb621d4f5dc0da100c1cad5 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 15:59:49 -0700
Subject: [PATCH 20/30] Improved & updated TLD detection

Started pulling list of tlds directly from conformance core.
---
 conftest.py               |  8 +-------
 twitter_text/extractor.py |  8 +++++---
 twitter_text/regex.py     | 34 +++++++++++++++++++++++++++++++---
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/conftest.py b/conftest.py
index 4a2da8e..93c5a6c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -2,21 +2,15 @@
 
 from __future__ import unicode_literals
 
-import argparse
 import json
 import os
-import re
-import sys
 
 import pytest
+import yaml
 
 import twitter_text
 from twitter_text.encoding import force_text, smart_bytes
 
-try:
-    import yaml
-except ImportError:
-    raise Exception('You need to install pyaml to run the tests')
 # from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
 #from yaml import Loader, SafeLoader
 
diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py
index 87245b8..1f51fce 100644
--- a/twitter_text/extractor.py
+++ b/twitter_text/extractor.py
@@ -173,10 +173,12 @@ def extract_urls_with_indices(self, options={'extract_url_without_protocol': Tru
                     ascii_domain = ascii_domain.group()
                     last_url = {
                         'url':      ascii_domain,
-                        'indices':  [start_position - len(before or '') + complete.find(ascii_domain), start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
+                        'indices':  [start_position - len(before or '') + complete.find(ascii_domain),
+                                     start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
                     }
-                    last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None
-                    if not last_url_invalid_match:
+                    if (path or
+                            REGEXEN['valid_special_short_domain'].search(ascii_domain) or
+                            not REGEXEN['invalid_short_domain'].search(ascii_domain)):
                         urls.append(last_url)
                 # no ASCII-only domain found. Skip the entire URL
                 if not last_url:
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index f839bfb..12ca9b8 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -5,7 +5,12 @@
 # used throughout the Twitter classes. Special care has been taken to make
 # sure these regular expressions work with Tweets in all languages.
 from __future__ import absolute_import
+import os
+
 import regex as re
+import yaml
+
+from twitter_text.encoding import force_text
 
 REGEXEN = {}  # :nodoc:
 
@@ -16,6 +21,15 @@ def regex_range(start, end=None):
     else:
         return u'%s' % unichr(start)
 
+TLDS = yaml.safe_load(force_text(
+    open(os.path.join(
+        os.path.dirname(os.path.dirname(__file__)),
+        'twitter-text',
+        'conformance',
+        'tld_lib.yml'
+    )).read()
+))
+
 
 # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
 # to access both the list of characters and a pattern suitible for use with String#split
@@ -189,11 +203,24 @@ def regex_range(start, end=None):
 REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
 DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
 REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公>益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中加坡|湾|台灣|新香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain_name'] = re.compile(
+    ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (
+        DOMAIN_VALID_CHARS,
+        DOMAIN_VALID_CHARS,
+        DOMAIN_VALID_CHARS
+    ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_gTLD'] = re.compile(
+    ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % (
+        '|'.join(TLDS['generic']),
+    ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ccTLD'] = re.compile(
+    ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % (
+        '|'.join(TLDS['country']),
+    ), re.IGNORECASE | re.UNICODE)
 REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
 
+REGEXEN['valid_special_cctld'] = re.compile(ur'(?:(?:co|tv)(?=[^0-9a-z@]|$))')
+
 REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor
@@ -204,6 +231,7 @@ def regex_range(start, end=None):
 
 # This is used in Extractor to filter out unwanted URLs.
 REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_special_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_special_cctld'].pattern))
 
 REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
 

From 263b5cf8832ebe19bfd0379963a2d8a814d61513 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:20:07 -0700
Subject: [PATCH 21/30] Corrected list of CJ hashtag chars that work in narrow
 build

---
 twitter_text/regex.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index 12ca9b8..d3a7d7b 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -149,6 +149,9 @@ def regex_range(start, end=None):
     regex_range(0x3099, 0x309E),  # Hiragana
     regex_range(0x3400, 0x4DBF),  # Kanji (CJK Extension A)
     regex_range(0x4E00, 0x9FFF),  # Kanji (Unified)
+    regex_range(0x3003),  # Kanji (CJK supplement)
+    regex_range(0x3005),  # Kanji (CJK supplement)
+    regex_range(0x303B),  # Kanji (CJK supplement)
 ])
 
 try:
@@ -158,9 +161,6 @@ def regex_range(start, end=None):
         regex_range(0x2A700, 0x2B73F),  # Kanji (CJK Extension C)
         regex_range(0x2B740, 0x2B81F),  # Kanji (CJK Extension D)
         regex_range(0x2F800, 0x2FA1F),  # Kanji (CJK supplement)
-        regex_range(0x3003),  # Kanji (CJK supplement)
-        regex_range(0x3005),  # Kanji (CJK supplement)
-        regex_range(0x303B),  # Kanji (CJK supplement)
     ])
 except ValueError:
     # this is a narrow python build so these extended Kanji characters won't work

From 35467a1d2b800b79a8a71555a60b8e24a06f5577 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:27:54 -0700
Subject: [PATCH 22/30] Matched hashtag regex exactly to ruby version

---
 twitter_text/regex.py | 78 ++-----------------------------------------
 1 file changed, 2 insertions(+), 76 deletions(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index d3a7d7b..8bbdd38 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -92,88 +92,14 @@ def regex_range(start, end=None):
     regex_range(0xFE70, 0xFEFF)
 ])
 
-NON_LATIN_HASHTAG_CHARS = ''.join([
-    # Cyrillic (Russian, Ukrainian, etc.)
-    '\p{Cyrillic}',  # Cyrillic
-    regex_range(0x0591, 0x05bf),  # Hebrew
-    regex_range(0x05c1, 0x05c2),
-    regex_range(0x05c4, 0x05c5),
-    regex_range(0x05c7),
-    regex_range(0x05d0, 0x05ea),
-    regex_range(0x05f0, 0x05f4),
-    regex_range(0xfb12, 0xfb28),  # Hebrew Presentation Forms
-    regex_range(0xfb2a, 0xfb36),
-    regex_range(0xfb38, 0xfb3c),
-    regex_range(0xfb3e),
-    regex_range(0xfb40, 0xfb41),
-    regex_range(0xfb43, 0xfb44),
-    regex_range(0xfb46, 0xfb4f),
-    regex_range(0x0610, 0x061a),  # Arabic
-    regex_range(0x0620, 0x065f),
-    regex_range(0x066e, 0x06d3),
-    regex_range(0x06d5, 0x06dc),
-    regex_range(0x06de, 0x06e8),
-    regex_range(0x06ea, 0x06ef),
-    regex_range(0x06fa, 0x06fc),
-    regex_range(0x06ff),
-    regex_range(0x0750, 0x077f),  # Arabic Supplement
-    regex_range(0x08a0),         # Arabic Extended A
-    regex_range(0x08a2, 0x08ac),
-    regex_range(0x08e4, 0x08fe),
-    regex_range(0xfb50, 0xfbb1),  # Arabic Pres. Forms A
-    regex_range(0xfbd3, 0xfd3d),
-    regex_range(0xfd50, 0xfd8f),
-    regex_range(0xfd92, 0xfdc7),
-    regex_range(0xfdf0, 0xfdfb),
-    regex_range(0xfe70, 0xfe74),  # Arabic Pres. Forms B
-    regex_range(0xfe76, 0xfefc),
-    regex_range(0x200c, 0x200c),  # Zero-Width Non-Joiner
-    regex_range(0x0e01, 0x0e3a),  # Thai
-    regex_range(0x0e40, 0x0e4e),  # Hangul (Korean)
-    regex_range(0x1100, 0x11ff),  # Hangul Jamo
-    regex_range(0x3130, 0x3185),  # Hangul Compatibility Jamo
-    regex_range(0xA960, 0xA97F),  # Hangul Jamo Extended-A
-    regex_range(0xAC00, 0xD7AF),  # Hangul Syllables
-    regex_range(0xD7B0, 0xD7FF),  # Hangul Jamo Extended-B
-    regex_range(0xFFA1, 0xFFDC),  # Half-width Hangul
-])
-
-CJ_HASHTAG_CHARACTERS = ''.join([
-    regex_range(0x30A1, 0x30FA),  # Katakana (full-width)
-    regex_range(0x30FC, 0x30FE),  # Katakana (full-width)
-    regex_range(0xFF66, 0xFF9F),  # Katakana (half-width)
-    regex_range(0xFF10, 0xFF19),  # Latin (full-width)
-    regex_range(0xFF21, 0xFF3A),  # Latin (full-width)
-    regex_range(0xFF41, 0xFF5A),  # Latin (full-width)
-    regex_range(0x3041, 0x3096),  # Hiragana
-    regex_range(0x3099, 0x309E),  # Hiragana
-    regex_range(0x3400, 0x4DBF),  # Kanji (CJK Extension A)
-    regex_range(0x4E00, 0x9FFF),  # Kanji (Unified)
-    regex_range(0x3003),  # Kanji (CJK supplement)
-    regex_range(0x3005),  # Kanji (CJK supplement)
-    regex_range(0x303B),  # Kanji (CJK supplement)
-])
-
-try:
-    CJ_HASHTAG_CHARACTERS = ''.join([
-        CJ_HASHTAG_CHARACTERS,
-        regex_range(0x20000, 0x2A6DF),  # Kanji (CJK Extension B)
-        regex_range(0x2A700, 0x2B73F),  # Kanji (CJK Extension C)
-        regex_range(0x2B740, 0x2B81F),  # Kanji (CJK Extension D)
-        regex_range(0x2F800, 0x2FA1F),  # Kanji (CJK supplement)
-    ])
-except ValueError:
-    # this is a narrow python build so these extended Kanji characters won't work
-    pass
-
 PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
 SPACE_CHARS = ur" \t\n\x0B\f\r"
 CTRL_CHARS = ur"\x00-\x1F\x7F"
 
 # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
 HASHTAG_ALPHA = ur'[\p{L}\p{M}]'
-HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHANUMERIC = ur'[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]'
+HASHTAG_BOUNDARY = ur'\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]'
 
 HASHTAG = re.compile(ur'(%s)(#|＃)(?!\ufe0f|\u20e3)(%s*%s%s*)' % (
     HASHTAG_BOUNDARY,

From d88d2e53f9278b9a3d03ffad84d2d9e15e4b3c6b Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:31:30 -0700
Subject: [PATCH 23/30] Updated mention preceding characters regex.

All running tests passing.
---
 twitter_text/regex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index 8bbdd38..3129e51 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -112,7 +112,7 @@ def regex_range(start, end=None):
 REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)
 REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
 
-REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)')
+REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)')
 REGEXEN['at_signs'] = re.compile(ur'[@＠]')
 REGEXEN['valid_mention_or_list'] = re.compile(
     ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') +   # preceding character

From 00b9f201cd926ed9e934ee90d1b6e9ead9eadf94 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:35:40 -0700
Subject: [PATCH 24/30] Added django to test requirements.

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 334171d..3cfae74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 argparse==1.2.1
 PyYAML==3.10
 beautifulsoup4==4.4.1
+Django==1.9.6
 lxml==3.4.4
 pytest==2.9.1
 py==1.4.29

From 86e84d2d278c4a339e7feaf102aa1fecb65559e5 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:47:48 -0700
Subject: [PATCH 25/30] Added sudo:false to travis.yml

See https://docs.travis-ci.com/user/workers/container-based-infrastructure/
---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 3c79536..57be34a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,3 +1,4 @@
+sudo: false
 language: python
 python:
   - "2.6"

From 284192dfa6b38f91f627f7219abcc07dc87cb692 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 16:48:59 -0700
Subject: [PATCH 26/30] Removed python 2.6 test run.

---
 .travis.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 57be34a..b29bfb2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,6 @@
 sudo: false
 language: python
 python:
-  - "2.6"
   - "2.7"
 
 install: 

From 74691912871c7ecc21b9b441abca2db4d43d5adb Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 17:03:58 -0700
Subject: [PATCH 27/30] Corrected Validation._valid_match logic.

---
 conftest.py                | 2 ++
 twitter_text/validation.py | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/conftest.py b/conftest.py
index 93c5a6c..a2218f0 100644
--- a/conftest.py
+++ b/conftest.py
@@ -112,12 +112,14 @@ def collect(self):
     'validate': {
         'cls': twitter_text.validation.Validation,
         'requires_wide_build': True,
+        'options': {'require_protocol': False},
         'methods': {
             'tweets': 'valid_tweet_text',
             'usernames': 'valid_username',
             'lists': 'valid_list',
             'hashtags': 'valid_hashtag',
             'urls': 'valid_url',
+            'urls_without_protocol': 'valid_url',
         },
     }
 }
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 25b4d3d..efd8bd9 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -149,9 +149,8 @@ def valid_url(self, unicode_domains=True, require_protocol=True):
         )
 
     def _valid_match(self, string, re_obj, optional=False):
-        if optional and string is None:
-            return True
-        match = re_obj.match(string)
+        if string:
+            match = re_obj.match(string)
         if optional:
             return not (string and (match is None or not match.string[match.span()[0]:match.span()[1]] == string))
         else:

From ccbd4cf75941b1c5e95fb56269f8c6672cb7079e Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 17:04:14 -0700
Subject: [PATCH 28/30] Added handling of tweet_length tests.

---
 conftest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conftest.py b/conftest.py
index a2218f0..b076a15 100644
--- a/conftest.py
+++ b/conftest.py
@@ -120,6 +120,7 @@ def collect(self):
             'hashtags': 'valid_hashtag',
             'urls': 'valid_url',
             'urls_without_protocol': 'valid_url',
+            'lengths': 'tweet_length',
         },
     }
 }

From 801274c6ecee777ff078701a33e0d23d11805957 Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 17:35:48 -0700
Subject: [PATCH 29/30] Corrected running of url without protocol tests

---
 conftest.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/conftest.py b/conftest.py
index b076a15..221c885 100644
--- a/conftest.py
+++ b/conftest.py
@@ -112,14 +112,13 @@ def collect(self):
     'validate': {
         'cls': twitter_text.validation.Validation,
         'requires_wide_build': True,
-        'options': {'require_protocol': False},
         'methods': {
             'tweets': 'valid_tweet_text',
             'usernames': 'valid_username',
             'lists': 'valid_list',
             'hashtags': 'valid_hashtag',
             'urls': 'valid_url',
-            'urls_without_protocol': 'valid_url',
+            'urls_without_protocol': ('valid_url', {'require_protocol': False}),
             'lengths': 'tweet_length',
         },
     }
@@ -144,10 +143,14 @@ def runtest(self):
         if self.section not in TEST_MAP[self.filename]['methods']:
             raise YamlException("{}:{} section not supported".format(self.section))
         cls = TEST_MAP[self.filename]['cls']
-        method_name = TEST_MAP[self.filename]['methods'][self.section]
         instance = cls(self.spec['text'])
         args = []
-        kwargs = {}
+        try:
+            method_name, kwargs = TEST_MAP[self.filename]['methods'][self.section]
+            kwargs = kwargs.copy()
+        except ValueError:
+            kwargs = {}
+            method_name = TEST_MAP[self.filename]['methods'][self.section]
         if 'json' in self.spec:
             args.append(json.loads(self.spec['json']))
         if 'options' in TEST_MAP[self.filename]:

From 09a504339babf3911674c1855051ce90bffb158d Mon Sep 17 00:00:00 2001
From: Stephen Burrows <stephen.r.burrows@gmail.com>
Date: Thu, 12 May 2016 17:36:06 -0700
Subject: [PATCH 30/30] Updated short url length

---
 twitter_text/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index efd8bd9..3990ddf 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -9,7 +9,7 @@
 MAX_LENGTH = 140
 
 DEFAULT_TCO_URL_LENGTHS = {
-    'short_url_length': 22,
+    'short_url_length': 23,
     'short_url_length_https': 23,
     'characters_reserved_per_media': 22,
 }