Skip to content

Commit 07f2204

Browse files
committed
[NLP][Ch08Nb05] Update Notebook 5 with windows compatibility
1 parent 2edd5d1 commit 07f2204

File tree

1 file changed

+40
-33
lines changed

1 file changed

+40
-33
lines changed

Ch8/O5_smtd_preprocessing.py

Lines changed: 40 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,15 @@
3333
demoji.download_codes()
3434
from nltk.tokenize import TweetTokenizer
3535

36-
#gobal
36+
# Global
37+
3738
PunctChars = r'''[`'“".?!,:;]'''
3839
Punct = '%s+' % PunctChars
3940
Entity = '&(amp|lt|gt|quot);'
4041
printable = set(string.printable)
4142

42-
# helper functoins
43+
# Helper functions.
44+
4345
def regex_or(*items):
4446
r = '|'.join(items)
4547
r = '(' + r + ')'
@@ -55,6 +57,7 @@ def optional(r):
5557
return '(%s)?' % r
5658

5759
def trim(transient_tweet_text):
60+
5861
'''
5962
trim leading and trailing spaces in the tweet text
6063
'''
@@ -112,7 +115,8 @@ def process_URLs(transient_tweet_text):
112115
Url_RE = re.compile("(%s)" % Url, re.U|re.I)
113116
transient_tweet_text = re.sub(Url_RE, " constanturl ", transient_tweet_text)
114117

115-
# fix to handle unicodes in URL
118+
# Fix to handle Unicode in URLs.
119+
116120
URL_regex2 = r'\b(htt)[p\:\/]*([\\x\\u][a-z0-9]*)*'
117121
transient_tweet_text = re.sub(URL_regex2, " constanturl ", transient_tweet_text)
118122
return transient_tweet_text
@@ -155,9 +159,9 @@ def process_Dates(transient_tweet_text):
155159
'''
156160
Identify date and convert it to constant
157161
'''
158-
#transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
159-
#transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
160-
#date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
162+
# transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
163+
# transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
164+
# date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
161165
date_regex1 = r'\b((0|1|2|3)?[0-9][\s]*)[-./]([\s]*([012]?[0-9])[\s]*)([-./]([\s]*(19|20)[0-9][0-9]))?\b'
162166
transient_tweet_text = re.sub(date_regex1, ' constantdate ' , transient_tweet_text)
163167
date_regex2 = r'\b((19|20)[0-9][0-9][\s]*[-./]?)?[\s]*([012]?[0-9])[\s]*[-./][\s]*(0|1|2|3)?[0-9]\b'
@@ -221,37 +225,37 @@ def identify_Savings(transient_tweet_text):
221225
'''
222226
identify sale/save offers
223227
'''
224-
#sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
228+
# sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
225229
sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*(-|~|or)?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?'
226230
transient_tweet_text = re.sub(sale_regex, " constantdiscount ", transient_tweet_text)
227-
#discount_List = []
228-
#discount_List = re.findall(r'constantdiscount', transient_tweet_text)
231+
# discount_List = []
232+
# discount_List = re.findall(r'constantdiscount', transient_tweet_text)
229233
return transient_tweet_text
230234

231235
def indentify_Offers(transient_tweet_text):
232236
'''
233237
identify cashbacks and off / substrings of the form "30% off" or "30% cashback" or "$30 off"
234238
Replace them by constantOFFER
235239
'''
236-
#transient_tweet_text = re.sub(r'[rs|$]?[ ]*[constantnum][ ]*[%]?[ ]?[off|cashback|offer]', "constantoffer", transient_tweet_text)
240+
# transient_tweet_text = re.sub(r'[rs|$]?[ ]*[constantnum][ ]*[%]?[ ]?[off|cashback|offer]', "constantoffer", transient_tweet_text)
237241
transient_tweet_text = re.sub(r'(?<!#)\b(?:(up[\s]?to)?((rs|\$)*[\s]*(constantnum))[\s]*[%]?)?[\s]*[-|~|or]?[\.]?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*(offer|off|cashback|cash|cash back)', " constantoffer ", transient_tweet_text)
238242
transient_tweet_text = re.sub(r'(?<!#)\b(?:cashback|cash back|cash)\b', " constantoffer ", transient_tweet_text)
239-
#Offer_List = []
240-
#Offer_List = re.findall(r'constantoffer', transient_tweet_text)
243+
# Offer_List = []
244+
# Offer_List = re.findall(r'constantoffer', transient_tweet_text)
241245
return transient_tweet_text
242246

243247
def indentify_Promos(transient_tweet_text):
244248
'''
245249
indentify coupons/promos with promo codes
246250
Assumption - promo code can be alphanumeric. But it immediately follows text of promo/code/promocode etc
247251
'''
248-
#transient_tweet_text = re.sub(r'\b(promocode|promo code|promo|code)[s]?[\s]*[a-z]*(constantnum)*[a-z]*[\s]+', " constantpromo ", transient_tweet_text)
252+
# transient_tweet_text = re.sub(r'\b(promocode|promo code|promo|code)[s]?[\s]*[a-z]*(constantnum)*[a-z]*[\s]+', " constantpromo ", transient_tweet_text)
249253
transient_tweet_text = re.sub(r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*(constantalphanum)\b', " constantpromo ", transient_tweet_text)
250254
transient_tweet_text = re.sub(r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[a-z]+\b', " constantpromo ", transient_tweet_text)
251255
transient_tweet_text = re.sub(r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[0-9]+\b', " constantpromo ", transient_tweet_text)
252256
transient_tweet_text = re.sub(r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code|coupon)[s]?\b', " constantpromo ", transient_tweet_text)
253-
#Promo_List = []
254-
#Promo_List = re.findall(r'constantpromo', transient_tweet_text)
257+
# Promo_List = []
258+
# Promo_List = re.findall(r'constantpromo', transient_tweet_text)
255259
return transient_tweet_text
256260

257261
def indentify_Money(transient_tweet_text):
@@ -264,8 +268,8 @@ def indentify_Money(transient_tweet_text):
264268
transient_tweet_text = re.sub(money_regex2, " constantmoney ", transient_tweet_text)
265269
money_regex3 = r'(\$|rs)[\s]*constantalphanum'
266270
transient_tweet_text = re.sub(money_regex3, " constantmoney ", transient_tweet_text)
267-
#Money_List = []
268-
#Money_List = re.findall(r'constantmoney', transient_tweet_text)
271+
# Money_List = []
272+
# Money_List = re.findall(r'constantmoney', transient_tweet_text)
269273
return transient_tweet_text
270274

271275
def indentify_freebies(transient_tweet_text):
@@ -360,9 +364,6 @@ def deEmojify(transient_tweet_text):
360364
# ############
361365
# print_test()
362366

363-
364-
365-
366367
def process_TweetText(tweet_text):
367368
'''
368369
Takes tweet_text and preprocesses it
@@ -372,44 +373,50 @@ def process_TweetText(tweet_text):
372373
'''
373374

374375
# get utf-8 encoding, lowercase, trim and remove multiple white spaces
376+
375377
transient_tweet_text = tweet_text
376378
transient_tweet_text = strip_unicode(transient_tweet_text)
377-
#print "PROCESSED: ", transient_tweet_text
379+
380+
# print "PROCESSED: ", transient_tweet_text
378381

379382
transient_tweet_text = to_LowerCase(transient_tweet_text)
380383
transient_tweet_text = trim(transient_tweet_text)
381384
transient_tweet_text = strip_whiteSpaces(transient_tweet_text)
382385
transient_tweet_text = remove_spl_words(transient_tweet_text)
383386

384-
385-
#emoji
387+
# Emoji
388+
386389
transient_tweet_text = remove_emoji(transient_tweet_text)
387390
transient_tweet_text = deEmojify(transient_tweet_text)
388-
# process Hastags, URLs, Websites, process_EmailIds
391+
392+
# Process Hashtags, URLs, Websites, process_EmailIds
389393
# Give precedence to url over hashtag
394+
390395
transient_tweet_text = process_URLs(transient_tweet_text)
391396
transient_tweet_text = process_HashTags(transient_tweet_text)
392-
#transient_tweet_text = process_Websites(transient_tweet_text)
397+
398+
# transient_tweet_text = process_Websites(transient_tweet_text)
399+
393400
transient_tweet_text = process_EmailIds(transient_tweet_text)
394401

395-
# process for brand mention, any other mention and brand Name
396-
#transient_tweet_text = process_BrandMentions(transient_tweet_text)
397-
#transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
402+
# Process for brand mention, any other mention and brand Name
403+
# transient_tweet_text = process_BrandMentions(transient_tweet_text)
404+
# transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
398405
transient_tweet_text = process_Mentions(transient_tweet_text)
399406
#transient_tweet_text = process_BrandName(transient_tweet_text)
400407

401-
# remove any unicodes
408+
# Remove any Unicode characters
402409
transient_tweet_text = strip_unicode(transient_tweet_text)
403410

404-
# identify Date / Time if any
411+
# Identify Date / Time if any
405412
transient_tweet_text = process_Times(transient_tweet_text)
406413
transient_tweet_text = process_Dates(transient_tweet_text)
407414

408-
# indentify alphanums and nums
415+
# Identify alphanums and nums
409416
transient_tweet_text = identify_AlphaNumerics(transient_tweet_text)
410417
transient_tweet_text = replace_numbers(transient_tweet_text)
411418

412-
# identify promos, savings, offers, money and freebies
419+
# Identify promos, savings, offers, money and freebies
413420
transient_tweet_text = indentify_Promos(transient_tweet_text)
414421
transient_tweet_text = identify_Savings(transient_tweet_text)
415422
transient_tweet_text = indentify_Offers(transient_tweet_text)
@@ -424,4 +431,4 @@ def process_TweetText(tweet_text):
424431
return transient_tweet_text
425432

426433
# if __name__ == "__main__":
427-
# print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))
434+
# print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))

0 commit comments

Comments
 (0)