33
33
# Module-level side effect: runs once when this module is imported.
# Presumably fetches/refreshes demoji's emoji code table before any emoji
# handling below is used.
# NOTE(review): recent demoji releases bundle the codes and treat this call as
# a deprecated no-op -- confirm against the installed demoji version.
demoji .download_codes ()
34
34
from nltk .tokenize import TweetTokenizer
35
35
36
- #gobal
36
+ # Global
37
+
37
38
# Regex character class listing the individual punctuation characters.
PunctChars = r'''[`'“".?!,:;]'''
# Regex for a run of one or more of the punctuation characters above.
Punct = PunctChars + '+'
# Regex for the HTML entities &amp; &lt; &gt; and &quot;.
Entity = '&(amp|lt|gt|quot);'
# Set of the characters in string.printable, for membership tests.
printable = set(string.printable)
41
42
42
- # helper functoins
43
+ # Helper functions.
44
+
43
45
def regex_or (* items ):
44
46
r = '|' .join (items )
45
47
r = '(' + r + ')'
@@ -55,6 +57,7 @@ def optional(r):
55
57
return '(%s)?' % r
56
58
57
59
def trim (transient_tweet_text ):
60
+
58
61
'''
59
62
trim leading and trailing spaces in the tweet text
60
63
'''
@@ -112,7 +115,8 @@ def process_URLs(transient_tweet_text):
112
115
Url_RE = re .compile ("(%s)" % Url , re .U | re .I )
113
116
transient_tweet_text = re .sub (Url_RE , " constanturl " , transient_tweet_text )
114
117
115
- # fix to handle unicodes in URL
118
+ # Fix to handle unicodes in URL.
119
+
116
120
URL_regex2 = r'\b(htt)[p\:\/]*([\\x\\u][a-z0-9]*)*'
117
121
transient_tweet_text = re .sub (URL_regex2 , " constanturl " , transient_tweet_text )
118
122
return transient_tweet_text
@@ -155,9 +159,9 @@ def process_Dates(transient_tweet_text):
155
159
'''
156
160
Identify date and convert it to constant
157
161
'''
158
- #transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
159
- #transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
160
- #date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
162
+ # transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
163
+ # transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
164
+ # date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
161
165
date_regex1 = r'\b((0|1|2|3)?[0-9][\s]*)[-./]([\s]*([012]?[0-9])[\s]*)([-./]([\s]*(19|20)[0-9][0-9]))?\b'
162
166
transient_tweet_text = re .sub (date_regex1 , ' constantdate ' , transient_tweet_text )
163
167
date_regex2 = r'\b((19|20)[0-9][0-9][\s]*[-./]?)?[\s]*([012]?[0-9])[\s]*[-./][\s]*(0|1|2|3)?[0-9]\b'
@@ -221,37 +225,37 @@ def identify_Savings(transient_tweet_text):
221
225
'''
222
226
identify sale/save offers
223
227
'''
224
- #sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
228
+ # sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
225
229
sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*(-|~|or)?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?'
226
230
transient_tweet_text = re .sub (sale_regex , " constantdiscount " , transient_tweet_text )
227
- #discount_List = []
228
- #discount_List = re.findall(r'constantdiscount', transient_tweet_text)
231
+ # discount_List = []
232
+ # discount_List = re.findall(r'constantdiscount', transient_tweet_text)
229
233
return transient_tweet_text
230
234
231
235
def indentify_Offers(transient_tweet_text):
    '''
    Replace offer / cashback phrases in the tweet text with " constantoffer ".

    Handles substrings such as "constantnum % off", "up to constantnum % off",
    "rs constantnum cashback", "constantnum or constantnum off" and the bare
    words "off", "offer(s)", "cashback", "cash back" and "cash". Numbers are
    expected to have already been replaced by the token "constantnum".
    Phrases that start directly after a '#' (hashtags) are left untouched.

    transient_tweet_text: the (lower-cased, number-normalised) tweet text.
    Returns the text with every offer phrase replaced by " constantoffer ".
    '''
    offer_regex = (
        r'(?<!#)\b'
        # Optional leading amount: "up to rs constantnum %".
        r'(?:(up[\s]?to)?((rs|\$)*[\s]*(constantnum))[\s]*[%]?)?'
        # Optional range separator. This must be an alternation: the previous
        # character class [-|~|or] matched a single '-', '|', '~', 'o' or 'r'
        # and therefore never recognised the word "or".
        r'[\s]*(?:-|~|or)?[\.]?[\s]*'
        # Optional second amount (upper end of a range).
        r'((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*'
        # Offer keyword. "cash back" is listed before "cash" so the two-word
        # form can match, and the trailing \b keeps words such as "office" or
        # "cashier" from being rewritten; an optional plural "s" is accepted.
        r'(?:offer|off|cashback|cash back|cash)[s]?\b'
    )
    transient_tweet_text = re.sub(offer_regex, " constantoffer ", transient_tweet_text)

    # Catch any remaining stand-alone cashback wording.
    cashback_regex = r'(?<!#)\b(?:cashback|cash back|cash)\b'
    transient_tweet_text = re.sub(cashback_regex, " constantoffer ", transient_tweet_text)
    return transient_tweet_text
242
246
243
247
def indentify_Promos(transient_tweet_text):
    '''
    Identify coupons/promos, with or without a promo code, and replace them
    with " constantpromo ".

    Assumption - a promo code can be alphanumeric, but it immediately follows
    the promo keyword (promo / code / promocode / coupon code ...).
    Keywords directly preceded by '#' (hashtags) are not replaced.
    '''
    # Applied strictly in this order: keyword followed by a code first
    # (alphanumeric token, then letters only, then digits only), and finally
    # the bare keyword, optionally plural.
    promo_patterns = (
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*(constantalphanum)\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[a-z]+\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b[\s]*[0-9]+\b',
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code|coupon)[s]?\b',
    )
    replaced_text = transient_tweet_text
    for promo_pattern in promo_patterns:
        replaced_text = re.sub(promo_pattern, " constantpromo ", replaced_text)
    return replaced_text
256
260
257
261
def indentify_Money (transient_tweet_text ):
@@ -264,8 +268,8 @@ def indentify_Money(transient_tweet_text):
264
268
transient_tweet_text = re .sub (money_regex2 , " constantmoney " , transient_tweet_text )
265
269
money_regex3 = r'(\$|rs)[\s]*constantalphanum'
266
270
transient_tweet_text = re .sub (money_regex3 , " constantmoney " , transient_tweet_text )
267
- #Money_List = []
268
- #Money_List = re.findall(r'constantmoney', transient_tweet_text)
271
+ # Money_List = []
272
+ # Money_List = re.findall(r'constantmoney', transient_tweet_text)
269
273
return transient_tweet_text
270
274
271
275
def indentify_freebies (transient_tweet_text ):
@@ -360,9 +364,6 @@ def deEmojify(transient_tweet_text):
360
364
# ############
361
365
# print_test()
362
366
363
-
364
-
365
-
366
367
def process_TweetText (tweet_text ):
367
368
'''
368
369
Takes tweet_text and preprocesses it
@@ -372,44 +373,50 @@ def process_TweetText(tweet_text):
372
373
'''
373
374
374
375
# get utf-8 encoding, lowercase, trim and remove multiple white spaces
376
+
375
377
transient_tweet_text = tweet_text
376
378
transient_tweet_text = strip_unicode (transient_tweet_text )
377
- #print "PROCESSED: ", transient_tweet_text
379
+
380
+ # print "PROCESSED: ", transient_tweet_text
378
381
379
382
transient_tweet_text = to_LowerCase (transient_tweet_text )
380
383
transient_tweet_text = trim (transient_tweet_text )
381
384
transient_tweet_text = strip_whiteSpaces (transient_tweet_text )
382
385
transient_tweet_text = remove_spl_words (transient_tweet_text )
383
386
384
-
385
- #emoji
387
+ # Emoji
388
+
386
389
transient_tweet_text = remove_emoji (transient_tweet_text )
387
390
transient_tweet_text = deEmojify (transient_tweet_text )
388
- # process Hastags, URLs, Websites, process_EmailIds
391
+
392
+ # Process Hashtags, URLs, Websites, process_EmailIds
389
393
# Give precedence to url over hashtag
394
+
390
395
transient_tweet_text = process_URLs (transient_tweet_text )
391
396
transient_tweet_text = process_HashTags (transient_tweet_text )
392
- #transient_tweet_text = process_Websites(transient_tweet_text)
397
+
398
+ # transient_tweet_text = process_Websites(transient_tweet_text)
399
+
393
400
transient_tweet_text = process_EmailIds (transient_tweet_text )
394
401
395
- # process for brand mention, any other mention and brand Name
396
- #transient_tweet_text = process_BrandMentions(transient_tweet_text)
397
- #transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
402
+ # Process for brand mention, any other mention and brand Name
403
+ # transient_tweet_text = process_BrandMentions(transient_tweet_text)
404
+ # transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
398
405
transient_tweet_text = process_Mentions (transient_tweet_text )
399
406
#transient_tweet_text = process_BrandName(transient_tweet_text)
400
407
401
- # remove any unicodes
408
+ # Remove any unicodes
402
409
transient_tweet_text = strip_unicode (transient_tweet_text )
403
410
404
- # identify Date / Time if any
411
+ # Identify Date / Time if any
405
412
transient_tweet_text = process_Times (transient_tweet_text )
406
413
transient_tweet_text = process_Dates (transient_tweet_text )
407
414
408
- # indentify alphanums and nums
415
+ # Identify alphanums and nums
409
416
transient_tweet_text = identify_AlphaNumerics (transient_tweet_text )
410
417
transient_tweet_text = replace_numbers (transient_tweet_text )
411
418
412
- # identify promos, savings, offers, money and freebies
419
+ # Identify promos, savings, offers, money and freebies
413
420
transient_tweet_text = indentify_Promos (transient_tweet_text )
414
421
transient_tweet_text = identify_Savings (transient_tweet_text )
415
422
transient_tweet_text = indentify_Offers (transient_tweet_text )
@@ -424,4 +431,4 @@ def process_TweetText(tweet_text):
424
431
return transient_tweet_text
425
432
426
433
# if __name__ == "__main__":
427
- # print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))
434
+ # print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))
0 commit comments