-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDataAnalysis.py
More file actions
32 lines (23 loc) · 817 Bytes
/
DataAnalysis.py
File metadata and controls
32 lines (23 loc) · 817 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
_STOP_WORDS = None  # lazily-built cache of NLTK's English stop-word set


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def process(words, Id):
    """Tokenize an abstract, drop English stop words, and store the result.

    The filtered token list is written to the ``ProcessedAbstract`` field of
    the document whose ``_id`` equals ``Id`` in the module-level
    ``collection``.

    Args:
        words: Value of the document's ``AbstractText`` field. It is
            stringified before cleaning, so any type is accepted; ``None``
            is treated as empty text.
        Id: ``_id`` of the document to update.
    """
    # A missing field would otherwise be stringified to the literal "None"
    # and stored as a bogus token.
    text = "" if words is None else str(words)

    # Strip the literal key name that shows up in the stringified value
    # (presumably the field holds nested dicts -- verify against the data).
    text = re.sub(r"'AbstractText':", " ", text)
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)

    stop_words = _english_stop_words()
    # NLTK's stop-word list is lowercase, so compare on the lowercased token
    # (otherwise "The", "And", ... slip through); original casing is kept.
    filtered_sentence = [
        token for token in word_tokenize(letters_only)
        if token.lower() not in stop_words
    ]

    # Collection.update() was removed in PyMongo 4; `_id` is unique, so
    # update_one() matches the same single document.
    collection.update_one(
        {'_id': Id},
        {'$set': {"ProcessedAbstract": filtered_sentence}},
    )
# Connect once to the local MongoDB server (a second, unused client was
# previously created and immediately discarded).
client = MongoClient('localhost', 27017)
db = client.local
collection = db.Bio  # read as a global by process()

# Request only the field read below; `_id` is always included by default.
cursor = collection.find({}, {'AbstractText': 1})
for doc in cursor:
    process(doc.get('AbstractText'), doc.get('_id'))