-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIntroductionParagraphsCorrelation.py
More file actions
70 lines (53 loc) · 2.61 KB
/
IntroductionParagraphsCorrelation.py
File metadata and controls
70 lines (53 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from nltk.corpus import stopwords
import string
import numpy as np
import re
import pymorphy2
# Shared Russian morphological analyzer, created once at import time.
# The class below uses it to reduce words to their normal (dictionary) form.
morph = pymorphy2.MorphAnalyzer(lang='ru')
import spacy
# Shared spaCy pipeline, loaded once at import time; the class below uses it
# to build documents and compute their similarity.
# NOTE(review): this is an English model, while the text passed to it has been
# filtered down to Cyrillic letters only -- confirm the model choice is intended.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): literal_eval is not referenced in the visible part of this file.
from ast import literal_eval
class IntroductionParagraphsCorrelation(object):
    """Measure how similar each section of a paper is to its introduction.

    On construction two columns are added to the dataframe that was passed in
    (the frame is modified in place):

    * ``si_corr_info`` -- a ``(mean_similarity, {section_name: similarity})``
      tuple for every row;
    * ``corr_sections_and_intro`` -- the mean similarity taken from that tuple.
    """

    # NLTK's Russian stop-word list extended with a few additional words
    # (given in their dictionary form) that should not take part in the
    # comparison.
    russian_stopwords = stopwords.words("russian")
    russian_stopwords = set(russian_stopwords + ['который', 'таблица', 'рисунок',
                                                 'тот', 'также', 'этот', 'это',
                                                 'такой', 'каждый', 'другой'])

    # Deletes ASCII punctuation outright (no space is inserted), so a
    # hyphenated word is fused into a single token.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    # Anything that is not a Russian letter. 'ё'/'Ё' are listed explicitly
    # because they lie outside the contiguous 'а'-'я' / 'А'-'Я' ranges.
    _NON_CYRILLIC_RE = re.compile(r'[^а-яА-ЯёЁ]')

    def __init__(self, df):
        """Compute the section/introduction similarity for every row of ``df``.

        :param df: dataframe with a ``dict_sections_texts`` column; each value
            is expected to be a mapping of section name -> section text
            (assumption based on how the values are used -- confirm against
            the caller). The frame is modified in place.
        """
        print("IntroductionParagraphsCorrelation is running")
        df['si_corr_info'] = df['dict_sections_texts'].map(self.__get_info_corr_sections_and_intro)
        # Mean similarity between the sections and the introduction.
        df['corr_sections_and_intro'] = df['si_corr_info'].apply(lambda x: x[0])
        self.df = df

    def get_results(self):
        """Return the dataframe and the section/introduction similarity column.

        :return: dict with keys ``"df"`` (the augmented dataframe) and
            ``"corr_sections_and_intro"`` (that column of the dataframe).
        """
        return {"df": self.df, "corr_sections_and_intro": self.df["corr_sections_and_intro"]}

    def __get_info_corr_sections_and_intro(self, paper_dict):
        """Compare every section of one paper with its introduction.

        :param paper_dict: mapping of section name -> section text.
        :return: ``(mean_similarity, {section_name: similarity})``. When the
            paper has no 'введение' section, or no other section is long
            enough to be compared, ``(0, {})`` is returned.
        """
        if 'введение' not in paper_dict:
            return 0, {}

        # The introduction is the same for every comparison: analyse it once.
        doc_intro = nlp(self.__sentence_preproc(paper_dict['введение']))

        dict_intro_parag_corr = {}
        for parag_name, parag_text in paper_dict.items():
            # Skip the introduction itself (and any section named after it).
            if 'введение' in parag_name:
                continue
            # Very short sections are not compared.
            if len(parag_text) < 30:
                continue
            doc_parag = nlp(self.__sentence_preproc(parag_text))
            dict_intro_parag_corr[parag_name] = doc_parag.similarity(doc_intro)

        # np.mean of an empty list is nan (with a RuntimeWarning); report the
        # same neutral result as for a paper without an introduction instead.
        if not dict_intro_parag_corr:
            return 0, {}

        avg_corr_sections_and_intro = np.mean(list(dict_intro_parag_corr.values()))
        return avg_corr_sections_and_intro, dict_intro_parag_corr

    def __sentence_preproc(self, sentence):
        """Normalize Russian text to a space-separated string of lemmas.

        ASCII punctuation is deleted, every remaining non-Russian character is
        turned into a word separator, each word is lower-cased and reduced to
        its normal form, and stop words and words the analyzer cannot tag are
        dropped.

        :param sentence: raw text.
        :return: the surviving lemmas joined by single spaces ('' if none).
        """
        sentence = sentence.translate(self._PUNCTUATION_TABLE)
        # Replace non-letters with spaces and let split() collapse the runs.
        # The spaces must NOT be removed before splitting, otherwise the
        # whole text degenerates into one giant token.
        words = self._NON_CYRILLIC_RE.sub(' ', sentence).split()

        lemmas = []
        for word in words:
            word = word.lower()  # the stop-word list is lower-case
            if word in self.russian_stopwords:
                continue
            parsed = morph.parse(word)[0]  # parse once, reuse tag and lemma
            if str(parsed.tag) == 'UNKN':
                continue
            lemma = parsed.normal_form
            # Also filter by lemma so that inflected forms of stop words
            # (e.g. 'таблицы' -> 'таблица') are removed as well.
            if lemma in self.russian_stopwords:
                continue
            lemmas.append(lemma)
        return ' '.join(lemmas)