nlp-docx/IntroductionParagraphsCorrelation.py at master · normcontrol/nlp-docx · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from nltk.corpus import stopwords
import string
import numpy as np
import re
import pymorphy2

# Shared pymorphy2 analyzer configured for Russian; created once at import
# time and reused for every word that gets lemmatized.
morph = pymorphy2.MorphAnalyzer(lang='ru')
import spacy

# Shared spaCy pipeline used to compare documents via `Doc.similarity`.
# NOTE(review): the model name "en_core_web_sm" looks like an English pipeline,
# while the class in this module only passes it Cyrillic text — confirm that
# this is intentional and that the similarity scores are meaningful.
nlp = spacy.load("en_core_web_sm")
from ast import literal_eval


class IntroductionParagraphsCorrelation(object):
    """Measure how similar each section of a paper is to its introduction.

    The constructor expects a DataFrame with a ``dict_sections_texts`` column
    whose values are mappings ``section name -> section text``. It adds two
    columns to that same DataFrame (in place):

    * ``si_corr_info`` -- tuple ``(average similarity, {section: similarity})``;
    * ``corr_sections_and_intro`` -- the average similarity alone.
    """

    # NLTK's Russian stop words extended with frequent lemmas that carry no
    # topical information in papers ("which", "table", "figure", ...).
    russian_stopwords = stopwords.words("russian")
    russian_stopwords = set(russian_stopwords + ['который', 'таблица', 'рисунок',
                                                 'тот', 'также', 'этот', 'это',
                                                 'такой', 'каждый', 'другой'])

    def __init__(self, df):
        """Compute the introduction/section similarity for every row of *df*.

        :param df: DataFrame with a ``dict_sections_texts`` column; it is
            modified in place and also stored as ``self.df``.
        """
        self.df = df
        print("IntroductionParagraphsCorrelation is running")

        df['si_corr_info'] = df['dict_sections_texts'].map(self.__get_info_corr_sections_and_intro)
        # Average similarity between the sections and the introduction.
        df['corr_sections_and_intro'] = df['si_corr_info'].apply(lambda x: x[0])

    def get_results(self):
        """Return the DataFrame and the text-to-introduction similarity column.

        :return: dict with keys ``"df"`` (the augmented DataFrame) and
            ``"corr_sections_and_intro"`` (the similarity column).
        """
        return {"df": self.df, "corr_sections_and_intro": self.df["corr_sections_and_intro"]}

    def __get_info_corr_sections_and_intro(self, paper_dict):
        """Compare every sufficiently long section with the introduction.

        :param paper_dict: mapping ``section name -> section text``.
        :return: tuple ``(average similarity, {section name: similarity})``.
            ``(0, {})`` is returned when the paper has no ``'введение'``
            section or when no other section is long enough to be compared.
        """
        if 'введение' not in paper_dict:
            return 0, {}

        # The introduction is the same for every comparison: parse it once.
        doc_intro = nlp(self.__sentence_preproc(paper_dict['введение']))

        dict_intro_parag_corr = {}
        for parag_name, parag_text in paper_dict.items():
            # Skip the introduction itself (and any section named after it).
            if 'введение' in parag_name:
                continue
            # Sections shorter than 30 characters are too short to compare.
            if len(parag_text) < 30:
                continue

            doc_parag = nlp(self.__sentence_preproc(parag_text))
            dict_intro_parag_corr[parag_name] = doc_parag.similarity(doc_intro)

        # np.mean([]) would yield NaN plus a RuntimeWarning; report "no
        # correlation" the same way as for a missing introduction instead.
        if not dict_intro_parag_corr:
            return 0, {}

        avg_corr_sections_and_intro = np.mean(list(dict_intro_parag_corr.values()))
        return avg_corr_sections_and_intro, dict_intro_parag_corr

    def __sentence_preproc(self, sentence):
        """Normalize Russian text into a space-separated string of lemmas.

        ASCII punctuation is removed, every remaining non-Cyrillic character
        is treated as a word separator, stop words and words unknown to the
        morphological analyzer are dropped, and the rest is lemmatized.

        :param sentence: raw text.
        :return: lemmas joined by single spaces (may be an empty string).
        """
        sentence = ''.join([ch for ch in sentence if ch not in string.punctuation])
        # 'ё'/'Ё' lie outside the а-я/А-Я code point ranges and must be listed
        # explicitly, otherwise words containing them are torn apart.
        # str.split() already collapses whitespace runs; deleting pairs of
        # spaces beforehand could glue neighbouring words together.
        words = re.sub(r'[^а-яА-ЯёЁ]', ' ', sentence).lower().split()

        sentence_new = []
        for word in words:
            if word in self.russian_stopwords:
                continue
            parsed = morph.parse(word)[0]  # parse once, reuse tag and lemma
            if str(parsed.tag) == 'UNKN':
                continue
            lemma = parsed.normal_form
            # The custom stop words are lemmas, so inflected forms have to be
            # matched after normalization as well.
            if lemma in self.russian_stopwords:
                continue
            sentence_new.append(lemma)

        return ' '.join(sentence_new)