Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
["max_abstract_size_check"],
["theme_in_report_check"],
["empty_task_page_check"],
["water_in_the_text_check"],
]

# NOTE(review): presumably the file type assumed when none is specified, with
# 'pres' standing for "presentation" — confirm against the callers of this module.
DEFAULT_TYPE = 'pres'
Expand Down
1 change: 1 addition & 0 deletions app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
from .empty_task_page_check import EmptyTaskPageCheck
from .water_in_the_text_check import WaterInTheTextCheck
from .sw_section_banned_words import SWSectionBannedWordsCheck
from .sw_section_lit_reference import SWSectionLiteratureReferenceCheck
from .sw_tasks import SWTasksCheck
Expand Down
87 changes: 87 additions & 0 deletions app/main/checks/report_checks/water_in_the_text_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import re
from collections import Counter
from ..base_check import BaseReportCriterion, answer, morph
from .watery_phrase_settings import WateryPhrase

class WaterInTheTextCheck(BaseReportCriterion):
    """Report criterion that flags chapters containing too much "water".

    For every chapter preceding the list of references three ratios are
    computed and compared with configurable thresholds:

    * share of watery phrases/words relative to the number of words;
    * share of sentences longer than ``long_sentence_word_limit`` words;
    * share of "meaningful" words (nouns, verbs, adjectives, infinitives).
    """

    label = "Проверка объема воды в тексте"
    description = ''
    id = 'water_in_the_text_check'

    # Part-of-speech tags (as reported by ``morph``) counted as meaningful.
    _MEANINGFUL_POS = frozenset({'NOUN', 'VERB', 'ADJF', 'ADJS', 'INFN'})
    # Anything that is not a lowercase Cyrillic letter, whitespace or hyphen.
    _NON_WORD_CHARS_RE = re.compile(r'[^а-яё\s-]')
    # A word, optionally hyphenated ("во-первых").
    _WORD_RE = re.compile(r'\b\w+(?:-\w+)*\b')
    # Sentence terminator followed by whitespace or end of text, so that
    # decimals such as "3.14" do not split a sentence.
    _SENTENCE_END_RE = re.compile(r'[.!?]+(?:\s+|$)')
    # Chapters with this many sentences or fewer are too short to judge.
    _MIN_SENTENCES = 3

    # TODO: the default values of watery_phrase_threshold,
    # long_sentence_threshold, meaningful_word_threshold and
    # long_sentence_word_limit still need to be tuned.
    def __init__(self, file_info, watery_phrase_threshold=0.3, long_sentence_threshold=0.3,
                 meaningful_word_threshold=0.6, long_sentence_word_limit=20):
        """Store the thresholds; the chapters are loaded lazily in ``late_init``.

        Args:
            file_info: passed through to ``BaseReportCriterion``.
            watery_phrase_threshold: maximum allowed watery phrase density.
            long_sentence_threshold: maximum allowed share of long sentences.
            meaningful_word_threshold: minimum required share of meaningful words.
            long_sentence_word_limit: a sentence with more words than this is "long".
        """
        super().__init__(file_info)
        self.chapters = []
        self.watery_phrase = None
        self.watery_words = None
        self.watery_phrase_threshold = watery_phrase_threshold
        self.long_sentence_threshold = long_sentence_threshold
        self.meaningful_word_threshold = meaningful_word_threshold
        self.long_sentence_word_limit = long_sentence_word_limit

    def late_init(self):
        """Split the report into chapters for the configured report type."""
        self.chapters = self.file.make_chapters(self.file_type['report_type'])

    def check(self):
        """Run the criterion and return the result built by ``answer``.

        The check fails early for reports shorter than four pages. Chapters
        are processed in order until the list of references is reached.
        """
        self.watery_phrase = WateryPhrase.INTRODUCTORY_PHRASE
        self.watery_words = WateryPhrase.SERVICE_WORDS + WateryPhrase.ABSTRACT_WORDS
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
        self.late_init()

        remarks = []
        for chapter in self.chapters:
            title = chapter['text']
            # Everything from the references chapter onwards is not analysed.
            if 'список использованных источников' in title.lower():
                break
            text = self.get_chapter_text(chapter)
            words = self.get_words(text)
            if self.watery_phrase_density(text, words) > self.watery_phrase_threshold:
                remarks.append(
                    f"В разделе '{title}' содержится более {self.watery_phrase_threshold*100}% 'водянистых' фраз. Попробуйте уменьшить количество водянистых слов и фраз.<br>"
                )
            if self.long_sentences_density(text) > self.long_sentence_threshold:
                remarks.append(
                    f"В разделе '{title}' более {self.long_sentence_threshold*100}% предложений длиннее {self.long_sentence_word_limit} слов. Используйте более короткие предложения.<br>"
                )
            if self.meaningful_word_density(words) < self.meaningful_word_threshold:
                remarks.append(
                    f"В разделе '{title}' доля значимых слов составляет менее {self.meaningful_word_threshold*100}% от общего количества слов. Уменьшите количество вспомогательных частей речи.<br>"
                )

        if not remarks:
            return answer(True, "Пройдена!")

        # Explain the terminology once, before the per-chapter remarks.
        header = (
            "Значимыми словами в рамках критерия считаются существительные, прилагательные/краткие прилагательные и глаголы.<br><br>\n"
            "Водянистыми словами и фразами являются:<br>\n"
            f"Служебные слова: {WateryPhrase.SERVICE_WORDS}<br>\n"
            f"Вводные конструкции: {WateryPhrase.INTRODUCTORY_PHRASE}<br>\n"
            f"Абстрактные слова: {WateryPhrase.ABSTRACT_WORDS}<br><br>\n"
        )
        return answer(False, header + "".join(remarks))

    def get_chapter_text(self, chapter):
        """Return the texts of the chapter's children, each preceded by a space."""
        return "".join(" " + child['text'] for child in chapter['child'])

    def get_words(self, text):
        """Return the lowercase Cyrillic words of ``text`` (hyphenated words kept whole).

        Characters outside the Cyrillic alphabet are replaced with a space
        rather than removed, so neighbouring words separated only by
        punctuation or digits ("и/или") are not glued into one token.
        """
        cyrillic_only = self._NON_WORD_CHARS_RE.sub(' ', text.lower())
        return self._WORD_RE.findall(cyrillic_only)

    def watery_phrase_density(self, text, words):
        """Return (watery phrase hits + watery word hits) / number of words.

        Phrases are matched as whole words/phrases, not as raw substrings, so
        "значит" is not found inside "значительно" and "в общем" is not
        counted a second time inside "в общем-то". Returns 0 for no words.
        """
        if not words:
            return 0
        hits = 0
        # Longest first so the longer of two overlapping phrases wins.
        phrases = sorted(self.watery_phrase or (), key=len, reverse=True)
        if phrases:
            phrase_re = re.compile(
                r'(?<![\w-])(?:' + '|'.join(map(re.escape, phrases)) + r')(?![\w-])'
            )
            # Collapse whitespace so multi-word phrases survive line breaks.
            normalized = re.sub(r'\s+', ' ', text.lower())
            hits += len(phrase_re.findall(normalized))
        watery_words = frozenset(self.watery_words or ())
        hits += sum(word in watery_words for word in words)
        return hits / len(words)

    def long_sentences_density(self, text):
        """Return the share of sentences longer than ``long_sentence_word_limit`` words.

        Empty fragments produced by the split are ignored so they do not
        dilute the ratio. Returns 0 when the text has three sentences or fewer.
        """
        sentences = [
            fragment for fragment in self._SENTENCE_END_RE.split(text)
            if fragment.strip()
        ]
        if len(sentences) <= self._MIN_SENTENCES:
            return 0
        long_count = sum(
            len(sentence.split()) > self.long_sentence_word_limit
            for sentence in sentences
        )
        return long_count / len(sentences)

    def meaningful_word_density(self, words):
        """Return the share of words whose first ``morph`` parse is a meaningful POS.

        Each distinct word is parsed only once. Returns 1 for an empty list so
        that an empty chapter is never reported.
        """
        if not words:
            return 1
        meaningful = sum(
            count
            for word, count in Counter(words).items()
            if morph.parse(word)[0].tag.POS in self._MEANINGFUL_POS
        )
        return meaningful / len(words)
20 changes: 20 additions & 0 deletions app/main/checks/report_checks/watery_phrase_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
class WateryPhrase:
    """Word and phrase lists used to estimate how "watery" a text is.

    All three attributes are plain lists of lowercase strings; their order
    is kept as is because the lists are rendered verbatim in the feedback.
    """

    # Service words: prepositions, particles, pronouns, auxiliary verbs.
    SERVICE_WORDS = [
        "а", "бы", "был", "была", "были", "будет", "будешь", "буду",
        "вот", "всё", "вся", "в", "для", "до", "его", "её",
        "ей", "если", "же", "за", "из", "и", "как", "когда",
        "кого", "ли", "лишь", "многие", "на", "не", "ни", "о",
        "об", "или", "он", "она", "они", "это", "к", "кто",
        "между", "над", "ну", "от", "по", "под", "при", "через",
        "с", "так", "та", "те", "то", "ты", "уж", "чтобы",
        "ещё",
    ]

    # Introductory (parenthetical) words and multi-word phrases.
    INTRODUCTORY_PHRASE = [
        "например", "так сказать", "кстати", "в общем", "по сути",
        "вроде", "между прочим", "короче", "если честно", "во-первых",
        "во-вторых", "итак", "значит", "другими словами", "с другой стороны",
        "по-моему", "по идее", "к слову", "собственно", "в принципе",
        "не удивлюсь", "естественно", "по правде говоря", "безусловно", "пожалуй",
        "как правило", "почему-то", "честно говоря", "вдобавок", "кроме того",
        "вдруг", "если так", "в общем-то", "скажем так",
    ]

    # Vague, non-specific words.
    ABSTRACT_WORDS = [
        "некоторый",
        "какой-то",
        "некто",
        "некоторые",
        "несколько",
        "некий",
        "такой-то",
    ]