diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 91e08134..db2e4ef2 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -22,6 +22,7 @@
     ['pres_image_capture'],
     ['task_tracker'],
     ['overview_in_tasks'],
+    ['pres_abbreviations_check'],
 ]
 BASE_REPORT_CRITERION = [
     ["simple_check"],
@@ -50,6 +51,7 @@
     ["empty_task_page_check"],
     ["water_in_the_text_check"],
     ["report_task_tracker"],
+    ["report_abbreviations_check"],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/check_abbreviations.py b/app/main/checks/check_abbreviations.py
new file mode 100644
index 00000000..0fd0ff4d
--- /dev/null
+++ b/app/main/checks/check_abbreviations.py
@@ -0,0 +1,150 @@
+import re
+
+from pymorphy3 import MorphAnalyzer
+
+morph = MorphAnalyzer()
+
+# An abbreviation is a standalone run of 2-5 uppercase Cyrillic/Latin letters.
+ABBREVIATION_PATTERN = re.compile(r'\b[А-ЯA-Z]{2,5}\b')
+
+# Well-known abbreviations that do not need an explanation in the text.
+COMMON_ABBREVIATIONS = frozenset({
+    'СССР', 'РФ', 'США', 'ВКР', 'ИТ', 'ПО', 'ООО', 'ЗАО', 'ОАО', 'HTML', 'CSS',
+    'JS', 'ЛЭТИ', 'МОЕВМ', 'МОЭВМ', 'ЭВМ', 'ГОСТ', 'DVD',
+    'SSD', 'PC', 'HDD',
+    'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
+    'AH', 'AL', 'BH', 'BL', 'CH', 'CL', 'DH', 'DL',
+    'CS', 'DS', 'ES', 'SS', 'FS', 'GS',
+    'IP', 'EIP', 'RIP', 'URL',
+    'CF', 'PF', 'AF', 'ZF', 'SF', 'TF', 'IF', 'DF', 'OF',
+    'EAX', 'EBX', 'ECX', 'EDX', 'ESI', 'EDI', 'EBP', 'ESP',
+    'RAX', 'RBX', 'RCX', 'RDX', 'RSI', 'RDI', 'RBP', 'RSP',
+    'DOS', 'OS', 'BIOS', 'UEFI', 'MBR', 'GPT',
+    'ASCII', 'UTF', 'UNICODE', 'ANSI',
+    'CPU', 'GPU', 'APU', 'RAM', 'ROM', 'PROM', 'EPROM', 'EEPROM',
+    'USB', 'SATA', 'PCI', 'PCIe', 'AGP', 'ISA', 'VGA', 'HDMI', 'DP',
+    'LAN', 'WAN', 'WLAN', 'VPN', 'ISP', 'DNS', 'DHCP', 'TCP', 'UDP',
+    'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS',
+    'API', 'GUI', 'CLI', 'IDE', 'SDK', 'SQL', 'NoSQL', 'XML', 'JSON', 'YAML',
+    'MAC', 'IBM', 'ООП', 'ЛР', 'КР', 'ОТЧЕТ',
+})
+
+
+def get_unexplained_abbrev(text, title_page):
+    """Collect the abbreviations of ``text`` that lack a valid explanation.
+
+    Returns ``(found, unexplained)``: ``found`` is False when the text has no
+    abbreviations to check; ``unexplained`` lists those without an explanation
+    whose first letters spell the abbreviation.
+    """
+    abbreviations = find_abbreviations(text, title_page)
+    if not abbreviations:
+        return False, []
+
+    unexplained_abbr = [
+        abbr for abbr in abbreviations
+        if not is_abbreviation_explained(abbr, text)
+    ]
+    return True, unexplained_abbr
+
+
+def find_abbreviations(text: str, title_page: str):
+    """Return the distinct abbreviations of ``text`` in order of first use.
+
+    Common abbreviations, tokens that occur on the title page and tokens whose
+    first pymorphy parse has a zero score are skipped.
+    """
+    candidates = dict.fromkeys(ABBREVIATION_PATTERN.findall(text))
+    return [
+        abbr for abbr in candidates
+        if abbr not in COMMON_ABBREVIATIONS
+        and abbr not in title_page
+        and morph.parse(abbr.lower())[0].score != 0
+    ]
+
+
+def is_abbreviation_explained(abbr: str, text: str) -> bool:
+    """Tell whether ``text`` contains an explanation of ``abbr``.
+
+    Recognised forms: ``ABBR (explanation)``, ``(explanation) ABBR``,
+    ``ABBR - explanation`` and ``explanation - ABBR``; the separator may be an
+    em dash, an en dash or a hyphen.
+    """
+    token = rf'\b{re.escape(abbr)}\b'
+    patterns = (
+        rf'{token}\s*\(([^)]+)\)',
+        rf'\(([^)]+)\)\s*{token}',
+        rf'{token}\s*[—–-]\s*([^.,;!?]+)',
+        rf'([^.,;!?]+)\s*[—–-]\s*{token}',
+    )
+    # Every occurrence is examined: an abbreviation may be used several times
+    # and only one of those uses has to carry the explanation.
+    return any(
+        correctly_explained(abbr, match.group(1))
+        for pattern in patterns
+        for match in re.finditer(pattern, text, re.IGNORECASE)
+    )
+
+
+def correctly_explained(abbr, explan):
+    """Check that the first letters of the words in ``explan`` spell ``abbr``."""
+    first_letters = "".join(word[0] for word in explan.split())
+    return first_letters.upper() == abbr.upper()
+
+
+def main_check(text: str, title_page: str):
+    """Run the abbreviation check over the whole document text.
+
+    Returns ``(continue_check, message, unexplained)``. ``continue_check`` is
+    True only when unexplained abbreviations were found; otherwise ``message``
+    says why there is nothing to report and ``unexplained`` is empty.
+    """
+    if not text:
+        return False, "Не удалось получить текст", []
+
+    try:
+        abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text, title_page=title_page)
+    except Exception as e:
+        return False, f"Ошибка при проверке аббревиатур: {str(e)}", []
+
+    if not abbr_is_finding:
+        return False, "Аббревиатуры не найдены в представленном документе", []
+    if not unexplained_abbr:
+        return False, "Все аббревиатуры правильно расшифрованы", []
+    return True, "", unexplained_abbr
+
+
+def locate_first_use(abbreviations, numbered_texts):
+    """Map each abbreviation to the number of the first page/slide using it.
+
+    ``numbered_texts`` yields ``(number, text)`` pairs in reading order. Whole
+    words are matched so that a short abbreviation is not "found" inside a
+    longer uppercase word. Abbreviations that never occur are omitted.
+    """
+    pending = {abbr: re.compile(rf'\b{re.escape(abbr)}\b') for abbr in abbreviations}
+    first_use = {}
+    for number, page_text in numbered_texts:
+        if not pending:
+            break
+        for abbr in [a for a, regex in pending.items() if regex.search(page_text)]:
+            first_use[abbr] = number
+            del pending[abbr]
+    return first_use
+
+
+def forming_response(unexplained_abbr_with_page, format_page_link):
+    """Build the message listing unexplained abbreviations and their pages.
+
+    ``unexplained_abbr_with_page`` maps an abbreviation to the page/slide of
+    its first use; ``format_page_link`` turns the list of page numbers into the
+    list of links shown to the user.
+    """
+    page_links = format_page_link(list(unexplained_abbr_with_page.values()))
+    parts = ["Найдены нерасшифрованные аббревиатуры при первом использовании:<br>"]
+    parts.extend(
+        f"- {abbr} на {link} странице/слайде<br>"
+        for abbr, link in zip(unexplained_abbr_with_page, page_links)
+    )
+    parts.append("Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.<br>")
+    parts.append("Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.<br>")
+    return "".join(parts)
diff --git a/app/main/checks/presentation_checks/__init__.py b/app/main/checks/presentation_checks/__init__.py
index 8a0a64fb..288546f1 100644
--- a/app/main/checks/presentation_checks/__init__.py
+++ b/app/main/checks/presentation_checks/__init__.py
@@ -17,4 +17,5 @@
 from .name_of_image_check import PresImageCaptureCheck
 from .task_tracker import TaskTracker
 from .overview_in_tasks import OverviewInTasks
-from .decimal_places import PresDecimalPlacesCheck
\ No newline at end of file
+from .abbreviations_presentation import PresAbbreviationsCheck
+from .decimal_places import PresDecimalPlacesCheck
diff --git a/app/main/checks/presentation_checks/abbreviations_presentation.py b/app/main/checks/presentation_checks/abbreviations_presentation.py
new file mode 100644
index 00000000..1dafefdd
--- /dev/null
+++ b/app/main/checks/presentation_checks/abbreviations_presentation.py
@@ -0,0 +1,32 @@
+from ..base_check import BasePresCriterion, answer
+from ..check_abbreviations import forming_response, locate_first_use, main_check
+
+
+class PresAbbreviationsCheck(BasePresCriterion):
+    label = "Проверка расшифровки аббревиатур в презентации"
+    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
+    id = 'pres_abbreviations_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+
+    def check(self):
+        """Report abbreviations that are used on the slides without an explanation."""
+        try:
+            slides_text = self.file.get_text_from_slides()
+            # An empty deck has no title slide; main_check() reports the missing text.
+            title_page = slides_text[0] if slides_text else ""
+            full_text = " ".join(slides_text)
+
+            continue_check, res_str, unexplained_abbr = main_check(text=full_text, title_page=title_page)
+            if not continue_check:
+                return answer(True, res_str)
+
+            unexplained_abbr_with_slides = locate_first_use(unexplained_abbr, enumerate(slides_text, 1))
+            if not unexplained_abbr_with_slides:
+                return answer(True, "Все аббревиатуры правильно расшифрованы")
+
+            result_str = forming_response(unexplained_abbr_with_slides, self.format_page_link)
+            return answer(False, result_str)
+        except Exception as e:
+            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 7b1b974b..96b3fab0 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -13,7 +13,6 @@
 from .page_counter import ReportPageCounter
 from .right_words_check import ReportRightWordsCheck
 from .section_component import ReportSectionComponent
-from .sections_check import LRReportSectionCheck
 from .short_sections_check import ReportShortSectionsCheck
 from .simple_check import ReportSimpleCheck
 from .style_check_settings import StyleCheckSettings
@@ -33,5 +32,6 @@
 from .sw_keywords_check import SWKeywordsCheck
 from .task_tracker import ReportTaskTracker
 from .paragraphs_count_check import ReportParagraphsCountCheck
 from .template_name import ReportTemplateNameCheck
-from .decimal_places import ReportDecimalPlacesCheck
\ No newline at end of file
+from .abbreviations_check import ReportAbbreviationsCheck
+from .decimal_places import ReportDecimalPlacesCheck
diff --git a/app/main/checks/report_checks/abbreviations_check.py b/app/main/checks/report_checks/abbreviations_check.py
new file mode 100644
index 00000000..906378e0
--- /dev/null
+++ b/app/main/checks/report_checks/abbreviations_check.py
@@ -0,0 +1,51 @@
+from ..base_check import BaseReportCriterion, answer
+from ..check_abbreviations import forming_response, locate_first_use, main_check
+
+
+class ReportAbbreviationsCheck(BaseReportCriterion):
+    label = "Проверка расшифровки аббревиатур"
+    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
+    id = 'report_abbreviations_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+
+    def check(self):
+        """Report abbreviations that are used in the report without an explanation."""
+        try:
+            text = self._get_document_text()
+            pages = self.file.pdf_file.text_on_page
+            title_page = pages[1]
+
+            continue_check, res_str, unexplained_abbr = main_check(text=text, title_page=title_page)
+            if not continue_check:
+                return answer(True, res_str)
+
+            numbered_pages = (
+                (page_num, pages[page_num])
+                for page_num in range(1, self.file.page_counter() + 1)
+            )
+            unexplained_abbr_with_page = locate_first_use(unexplained_abbr, numbered_pages)
+            if not unexplained_abbr_with_page:
+                return answer(True, "Все аббревиатуры правильно расшифрованы")
+
+            result_str = forming_response(unexplained_abbr_with_page, self.format_page_link)
+            return answer(False, result_str)
+        except Exception as e:
+            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
+
+    def _get_document_text(self):
+        """Return the document text, or None when the file exposes no text source."""
+        if hasattr(self.file, 'pdf_file'):
+            page_texts = self.file.pdf_file.get_text_on_page()
+            return " ".join(page_texts.values())
+        if hasattr(self.file, 'paragraphs'):
+            text_parts = []
+            for paragraph in self.file.paragraphs:
+                text = paragraph.to_string()
+                # NOTE(review): keeps only the second line of a multi-line dump - confirm intent.
+                if '\n' in text:
+                    text = text.split('\n')[1]
+                text_parts.append(text)
+            return "\n".join(text_parts)
+        return None