-
Notifications
You must be signed in to change notification settings - Fork 2
Checks abbreviations press and reports #759
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
LapshinAE0
wants to merge
32
commits into
dev
Choose a base branch
from
555_check_abbreviations
base: dev
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
32 commits
Select commit
Hold shift + click to select a range
7316257
Checks abbreviations press and reports
LapshinAE0 c89d506
Fixed if present check
LapshinAE0 8394b5a
abbr taken out
LapshinAE0 975b4d2
correct check abbr
LapshinAE0 280d9cd
fixed checks 1
LapshinAE0 7fe11be
fixed checks 2
LapshinAE0 edd07ce
fixed checks 3
LapshinAE0 d696932
Merge branch 'master' into 555_check_abbreviations
LapshinAE0 ab8f2f7
Update banned_words_in_literature.py
HadronCollider 331a121
fix svg size and background color
HadronCollider ac65e06
fix dev docker compose
HadronCollider c69548c
print traceback to logs (and check result)
HadronCollider bbbf47d
little change for svg size
HadronCollider 31371af
add more feedback for UNEXPECTED_CHECK_FAIL_MSG
HadronCollider df61abd
little kostil'
HadronCollider c90d82d
update template results.html
HadronCollider 3269e53
569: fix big files in webpack
necit-dev bdfefdf
update some html-templates and styles
HadronCollider eece7ea
update Dockerfiles (base and main), requirements and some python-libs
HadronCollider 8280103
update main_character_check
HadronCollider c88f47e
add recheck test
HadronCollider 495b68d
Update style_check_settings.py
HadronCollider dcad677
add warned_words for banned_words_check
HadronCollider 8b5865e
add login_required and author check for result page
HadronCollider 02b70ba
little update for 404 page
HadronCollider 9a54360
all requier applyied-2
LapshinAE0 96cb746
all requier applyied-3
LapshinAE0 d2d09a6
Merge branch 'dev' into 555_check_abbreviations
LapshinAE0 d26fcd3
add case for title page
LapshinAE0 fd19063
dont check abbr title page
LapshinAE0 467bef5
Merge branch 'dev' into 555_check_abbreviations
HadronCollider f1a8f16
Merge branch 'dev' into 555_check_abbreviations
HadronCollider File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| import re | ||
| from pymorphy3 import MorphAnalyzer | ||
# Single analyzer instance created at import time; find_abbreviations() below
# calls morph.parse() on every abbreviation candidate.
morph = MorphAnalyzer()
|
|
||
|
|
||
def get_unexplained_abbrev(text, title_page):
    """Return abbreviations from ``text`` that have no valid expansion.

    Args:
        text: Full document text to scan.
        title_page: Title-page text; forwarded to ``find_abbreviations``.

    Returns:
        A ``(found, unexplained)`` tuple. ``found`` is False (with an empty
        list) when ``find_abbreviations`` reports no candidates at all;
        otherwise it is True and ``unexplained`` lists every candidate for
        which ``is_abbreviation_explained`` returned False.
    """
    candidates = find_abbreviations(text, title_page)
    if candidates:
        missing = [
            candidate
            for candidate in candidates
            if not is_abbreviation_explained(candidate, text)
        ]
        return True, missing
    return False, []
|
|
||
# 2-5 consecutive upper-case Cyrillic/Latin letters forming a whole word.
_ABBREVIATION_PATTERN = re.compile(r'\b[А-ЯA-Z]{2,5}\b')

# Well-known abbreviations that never need an expansion in the document.
# Every entry ends with a comma: a missing one silently glues two adjacent
# string literals together (e.g. 'DVD' 'SSD' -> 'DVDSSD').
_COMMON_ABBREVIATIONS = frozenset({
    'СССР', 'РФ', 'США', 'ВКР', 'ИТ', 'ПО', 'ООО', 'ЗАО', 'ОАО', 'HTML', 'CSS',
    'JS', 'ЛЭТИ', 'МОЕВМ', 'ЭВМ', 'ГОСТ', 'DVD',
    'SSD', 'PC', 'HDD',
    'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
    'AH', 'AL', 'BH', 'BL', 'CH', 'CL', 'DH', 'DL',
    'CS', 'DS', 'ES', 'SS', 'FS', 'GS',
    'IP', 'EIP', 'RIP', 'URL',
    'CF', 'PF', 'AF', 'ZF', 'SF', 'TF', 'IF', 'DF', 'OF',
    'EAX', 'EBX', 'ECX', 'EDX', 'ESI', 'EDI', 'EBP', 'ESP',
    'RAX', 'RBX', 'RCX', 'RDX', 'RSI', 'RDI', 'RBP', 'RSP',
    'DOS', 'OS', 'BIOS', 'UEFI', 'MBR', 'GPT',
    'ASCII', 'UTF', 'UNICODE', 'ANSI',
    'МОЭВМ',
    'CPU', 'GPU', 'APU', 'RAM', 'ROM', 'PROM', 'EPROM', 'EEPROM',
    'USB', 'SATA', 'PCI', 'PCIe', 'AGP', 'ISA', 'VGA', 'HDMI', 'DP',
    'LAN', 'WAN', 'WLAN', 'VPN', 'ISP', 'DNS', 'DHCP', 'TCP', 'UDP',
    'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS',
    'API', 'GUI', 'CLI', 'IDE', 'SDK', 'SQL', 'NoSQL', 'XML', 'JSON', 'YAML',
    'MAC', 'IBM', 'ООП', 'ЛР', 'КР', 'ОТЧЕТ',
})


def find_abbreviations(text: str, title_page: str):
    """Extract abbreviation candidates from ``text`` that need an expansion.

    A candidate is a whole word of 2-5 upper-case Cyrillic or Latin letters.
    It is dropped when it is a well-known abbreviation, when it occurs
    anywhere in ``title_page`` (substring test), or when the first pymorphy
    parse of its lower-cased form has a score of exactly 0.
    NOTE(review): the intent of the zero-score filter is not documented in
    the source - confirm against pymorphy3 semantics.

    Args:
        text: Document text to scan.
        title_page: Title-page text; abbreviations found there are ignored.

    Returns:
        A list of unique candidates in order of first appearance in ``text``
        (deterministic, unlike iterating a set).
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    unique_candidates = dict.fromkeys(_ABBREVIATION_PATTERN.findall(text))
    return [
        abbr
        for abbr in unique_candidates
        if abbr not in _COMMON_ABBREVIATIONS
        and abbr not in title_page
        and morph.parse(abbr.lower())[0].score != 0
    ]
|
|
||
def is_abbreviation_explained(abbr: str, text: str) -> bool:
    """Tell whether ``text`` contains a valid expansion of ``abbr``.

    Six textual layouts are recognised (see the pattern list). Every match of
    every layout is tried, so an unrelated earlier match such as
    ``ABBR (see fig. 1)`` no longer hides a proper expansion further on.
    Matching is case-insensitive.

    Args:
        abbr: Abbreviation to look for.
        text: Text to search.

    Returns:
        True if any captured expansion passes ``correctly_explained``.
    """
    token = re.escape(abbr)  # treat the abbreviation literally inside the regex
    # The hyphen-only variants are kept on purpose: matches of one pattern do
    # not overlap, so the broader dash pattern can swallow a later hyphenated
    # occurrence inside its capture group.
    patterns = [
        rf'{token}\s*\(([^)]+)\)',           # ABBR (expansion)
        rf'\(([^)]+)\)\s*{token}',           # (expansion) ABBR
        rf'{token}\s*[—\-]\s*([^.,;!?]+)',   # ABBR — expansion
        rf'{token}\s*-\s*([^.,;!?]+)',       # ABBR - expansion
        rf'([^.,;!?]+)\s*[—\-]\s*{token}',   # expansion — ABBR
        rf'([^.,;!?]+)\s*-\s*{token}',       # expansion - ABBR
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            if correctly_explained(abbr, match.group(1)):
                return True

    return False
|
|
||
def correctly_explained(abbr, explan):
    """Check that the initials of ``explan`` spell ``abbr``.

    The comparison is case-insensitive. For each whitespace-separated word
    the first alphanumeric character is used, so surrounding quotes or
    brackets do not break the match; tokens without any alphanumeric
    character (for example a stray dash) are ignored.

    Args:
        abbr: Abbreviation, e.g. ``"МВД"``.
        explan: Candidate expansion, e.g. ``"Министерство внутренних дел"``.

    Returns:
        True when the collected initials equal ``abbr``.
    """
    initials = []
    for word in explan.split():
        lead = next((char for char in word if char.isalnum()), None)
        if lead is not None:
            initials.append(lead.upper())

    return "".join(initials) == abbr.upper()
|
|
||
def main_check(text: str, title_page: str):
    """Run the abbreviation check and report whether callers must go on.

    Each terminal condition returns immediately, so its message cannot be
    overwritten by a later one and no extraction is attempted on empty text.

    Args:
        text: Full document text.
        title_page: Title-page text (abbreviations found there are skipped).

    Returns:
        ``(continue_check, message, unexplained_abbr)``. ``continue_check`` is
        True only when unexplained abbreviations were found; in that case
        ``message`` is empty and the list is non-empty. Otherwise ``message``
        states why there is nothing further to do and the list is empty.
        Exceptions are not propagated: they are reported through ``message``
        with ``continue_check`` set to False.
    """
    try:
        if not text:
            return False, "Не удалось получить текст", []

        abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text, title_page=title_page)

        if not abbr_is_finding:
            return False, "Аббревиатуры не найдены в представленном документе", []

        if not unexplained_abbr:
            return False, "Все аббревиатуры правильно расшифрованы", []

        return True, "", unexplained_abbr

    except Exception as e:
        # Same empty-list shape as every other "nothing to report" outcome.
        return False, f"Ошибка при проверке аббревиатур: {str(e)}", []
|
|
||
def forming_response(unexplained_abbr_with_page, format_page_link):
    """Build the HTML feedback listing unexplained abbreviations.

    Args:
        unexplained_abbr_with_page: Mapping of abbreviation to the page or
            slide number of its first occurrence.
        format_page_link: Callable that receives the list of page numbers and
            returns an indexable collection with one formatted link per
            number, in the same order.

    Returns:
        A string in which every line is terminated with ``<br>``.
    """
    page_links = format_page_link(list(unexplained_abbr_with_page.values()))

    lines = ["Найдены нерасшифрованные аббревиатуры при первом использовании:"]
    lines.extend(
        f"- {abbr} на {page_links[index]} странице/слайде"
        for index, abbr in enumerate(unexplained_abbr_with_page)
    )
    lines.append("Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.")
    # Grammar of the user-facing hint fixed ("должна ... по первым буквам").
    lines.append("Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.")

    return "".join(f"{line}<br>" for line in lines)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
app/main/checks/presentation_checks/abbreviations_presentation.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| import re | ||
| from ..base_check import BasePresCriterion, answer | ||
| from ..check_abbreviations import main_check, forming_response | ||
|
|
||
|
|
||
class PresAbbreviationsCheck(BasePresCriterion):
    """Presentation criterion: abbreviations must be expanded in the slides."""

    label = "Проверка расшифровки аббревиатур в презентации"
    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
    id = 'pres_abbreviations_check'

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the check over all slides and return the ``answer(...)`` result.

        The first slide is treated as the title page. When ``main_check``
        reports nothing further to do, its message is returned with a True
        verdict. Otherwise each unexplained abbreviation is mapped to the
        first slide that contains it as a whole word and the check fails
        with a formatted list. Any exception is turned into a failing answer.
        """
        try:
            slides_text = self.file.get_text_from_slides()
            title_page = slides_text[0]
            full_text = " ".join(slides_text)

            continue_check, res_str, unexplained_abbr = main_check(text=full_text, title_page=title_page)
            if not continue_check:
                return answer(True, res_str)

            # Abbreviations are extracted as whole words, so locate them the
            # same way: a plain substring test could pick an earlier slide
            # where the letters only occur inside a longer token.
            whole_word = {
                abbr: re.compile(rf'\b{re.escape(abbr)}\b')
                for abbr in unexplained_abbr
            }
            unexplained_abbr_with_slides = {}

            for slide_num, slide_text in enumerate(slides_text, 1):
                for abbr in unexplained_abbr:
                    if abbr in unexplained_abbr_with_slides:
                        continue  # first occurrence already recorded
                    if whole_word[abbr].search(slide_text):
                        unexplained_abbr_with_slides[abbr] = slide_num

            if not unexplained_abbr_with_slides:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            result_str = forming_response(unexplained_abbr_with_slides, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
import re

from ..base_check import BaseReportCriterion, answer
from ..check_abbreviations import main_check, forming_response
|
|
||
|
|
||
class ReportAbbreviationsCheck(BaseReportCriterion):
    """Report criterion: abbreviations must be expanded in the document."""

    label = "Проверка расшифровки аббревиатур"
    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
    id = 'report_abbreviations_check'

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the check over the report and return the ``answer(...)`` result.

        Page 1 of the PDF text is treated as the title page. When
        ``main_check`` reports nothing further to do, its message is returned
        with a True verdict. Otherwise each unexplained abbreviation is mapped
        to the first page that contains it as a whole word and the check
        fails with a formatted list. Any exception is turned into a failing
        answer.
        """
        try:
            text = self._get_document_text()
            title_page = self.file.pdf_file.text_on_page[1]

            continue_check, res_str, unexplained_abbr = main_check(text=text, title_page=title_page)
            if not continue_check:
                return answer(True, res_str)

            # Abbreviations are extracted as whole words, so locate them the
            # same way: a plain substring test could pick an earlier page
            # where the letters only occur inside a longer token.
            whole_word = {
                abbr: re.compile(rf'\b{re.escape(abbr)}\b')
                for abbr in unexplained_abbr
            }
            unexplained_abbr_with_page = {}

            for page_num in range(1, self.file.page_counter() + 1):
                text_on_page = self.file.pdf_file.text_on_page[page_num]

                for abbr in unexplained_abbr:
                    if abbr in unexplained_abbr_with_page:
                        continue  # first occurrence already recorded
                    if whole_word[abbr].search(text_on_page):
                        unexplained_abbr_with_page[abbr] = page_num

            if not unexplained_abbr_with_page:
                return answer(True, "Все аббревиатуры правильно расшифрованы")
            result_str = forming_response(unexplained_abbr_with_page, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")

    def _get_document_text(self):
        """Return the document text as one string, or None if unavailable.

        Uses the per-page PDF text joined with spaces when the file object
        has a ``pdf_file`` attribute; otherwise falls back to the file's
        paragraphs joined with newlines.
        """
        if hasattr(self.file, 'pdf_file'):
            page_texts = self.file.pdf_file.get_text_on_page()
            return " ".join(page_texts.values())
        elif hasattr(self.file, 'paragraphs'):
            text_parts = []
            for paragraph in self.file.paragraphs:
                text = paragraph.to_string()
                if '\n' in text:
                    # NOTE(review): keeps only the second line - presumably
                    # the first line of to_string() is a header; confirm.
                    text = text.split('\n')[1]
                text_parts.append(text)
            return "\n".join(text_parts)
        return None
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
С учетом, что данный код 1 в 1 дублируется в обоих критериях (за исключением строк с указанием документа/презентации и получения данных), его стоит вынести в отдельную функцию/модуль