From d6b163cfd01f2d92334c46a4a8bdbdaa97ac62dc Mon Sep 17 00:00:00 2001
From: ardnaxelas <aardnaxelass@gmail.com>
Date: Sat, 28 Sep 2024 00:45:20 +0300
Subject: [PATCH 01/20] v1: store pptx/docx images with captions in MongoDB

---
 app/db/db_methods.py                          | 13 ++++++++
 app/db/db_types.py                            | 15 +++++++++
 app/main/parser.py                            | 25 ++++++++++++--
 .../presentations/pptx/presentation_pptx.py   | 25 ++++++++++++++
 .../reports/docx_uploader/docx_uploader.py    | 33 +++++++++++++++++++
 5 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index d80d92db..31bf4c10 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -21,11 +21,24 @@
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
+images_collection = db['images']  # коллекция для хранения изображений
 
 
 def get_client():
     return client
 
+def save_image_to_db(check_id, image_data, caption):
+    from app.db.db_types import Image
+
+    image = Image({
+        'check_id': check_id,
+        'image_data': image_data,
+        'caption': caption
+    })
+    images_collection.insert_one(image.pack())
+
+    print("check_id----------",check_id)
+
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
diff --git a/app/db/db_types.py b/app/db/db_types.py
index eeeb26d9..53dd04b1 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -145,3 +145,18 @@ def none_to_false(x):
         is_ended = none_to_true(self.is_ended)  # None for old checks => True, True->True, False->False
         is_failed = none_to_false(self.is_failed)  # None for old checks => False, True->True, False->False
         return {'is_ended': is_ended, 'is_failed': is_failed}
+
+class Image(PackableWithId):
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        dictionary = dictionary or {}
+        self.check_id = dictionary.get('check_id')  # Привязка к check_id
+        self.caption = dictionary.get('caption', '')  # Подпись к изображению
+        self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
+
+    def pack(self):
+        package = super().pack()
+        package['check_id'] = str(self.check_id)
+        package['caption'] = self.caption
+        package['image_data'] = self.image_data
+        return package
diff --git a/app/main/parser.py b/app/main/parser.py
index 593b8cfd..07754aab 100644
--- a/app/main/parser.py
+++ b/app/main/parser.py
@@ -8,8 +8,11 @@
 from main.reports.md_uploader import MdUploader
 from utils import convert_to
 
-logger = logging.getLogger('root_logger')
+from os.path import basename
+from app.db.db_methods import add_check
+from app.db.db_types import Check
 
+logger = logging.getLogger('root_logger')
 
 def parse(filepath, pdf_filepath):
     tmp_filepath = filepath.lower()
@@ -19,7 +22,17 @@ def parse(filepath, pdf_filepath):
             if tmp_filepath.endswith(('.odp', '.ppt')):
                 logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
                 new_filepath = convert_to(filepath, target_format='pptx')
-            file_object = PresentationPPTX(new_filepath)
+
+            presentation = PresentationPPTX(new_filepath)
+
+            check = Check({
+                'filename': basename(new_filepath),
+            })
+            check_id = add_check(23, check)
+            presentation.extract_images_with_captions(check_id)
+            file_object = presentation
+
+
         elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
             new_filepath = filepath
             if tmp_filepath.endswith(('.doc', '.odt')):
@@ -28,7 +41,13 @@ def parse(filepath, pdf_filepath):
 
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
+            # Создание проверки
+            check = Check({
+                'filename': basename(new_filepath),
+            })
+            check_id = add_check(23, check)
             docx.parse()
+            docx.extract_images_with_captions(check_id)
             file_object = docx
 
         elif tmp_filepath.endswith('.md' ):
@@ -54,4 +73,4 @@ def save_to_temp_file(file):
     temp_file.write(file.read())
     temp_file.close()
     file.seek(0)
-    return temp_file.name
+    return temp_file.name
\ No newline at end of file
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
index dd909f8c..e78babb0 100644
--- a/app/main/presentations/pptx/presentation_pptx.py
+++ b/app/main/presentations/pptx/presentation_pptx.py
@@ -1,4 +1,5 @@
 from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
 
 from .slide_pptx import SlidePPTX
 from ..presentation_basic import PresentationBasic
@@ -17,3 +18,27 @@ def add_slides(self):
 
     def __str__(self):
         return super().__str__()
+
+    def extract_images_with_captions(self, check_id):
+        from app.db.db_methods import save_image_to_db
+
+        images_with_captions = []
+
+        for slide in self.prs.slides:
+            for shape in slide.shapes:
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    image_data = shape.image.blob  # Бинарные данные изображения
+                    caption = ""
+
+                    # Определение подписи. Предполагается, что подпись находится рядом с изображением
+                    if shape.has_text_frame:
+                        caption = shape.text.strip()
+                    else:
+                        # Альтернативный способ: поиск текстового поля рядом с изображением
+                        pass
+
+                    # Сохранение изображения и подписи в MongoDB
+                    save_image_to_db(check_id, image_data, caption)
+                    images_with_captions.append({"image_data": image_data, "caption": caption})
+
+        return images_with_captions
\ No newline at end of file
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index ac30dee4..d988fed2 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -242,6 +242,39 @@ def show_chapters(self, work_type):
                 chapters_str += "&nbsp;&nbsp;&nbsp;&nbsp;" + header["text"] + "<br>"
         return chapters_str
 
+    def extract_images_with_captions(self, check_id):
+        from app.db.db_methods import save_image_to_db
+        images_with_captions = []
+
+        # Получение всех встроенных фигур (inline_shapes)
+        for inline_shape in self.file.inline_shapes:
+            # Проверка, является ли встроенный объект изображением
+            if inline_shape.type == 3:  # Тип 3 соответствует изображению (PICTURE)
+                # Извлечение бинарных данных изображения
+                image_stream = inline_shape._inline.graphic.graphicData.pic.blipFill.blip.embed
+                image_data = self.file.part.related_parts[image_stream].blob  # Бинарные данные изображения
+
+                # Инициализация подписи
+                caption = ""
+
+                # Поиск параграфа, следующего за изображением, для извлечения подписи
+                for i, paragraph in enumerate(self.file.paragraphs):
+                    # Проверяем, находится ли изображение в параграфе
+                    inline_shape_xml = inline_shape._inline.xml
+                    if inline_shape_xml in paragraph._element.xml:
+                        # Если есть следующий параграф, предположим, что это подпись
+                        if i + 1 < len(self.file.paragraphs):
+                            next_paragraph = self.file.paragraphs[i + 1].text.strip()
+                            if next_paragraph:
+                                caption = next_paragraph
+                        break  # Найдено изображение, больше искать не нужно
+
+                # Сохранение изображения и подписи в MongoDB
+                save_image_to_db(check_id, image_data, caption)
+                images_with_captions.append({"image_data": image_data, "caption": caption})
+
+        return images_with_captions
+
 
 def main(args):
     file = args.file

From 88f199cabec056a2c658a82ea1cf996cd032fa13 Mon Sep 17 00:00:00 2001
From: ardnaxelas <aardnaxelass@gmail.com>
Date: Mon, 30 Sep 2024 01:30:16 +0300
Subject: [PATCH 02/20] v1.1: resolve file_id from files_info, rework extraction

---
 app/db/db_methods.py                          |  2 -
 app/main/parser.py                            | 20 ++++++-
 .../presentations/pptx/presentation_pptx.py   | 56 ++++++++++++-------
 .../reports/docx_uploader/docx_uploader.py    | 55 +++++++++---------
 4 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 31bf4c10..e63f1760 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -37,8 +37,6 @@ def save_image_to_db(check_id, image_data, caption):
     })
     images_collection.insert_one(image.pack())
 
-    print("check_id----------",check_id)
-
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
diff --git a/app/main/parser.py b/app/main/parser.py
index 07754aab..fb60a19d 100644
--- a/app/main/parser.py
+++ b/app/main/parser.py
@@ -15,6 +15,8 @@
 logger = logging.getLogger('root_logger')
 
 def parse(filepath, pdf_filepath):
+    from app.db.db_methods import files_info_collection
+
     tmp_filepath = filepath.lower()
     try:
         if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
@@ -28,7 +30,13 @@ def parse(filepath, pdf_filepath):
             check = Check({
                 'filename': basename(new_filepath),
             })
-            check_id = add_check(23, check)
+
+            file_id = 0
+            file = files_info_collection.find_one({'name': basename(new_filepath)})
+            if file:
+                file_id = file['_id']
+
+            check_id = add_check(file_id, check)
             presentation.extract_images_with_captions(check_id)
             file_object = presentation
 
@@ -41,11 +49,17 @@ def parse(filepath, pdf_filepath):
 
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
-            # Создание проверки
+
             check = Check({
                 'filename': basename(new_filepath),
             })
-            check_id = add_check(23, check)
+
+            file_id = 0
+            file = files_info_collection.find_one({'name': basename(new_filepath)})
+            if file:
+                file_id = file['_id']
+
+            check_id = add_check(file_id, check)
             docx.parse()
             docx.extract_images_with_captions(check_id)
             file_object = docx
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
index e78babb0..34f2081a 100644
--- a/app/main/presentations/pptx/presentation_pptx.py
+++ b/app/main/presentations/pptx/presentation_pptx.py
@@ -1,3 +1,5 @@
+from io import BytesIO
+
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE
 
@@ -22,23 +24,37 @@ def __str__(self):
     def extract_images_with_captions(self, check_id):
         from app.db.db_methods import save_image_to_db
 
-        images_with_captions = []
-
-        for slide in self.prs.slides:
-            for shape in slide.shapes:
-                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                    image_data = shape.image.blob  # Бинарные данные изображения
-                    caption = ""
-
-                    # Определение подписи. Предполагается, что подпись находится рядом с изображением
-                    if shape.has_text_frame:
-                        caption = shape.text.strip()
-                    else:
-                        # Альтернативный способ: поиск текстового поля рядом с изображением
-                        pass
-
-                    # Сохранение изображения и подписи в MongoDB
-                    save_image_to_db(check_id, image_data, caption)
-                    images_with_captions.append({"image_data": image_data, "caption": caption})
-
-        return images_with_captions
\ No newline at end of file
+        # Проход по каждому слайду в презентации
+        for slide in self.slides:
+            image_found = False
+            image_data = None
+            caption_text = None
+
+            # Проход по всем шейпам на слайде
+            for shape in slide.slide.shapes:  # Используем slide.slide для доступа к текущему слайду
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:  # Тип 13 соответствует PICTURE
+                    image_found = True
+                    image_part = shape.image  # Получаем объект изображения
+
+                    # Извлекаем бинарные данные изображения
+                    image_stream = image_part.blob
+                    image_data = BytesIO(image_stream)
+                    print(f"Изображение найдено на слайде {slide.index}")
+
+                # Если мы нашли изображение, ищем следующий непустой текст как подпись
+                if image_found:
+                    for shape in slide.slide.shapes:
+                        if not shape.has_text_frame:
+                            continue
+                        text = shape.text.strip()
+                        if text:  # Находим непустое текстовое поле (предположительно, это подпись)
+                            caption_text = text
+                            # Сохраняем изображение и его подпись
+                            save_image_to_db(check_id, image_data.getvalue(), caption_text)
+                            print(f"Подпись найдена: '{caption_text}' на слайде {slide.index}")
+                            break  # Предполагаем, что это подпись к текущему изображению
+
+                    # Сброс флага и данных изображения для следующего цикла
+                    image_found = False
+                    image_data = None
+                    caption_text = None
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index d988fed2..7c462207 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -244,36 +244,39 @@ def show_chapters(self, work_type):
 
     def extract_images_with_captions(self, check_id):
         from app.db.db_methods import save_image_to_db
-        images_with_captions = []
 
-        # Получение всех встроенных фигур (inline_shapes)
-        for inline_shape in self.file.inline_shapes:
-            # Проверка, является ли встроенный объект изображением
-            if inline_shape.type == 3:  # Тип 3 соответствует изображению (PICTURE)
-                # Извлечение бинарных данных изображения
-                image_stream = inline_shape._inline.graphic.graphicData.pic.blipFill.blip.embed
-                image_data = self.file.part.related_parts[image_stream].blob  # Бинарные данные изображения
+        image_found = False
+        image_data = None
 
-                # Инициализация подписи
-                caption = ""
+        # Проход по всем параграфам документа
+        for i, paragraph in enumerate(self.file.paragraphs):
+            # Проверяем, есть ли в параграфе встроенные объекты
+            for run in paragraph.runs:
+                if "graphic" in run._element.xml:  # Это может быть изображение
+                    image_found = True
 
-                # Поиск параграфа, следующего за изображением, для извлечения подписи
-                for i, paragraph in enumerate(self.file.paragraphs):
-                    # Проверяем, находится ли изображение в параграфе
-                    inline_shape_xml = inline_shape._inline.xml
-                    if inline_shape_xml in paragraph._element.xml:
-                        # Если есть следующий параграф, предположим, что это подпись
-                        if i + 1 < len(self.file.paragraphs):
-                            next_paragraph = self.file.paragraphs[i + 1].text.strip()
-                            if next_paragraph:
-                                caption = next_paragraph
-                        break  # Найдено изображение, больше искать не нужно
+                    # Извлечение бинарных данных изображения
+                    image_streams = run._element.findall('.//a:blip', namespaces={
+                        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
+                    for image_stream in image_streams:
+                        embed_id = image_stream.get(
+                            '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                        if embed_id:
+                            image_data = self.file.part.related_parts[embed_id].blob
 
-                # Сохранение изображения и подписи в MongoDB
-                save_image_to_db(check_id, image_data, caption)
-                images_with_captions.append({"image_data": image_data, "caption": caption})
-
-        return images_with_captions
+                # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
+                if image_found:
+                    # Переход к следующему параграфу
+                    next_paragraph_index = i + 1
+                    while next_paragraph_index < len(self.file.paragraphs):
+                        next_paragraph_text = self.file.paragraphs[next_paragraph_index].text.strip()
+                        if next_paragraph_text:  # Находим непустой параграф
+                            # Сохраняем изображение и его подпись
+                            save_image_to_db(check_id, image_data, next_paragraph_text)
+                            break
+                        next_paragraph_index += 1
+                    image_found = False  # Сброс флага, чтобы искать следующее изображение
+                    image_data = None  # Очистка данных изображения
 
 
 def main(args):

From 5ecde02ca5b19355a9065b750b8c7253c4fb9de1 Mon Sep 17 00:00:00 2001
From: ardnaxelas <aardnaxelass@gmail.com>
Date: Mon, 30 Sep 2024 11:42:50 +0300
Subject: [PATCH 03/20] v2: handle docx images without a caption

---
 app/db/db_methods.py                          |  1 +
 .../presentations/pptx/presentation_pptx.py   |  6 ++--
 .../reports/docx_uploader/docx_uploader.py    | 34 ++++++++++++++-----
 3 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index e63f1760..841085b4 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -36,6 +36,7 @@ def save_image_to_db(check_id, image_data, caption):
         'caption': caption
     })
     images_collection.insert_one(image.pack())
+    print(str(check_id) + " " + str(caption))
 
 
 # Returns user if user was created and None if already exists
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
index 34f2081a..a8b8581f 100644
--- a/app/main/presentations/pptx/presentation_pptx.py
+++ b/app/main/presentations/pptx/presentation_pptx.py
@@ -30,16 +30,15 @@ def extract_images_with_captions(self, check_id):
             image_data = None
             caption_text = None
 
-            # Проход по всем шейпам на слайде
+            # Проход по всем фигурам на слайде
             for shape in slide.slide.shapes:  # Используем slide.slide для доступа к текущему слайду
-                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:  # Тип 13 соответствует PICTURE
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     image_found = True
                     image_part = shape.image  # Получаем объект изображения
 
                     # Извлекаем бинарные данные изображения
                     image_stream = image_part.blob
                     image_data = BytesIO(image_stream)
-                    print(f"Изображение найдено на слайде {slide.index}")
 
                 # Если мы нашли изображение, ищем следующий непустой текст как подпись
                 if image_found:
@@ -51,7 +50,6 @@ def extract_images_with_captions(self, check_id):
                             caption_text = text
                             # Сохраняем изображение и его подпись
                             save_image_to_db(check_id, image_data.getvalue(), caption_text)
-                            print(f"Подпись найдена: '{caption_text}' на слайде {slide.index}")
                             break  # Предполагаем, что это подпись к текущему изображению
 
                     # Сброс флага и данных изображения для следующего цикла
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 7c462207..be65067d 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -252,8 +252,7 @@ def extract_images_with_captions(self, check_id):
         for i, paragraph in enumerate(self.file.paragraphs):
             # Проверяем, есть ли в параграфе встроенные объекты
             for run in paragraph.runs:
-                if "graphic" in run._element.xml:  # Это может быть изображение
-                    image_found = True
+                if "graphic" in run._element.xml:  # может быть изображение
 
                     # Извлечение бинарных данных изображения
                     image_streams = run._element.findall('.//a:blip', namespaces={
@@ -262,19 +261,36 @@ def extract_images_with_captions(self, check_id):
                         embed_id = image_stream.get(
                             '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
                         if embed_id:
+                            image_found = True
                             image_data = self.file.part.related_parts[embed_id].blob
 
                 # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
                 if image_found:
                     # Переход к следующему параграфу
                     next_paragraph_index = i + 1
-                    while next_paragraph_index < len(self.file.paragraphs):
-                        next_paragraph_text = self.file.paragraphs[next_paragraph_index].text.strip()
-                        if next_paragraph_text:  # Находим непустой параграф
-                            # Сохраняем изображение и его подпись
-                            save_image_to_db(check_id, image_data, next_paragraph_text)
-                            break
-                        next_paragraph_index += 1
+
+                    # Проверяем, есть ли следующий параграф
+                    if next_paragraph_index < len(self.file.paragraphs):
+                        while next_paragraph_index < len(self.file.paragraphs):
+                            next_paragraph = self.file.paragraphs[next_paragraph_index]
+                            next_paragraph_text = next_paragraph.text.strip()
+
+                            # Проверка, не содержит ли следующий параграф также изображение
+                            contains_image = any(
+                                "graphic" in run._element.xml for run in next_paragraph.runs
+                            )
+
+                            # Если параграф не содержит изображения и текст не пуст, то это подпись
+                            if not contains_image and next_paragraph_text:
+                                # Сохраняем изображение и его подпись
+                                save_image_to_db(check_id, image_data, next_paragraph_text)
+                                break
+                            else:
+                                save_image_to_db(check_id, image_data, "picture without caption")
+                                break
+                    else:
+                        save_image_to_db(check_id, image_data, "picture without caption")
+
                     image_found = False  # Сброс флага, чтобы искать следующее изображение
                     image_data = None  # Очистка данных изображения
 

From 52d1afe6c4a00b6e487972b0f0461db19c25abdf Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Fri, 7 Feb 2025 02:03:20 +0300
Subject: [PATCH 04/20] prototype: image readability check

---
 app/db/db_methods.py                          |  5 +-
 app/db/db_types.py                            |  2 +
 app/main/check_packs/pack_config.py           |  1 +
 app/main/checks/report_checks/__init__.py     |  1 +
 .../report_checks/image_readability_check.py  | 60 +++++++++++++++++++
 .../reports/docx_uploader/docx_uploader.py    | 20 +++++--
 6 files changed, 81 insertions(+), 8 deletions(-)
 create mode 100644 app/main/checks/report_checks/image_readability_check.py

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 841085b4..6335a181 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -27,13 +27,14 @@
 def get_client():
     return client
 
-def save_image_to_db(check_id, image_data, caption):
+def save_image_to_db(check_id, image_data, caption, image_size):
     from app.db.db_types import Image
 
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
-        'caption': caption
+        'caption': caption,
+        'image_size': image_size
     })
     images_collection.insert_one(image.pack())
     print(str(check_id) + " " + str(caption))
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 53dd04b1..3a660fec 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -153,10 +153,12 @@ def __init__(self, dictionary=None):
         self.check_id = dictionary.get('check_id')  # Привязка к check_id
         self.caption = dictionary.get('caption', '')  # Подпись к изображению
         self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
+        self.image_size = dictionary.get('image_size')  # Размер изображения в сантимерах
 
     def pack(self):
         package = super().pack()
         package['check_id'] = str(self.check_id)
         package['caption'] = self.caption
         package['image_data'] = self.image_data
+        package['image_size'] = self.image_size
         return package
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index c053ce0a..43976214 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -46,6 +46,7 @@
     ["theme_in_report_check"],
     ['key_words_report_check'],
     ["empty_task_page_check"],
+    ['image_readability_check'],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 50972ce3..ff3b5c6c 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -26,3 +26,4 @@
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
 from .empty_task_page_check import EmptyTaskPageCheck
+from .image_readability_check import image_readability_check
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_readability_check.py b/app/main/checks/report_checks/image_readability_check.py
new file mode 100644
index 00000000..023ca120
--- /dev/null
+++ b/app/main/checks/report_checks/image_readability_check.py
@@ -0,0 +1,60 @@
+from ..base_check import BaseReportCriterion, answer
+import cv2
+import numpy as np
+from io import BytesIO
+
+class ReportTaskTracker(BaseReportCriterion):
+    label = "Проверка читаемости изображений"
+    description = ''
+    id = 'image_readability_check'
+
+    def __init__(self, file_info, images, min_laplacian = 100, min_entropy = 5, max_density=10):
+        super().__init__(file_info)
+        self.images = images # корректно извлечь данные об изображениях
+        self.min_laplacian = min_laplacian
+        self.min_entropy = min_entropy
+        self.max_density = max_density
+        self.laplacian_score = None
+        self.entropy_score = None
+
+    def late_init(self):
+        self.chapters = self.file.make_chapters(self.file_type['report_type'])
+
+    def check(self):
+        self.late_init()
+        for image in self.images:
+            image_array = np.frombuffer(image.image_data, dtype=np.uint8)
+            img = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+            laplacian, entropy = self.find_params(img)
+            width, height = image.image_size
+        # проанализровать текст на изображениях
+        # дописать сравнение с результатами 
+        if False:
+            return answer(False, f'Изображения нечитаемы! {self.deny_list}! Обнаруженные слова: {word_in_docs}.')
+        else:
+            return answer(True, 'Изображения корректны!')
+
+    def find_params(self, image):
+        if image is None or image.size == 0:
+            return None, None
+        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        
+        laplacian = cv2.Laplacian(gray_image, cv2.CV_64F).var()
+        
+        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
+        hist = hist / hist.sum()
+        entropy = -np.sum(hist * np.log2(hist + 1e-10))
+        return laplacian, entropy
+        
+    # def analyze_with_pytesseract(image):
+    #     try:
+    #         if image is None:
+    #             raise ValueError("Не удалось загрузить изображение")
+    #         text = pytesseract.image_to_string(image, lang='rus+eng')
+    #     except Exception as e:
+    #         print(f'Ошибка при обработке изображения: {e}')
+    #         text = ""
+
+    #     return {
+    #         'text': text
+    #     }
\ No newline at end of file
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index be65067d..4a0a280a 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -244,7 +244,8 @@ def show_chapters(self, work_type):
 
     def extract_images_with_captions(self, check_id):
         from app.db.db_methods import save_image_to_db
-
+        
+        emu_to_cm  = 360000
         image_found = False
         image_data = None
 
@@ -262,8 +263,15 @@ def extract_images_with_captions(self, check_id):
                             '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
                         if embed_id:
                             image_found = True
-                            image_data = self.file.part.related_parts[embed_id].blob
-
+                            image_part = self.file.part.related_parts[embed_id]
+                            image_data = image_part.blob
+                            extent = run._element.find('.//wp:extent', namespaces={
+                            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+                            if extent is not None:
+                                width_emu = int(extent.get('cx'))
+                                height_emu = int(extent.get('cy'))
+                                width_cm = width_emu / emu_to_cm
+                                height_cm = height_emu / emu_to_cm
                 # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
                 if image_found:
                     # Переход к следующему параграфу
@@ -283,13 +291,13 @@ def extract_images_with_captions(self, check_id):
                             # Если параграф не содержит изображения и текст не пуст, то это подпись
                             if not contains_image and next_paragraph_text:
                                 # Сохраняем изображение и его подпись
-                                save_image_to_db(check_id, image_data, next_paragraph_text)
+                                save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
                                 break
                             else:
-                                save_image_to_db(check_id, image_data, "picture without caption")
+                                save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
                                 break
                     else:
-                        save_image_to_db(check_id, image_data, "picture without caption")
+                        save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
 
                     image_found = False  # Сброс флага, чтобы искать следующее изображение
                     image_data = None  # Очистка данных изображения

From e783ed9b2f52403884cc5a484502eaf0c2515ec0 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 6 Mar 2025 23:16:11 +0300
Subject: [PATCH 05/20] fix image_quality_check

---
 app/db/db_methods.py                          |  14 ++-
 app/main/check_packs/pack_config.py           |   2 +-
 app/main/checks/report_checks/__init__.py     |   2 +-
 .../report_checks/image_quality_check.py      |  52 +++++++++
 .../report_checks/image_readability_check.py  |  60 -----------
 .../reports/docx_uploader/docx_uploader.py    | 101 +++++++++---------
 requirements.txt                              |   1 +
 7 files changed, 118 insertions(+), 114 deletions(-)
 create mode 100644 app/main/checks/report_checks/image_quality_check.py
 delete mode 100644 app/main/checks/report_checks/image_readability_check.py

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 6335a181..6cddc794 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -7,7 +7,7 @@
 from pymongo import MongoClient
 from utils import convert_to
 
-from .db_types import User, Presentation, Check, Consumers, Logs
+from .db_types import User, Presentation, Check, Consumers, Logs, Image
 
 client = MongoClient("mongodb://mongodb:27017")
 db = client['pres-parser-db']
@@ -27,9 +27,17 @@
 def get_client():
     return client
 
-def save_image_to_db(check_id, image_data, caption, image_size):
-    from app.db.db_types import Image
+def get_images(check_id):
+    images = images_collection.find({'check_id': str(check_id)})
+    if images is not None:
+        image_list = []
+        for img in images:
+            image_list.append(Image(img))
+        return image_list
+    else:
+        return None
 
+def save_image_to_db(check_id, image_data, caption, image_size):
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 43976214..17807dc1 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -46,7 +46,7 @@
     ["theme_in_report_check"],
     ['key_words_report_check'],
     ["empty_task_page_check"],
-    ['image_readability_check'],
+    ['image_quality_check'],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index ff3b5c6c..c85430ae 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -26,4 +26,4 @@
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
 from .empty_task_page_check import EmptyTaskPageCheck
-from .image_readability_check import image_readability_check
\ No newline at end of file
+from .image_quality_check import ImageQualityCheck
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
new file mode 100644
index 00000000..6590b6d1
--- /dev/null
+++ b/app/main/checks/report_checks/image_quality_check.py
@@ -0,0 +1,52 @@
+from ..base_check import BaseReportCriterion, answer
+import cv2
+import numpy as np
+
+class ImageQualityCheck(BaseReportCriterion):
+    label = "Проверка качества изображений"
+    description = ''
+    id = 'image_quality_check'
+
+    def __init__(self, file_info, min_laplacian=100, min_entropy=5):
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.min_laplacian = min_laplacian
+        self.min_entropy = min_entropy
+        self.laplacian_score = None
+        self.entropy_score = None
+
+    def check(self):
+        deny_list = []
+        for img in self.images:
+            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
+            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+            
+            if img_cv is None:
+                deny_list.append(f"Изображение с подписью {img.caption} не может быть обработано.")
+                continue
+            
+            self.find_params(img_cv)
+            
+            if self.laplacian_score is None or self.entropy_score is None:
+                deny_list.append(f"Изображение с подписью {img.caption} не может быть обработано.")
+                continue
+            
+            if self.laplacian_score < self.min_laplacian:
+                deny_list.append(f"Изображение с подписью {img.caption} имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).")
+            
+            if self.entropy_score < self.min_entropy:
+                deny_list.append(f"Изображение с подписью {img.caption} имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).")
+        
+        if deny_list:
+            return answer(False, f'Изображения нечитаемы! {deny_list}')
+        else:
+            return answer(True, 'Изображения корректны!')
+
+    def find_params(self, image):
+        if image is None or image.size == 0:
+            return None, None
+        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
+        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
+        hist = hist / hist.sum()
+        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_readability_check.py b/app/main/checks/report_checks/image_readability_check.py
deleted file mode 100644
index 023ca120..00000000
--- a/app/main/checks/report_checks/image_readability_check.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from ..base_check import BaseReportCriterion, answer
-import cv2
-import numpy as np
-from io import BytesIO
-
-class ReportTaskTracker(BaseReportCriterion):
-    label = "Проверка читаемости изображений"
-    description = ''
-    id = 'image_readability_check'
-
-    def __init__(self, file_info, images, min_laplacian = 100, min_entropy = 5, max_density=10):
-        super().__init__(file_info)
-        self.images = images # корректно извлечь данные об изображениях
-        self.min_laplacian = min_laplacian
-        self.min_entropy = min_entropy
-        self.max_density = max_density
-        self.laplacian_score = None
-        self.entropy_score = None
-
-    def late_init(self):
-        self.chapters = self.file.make_chapters(self.file_type['report_type'])
-
-    def check(self):
-        self.late_init()
-        for image in self.images:
-            image_array = np.frombuffer(image.image_data, dtype=np.uint8)
-            img = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
-            laplacian, entropy = self.find_params(img)
-            width, height = image.image_size
-        # проанализровать текст на изображениях
-        # дописать сравнение с результатами 
-        if False:
-            return answer(False, f'Изображения нечитаемы! {self.deny_list}! Обнаруженные слова: {word_in_docs}.')
-        else:
-            return answer(True, 'Изображения корректны!')
-
-    def find_params(self, image):
-        if image is None or image.size == 0:
-            return None, None
-        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        
-        laplacian = cv2.Laplacian(gray_image, cv2.CV_64F).var()
-        
-        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
-        hist = hist / hist.sum()
-        entropy = -np.sum(hist * np.log2(hist + 1e-10))
-        return laplacian, entropy
-        
-    # def analyze_with_pytesseract(image):
-    #     try:
-    #         if image is None:
-    #             raise ValueError("Не удалось загрузить изображение")
-    #         text = pytesseract.image_to_string(image, lang='rus+eng')
-    #     except Exception as e:
-    #         print(f'Ошибка при обработке изображения: {e}')
-    #         text = ""
-
-    #     return {
-    #         'text': text
-    #     }
\ No newline at end of file
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 4a0a280a..6c1ba12c 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -243,64 +243,67 @@ def show_chapters(self, work_type):
         return chapters_str
 
     def extract_images_with_captions(self, check_id):
-        from app.db.db_methods import save_image_to_db
+        from app.db.db_methods import save_image_to_db, get_images
         
         emu_to_cm  = 360000
         image_found = False
         image_data = None
+        if not self.images:
+            # Проход по всем параграфам документа
+            for i, paragraph in enumerate(self.file.paragraphs):
+                # Проверяем, есть ли в параграфе встроенные объекты
+                for run in paragraph.runs:
+                    if "graphic" in run._element.xml:  # может быть изображение
 
-        # Проход по всем параграфам документа
-        for i, paragraph in enumerate(self.file.paragraphs):
-            # Проверяем, есть ли в параграфе встроенные объекты
-            for run in paragraph.runs:
-                if "graphic" in run._element.xml:  # может быть изображение
+                        # Извлечение бинарных данных изображения
+                        image_streams = run._element.findall('.//a:blip', namespaces={
+                            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
+                        for image_stream in image_streams:
+                            embed_id = image_stream.get(
+                                '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                            if embed_id:
+                                image_found = True
+                                image_part = self.file.part.related_parts[embed_id]
+                                image_data = image_part.blob
+                                extent = run._element.find('.//wp:extent', namespaces={
+                                'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+                                if extent is not None:
+                                    width_emu = int(extent.get('cx'))
+                                    height_emu = int(extent.get('cy'))
+                                    width_cm = width_emu / emu_to_cm
+                                    height_cm = height_emu / emu_to_cm
+                    # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
+                    if image_found:
+                        # Переход к следующему параграфу
+                        next_paragraph_index = i + 1
 
-                    # Извлечение бинарных данных изображения
-                    image_streams = run._element.findall('.//a:blip', namespaces={
-                        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
-                    for image_stream in image_streams:
-                        embed_id = image_stream.get(
-                            '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
-                        if embed_id:
-                            image_found = True
-                            image_part = self.file.part.related_parts[embed_id]
-                            image_data = image_part.blob
-                            extent = run._element.find('.//wp:extent', namespaces={
-                            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
-                            if extent is not None:
-                                width_emu = int(extent.get('cx'))
-                                height_emu = int(extent.get('cy'))
-                                width_cm = width_emu / emu_to_cm
-                                height_cm = height_emu / emu_to_cm
-                # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
-                if image_found:
-                    # Переход к следующему параграфу
-                    next_paragraph_index = i + 1
+                        # Проверяем, есть ли следующий параграф
+                        if next_paragraph_index < len(self.file.paragraphs):
+                            while next_paragraph_index < len(self.file.paragraphs):
+                                next_paragraph = self.file.paragraphs[next_paragraph_index]
+                                next_paragraph_text = next_paragraph.text.strip()
 
-                    # Проверяем, есть ли следующий параграф
-                    if next_paragraph_index < len(self.file.paragraphs):
-                        while next_paragraph_index < len(self.file.paragraphs):
-                            next_paragraph = self.file.paragraphs[next_paragraph_index]
-                            next_paragraph_text = next_paragraph.text.strip()
+                                # Проверка, не содержит ли следующий параграф также изображение
+                                contains_image = any(
+                                    "graphic" in run._element.xml for run in next_paragraph.runs
+                                )
 
-                            # Проверка, не содержит ли следующий параграф также изображение
-                            contains_image = any(
-                                "graphic" in run._element.xml for run in next_paragraph.runs
-                            )
+                                # Если параграф не содержит изображения и текст не пуст, то это подпись
+                                if not contains_image and next_paragraph_text:
+                                    # Сохраняем изображение и его подпись
+                                    save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
+                                    break
+                                else:
+                                    save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
+                                    break
+                        else:
+                            save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
 
-                            # Если параграф не содержит изображения и текст не пуст, то это подпись
-                            if not contains_image and next_paragraph_text:
-                                # Сохраняем изображение и его подпись
-                                save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
-                                break
-                            else:
-                                save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-                                break
-                    else:
-                        save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-
-                    image_found = False  # Сброс флага, чтобы искать следующее изображение
-                    image_data = None  # Очистка данных изображения
+                        image_found = False  # Сброс флага, чтобы искать следующее изображение
+                        image_data = None  # Очистка данных изображения
+            self.images = get_images(check_id)
+                        
+                
 
 
 def main(args):
diff --git a/requirements.txt b/requirements.txt
index 082b7069..ab64f26c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,3 +35,4 @@ filetype==1.2.0
 language-tool-python==2.7.1
 markdown==3.4.4
 md2pdf==1.0.1
+opencv-python==4.5.5.64
\ No newline at end of file

From 5cc96ec37daea0dfa0af6d0df2608d83265d0bab Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Fri, 7 Mar 2025 00:19:36 +0300
Subject: [PATCH 06/20] v1 image_quality_check

---
 .../report_checks/image_quality_check.py      | 48 ++++++++++---------
 app/main/reports/document_uploader.py         |  1 +
 .../reports/docx_uploader/docx_uploader.py    |  2 +
 3 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
index 6590b6d1..68eca342 100644
--- a/app/main/checks/report_checks/image_quality_check.py
+++ b/app/main/checks/report_checks/image_quality_check.py
@@ -6,8 +6,8 @@ class ImageQualityCheck(BaseReportCriterion):
     label = "Проверка качества изображений"
     description = ''
     id = 'image_quality_check'
-
-    def __init__(self, file_info, min_laplacian=100, min_entropy=5):
+    # необходимо подобрать min_laplacian и min_entropy
+    def __init__(self, file_info, min_laplacian=10, min_entropy=1):
         super().__init__(file_info)
         self.images = self.file.images
         self.min_laplacian = min_laplacian
@@ -17,28 +17,30 @@ def __init__(self, file_info, min_laplacian=100, min_entropy=5):
 
     def check(self):
         deny_list = []
-        for img in self.images:
-            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
-            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
-            
-            if img_cv is None:
-                deny_list.append(f"Изображение с подписью {img.caption} не может быть обработано.")
-                continue
-            
-            self.find_params(img_cv)
-            
-            if self.laplacian_score is None or self.entropy_score is None:
-                deny_list.append(f"Изображение с подписью {img.caption} не может быть обработано.")
-                continue
-            
-            if self.laplacian_score < self.min_laplacian:
-                deny_list.append(f"Изображение с подписью {img.caption} имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).")
-            
-            if self.entropy_score < self.min_entropy:
-                deny_list.append(f"Изображение с подписью {img.caption} имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).")
-        
+        if self.images:
+            for img in self.images:
+                image_array = np.frombuffer(img.image_data, dtype=np.uint8)
+                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+                
+                if img_cv is None:
+                    deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
+                    continue
+                
+                self.find_params(img_cv)
+                
+                if self.laplacian_score is None or self.entropy_score is None:
+                    deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
+                    continue
+                
+                if self.laplacian_score < self.min_laplacian:
+                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).<br>")
+                
+                if self.entropy_score < self.min_entropy:
+                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).<br>")
+        else: 
+            return answer(False, 'Изображения не найдены!')
         if deny_list:
-            return answer(False, f'Изображения нечитаемы! {deny_list}')
+            return answer(False, f'Изображения нечитаемы! <br>{"".join(deny_list)}')
         else:
             return answer(True, 'Изображения корректны!')
 
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
index d0653fae..8a6a7303 100644
--- a/app/main/reports/document_uploader.py
+++ b/app/main/reports/document_uploader.py
@@ -12,6 +12,7 @@ def __init__(self):
         self.literature_page = 0
         self.first_lines = []
         self.page_count = 0
+        self.images = []
 
     @abstractmethod
     def upload(self):
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 6c1ba12c..421dfbe2 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -251,6 +251,8 @@ def extract_images_with_captions(self, check_id):
         if not self.images:
             # Проход по всем параграфам документа
             for i, paragraph in enumerate(self.file.paragraphs):
+                width_emu = None
+                height_emu = None
                 # Проверяем, есть ли в параграфе встроенные объекты
                 for run in paragraph.runs:
                     if "graphic" in run._element.xml:  # может быть изображение

From c15f5abc923a425b167b393f82ea1fd8cd7a9464 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Fri, 21 Mar 2025 01:14:34 +0300
Subject: [PATCH 07/20] tesseract prototype

---
 Dockerfile_base                               |  7 ++-
 app/db/db_methods.py                          | 18 +++++--
 app/db/db_types.py                            |  2 +
 app/main/check_packs/pack_config.py           |  1 -
 app/main/checks/report_checks/__init__.py     |  3 +-
 .../report_checks/image_quality_check.py      | 54 -------------------
 .../reports/docx_uploader/docx_uploader.py    | 10 ++--
 app/tesseract_tasks.py                        | 49 +++++++++++++++++
 docker-compose.yml                            | 17 ++++++
 requirements.txt                              |  3 +-
 10 files changed, 98 insertions(+), 66 deletions(-)
 delete mode 100644 app/main/checks/report_checks/image_quality_check.py
 create mode 100644 app/tesseract_tasks.py

diff --git a/Dockerfile_base b/Dockerfile_base
index c3f30538..b0635e7c 100644
--- a/Dockerfile_base
+++ b/Dockerfile_base
@@ -8,7 +8,12 @@ ENV TZ=Europe/Moscow
 
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
-RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
+RUN apt-get update &&  apt-get install -y \
+    libreoffice-writer \
+    libreoffice-impress \
+    default-jre \
+    tesseract-ocr \
+    tesseract-ocr-rus
 
 ADD requirements.txt .
 RUN python3 -m pip install -r requirements.txt --no-cache-dir
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 6cddc794..40f8ef3d 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -37,16 +37,26 @@ def get_images(check_id):
     else:
         return None
 
-def save_image_to_db(check_id, image_data, caption, image_size):
+def save_image_to_db(check_id, image_data, caption, image_size, text=''):
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
         'caption': caption,
-        'image_size': image_size
+        'image_size': image_size,
+        'text' : text
     })
-    images_collection.insert_one(image.pack())
-    print(str(check_id) + " " + str(caption))
+    result = images_collection.insert_one(image.pack())
+    return result.inserted_id 
 
+def update_image_text(image_id, new_text):
+    try:
+        result = images_collection.update_one(
+            {'_id': image_id},
+            {'$set': {'text': new_text}}
+        )
+        return result.matched_count > 0
+    except Exception:
+        return False
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 3a660fec..3ece6f68 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -154,6 +154,7 @@ def __init__(self, dictionary=None):
         self.caption = dictionary.get('caption', '')  # Подпись к изображению
         self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
         self.image_size = dictionary.get('image_size')  # Размер изображения в сантимерах
+        self.text = dictionary.get('text')
 
     def pack(self):
         package = super().pack()
@@ -161,4 +162,5 @@ def pack(self):
         package['caption'] = self.caption
         package['image_data'] = self.image_data
         package['image_size'] = self.image_size
+        package['text'] = self.text
         return package
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 17807dc1..c053ce0a 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -46,7 +46,6 @@
     ["theme_in_report_check"],
     ['key_words_report_check'],
     ["empty_task_page_check"],
-    ['image_quality_check'],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index c85430ae..3bf4e228 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -25,5 +25,4 @@
 from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
-from .empty_task_page_check import EmptyTaskPageCheck
-from .image_quality_check import ImageQualityCheck
\ No newline at end of file
+from .empty_task_page_check import EmptyTaskPageCheck
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
deleted file mode 100644
index 68eca342..00000000
--- a/app/main/checks/report_checks/image_quality_check.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from ..base_check import BaseReportCriterion, answer
-import cv2
-import numpy as np
-
-class ImageQualityCheck(BaseReportCriterion):
-    label = "Проверка качества изображений"
-    description = ''
-    id = 'image_quality_check'
-    # необходимо подобрать min_laplacian и min_entropy
-    def __init__(self, file_info, min_laplacian=10, min_entropy=1):
-        super().__init__(file_info)
-        self.images = self.file.images
-        self.min_laplacian = min_laplacian
-        self.min_entropy = min_entropy
-        self.laplacian_score = None
-        self.entropy_score = None
-
-    def check(self):
-        deny_list = []
-        if self.images:
-            for img in self.images:
-                image_array = np.frombuffer(img.image_data, dtype=np.uint8)
-                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
-                
-                if img_cv is None:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
-                    continue
-                
-                self.find_params(img_cv)
-                
-                if self.laplacian_score is None or self.entropy_score is None:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
-                    continue
-                
-                if self.laplacian_score < self.min_laplacian:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).<br>")
-                
-                if self.entropy_score < self.min_entropy:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).<br>")
-        else: 
-            return answer(False, 'Изображения не найдены!')
-        if deny_list:
-            return answer(False, f'Изображения нечитаемы! <br>{"".join(deny_list)}')
-        else:
-            return answer(True, 'Изображения корректны!')
-
-    def find_params(self, image):
-        if image is None or image.size == 0:
-            return None, None
-        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
-        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
-        hist = hist / hist.sum()
-        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
\ No newline at end of file
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 421dfbe2..2f963bf0 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -244,6 +244,7 @@ def show_chapters(self, work_type):
 
     def extract_images_with_captions(self, check_id):
         from app.db.db_methods import save_image_to_db, get_images
+        from app.tesseract_tasks import tesseract_recognize
         
         emu_to_cm  = 360000
         image_found = False
@@ -293,13 +294,16 @@ def extract_images_with_captions(self, check_id):
                                 # Если параграф не содержит изображения и текст не пуст, то это подпись
                                 if not contains_image and next_paragraph_text:
                                     # Сохраняем изображение и его подпись
-                                    save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
+                                    image_id = save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
+                                    tesseract_recognize.delay(image_id, image_data)
                                     break
                                 else:
-                                    save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
+                                    image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
+                                    tesseract_recognize.delay(image_id, image_data)
                                     break
                         else:
-                            save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
+                            image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
+                            tesseract_recognize.delay(image_id, image_data)
 
                         image_found = False  # Сброс флага, чтобы искать следующее изображение
                         image_data = None  # Очистка данных изображения
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
new file mode 100644
index 00000000..848faf2f
--- /dev/null
+++ b/app/tesseract_tasks.py
@@ -0,0 +1,49 @@
+import os
+from celery import Celery
+from celery.signals import worker_ready
+import pytesseract
+import cv2
+import numpy as np
+from db import db_methods
+from root_logger import get_root_logger
+
+TASK_RETRY_COUNTDOWN = 60
+logger = get_root_logger('tesseract_tasks')
+
+celery = Celery(__name__)
+celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://redis:6379")
+celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379")
+
+celery.conf.timezone = 'Europe/Moscow'
+
+TESSERACT_CONFIG = {
+    'lang': 'rus',
+    'config': '--psm 6',
+}
+
+@worker_ready.connect
+def at_start(sender, **k):
+    logger.info("Tesseract worker is ready!")
+
+
+@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True)
+def tesseract_recognize(self, image_id, image_data):
+    try:
+        image_array = np.frombuffer(image_data, dtype=np.uint8)
+        img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+        if img_cv is None:
+            raise ValueError("Не удалось декодировать изображение из двоичных данных")
+        text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
+        success = db_methods.update_image_text(image_id, text)
+        if not success:
+            logger.error(f"Не удалось записать текст для image_id: {image_id}")
+            raise Exception("Ошибка при обновлении текста изображения в базе данных")
+        logger.info(f"Текст успешно распознан и записан для image_id: {image_id}")
+        return text
+
+    except Exception as e:
+        logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
+        if self.request.retries == self.max_retries:
+            logger.error(f"Достигнуто максимальное количество попыток для image_id: {image_id}")
+            return f"Ошибка: {e}"
+        self.retry(countdown=TASK_RETRY_COUNTDOWN)
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index a2518331..27b5042d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -74,6 +74,23 @@ services:
     volumes:
       - ../slides_checker_mongo_data:/data/db
     cpuset: ${CONTAINER_CPU:-0-1}
+    
+  tesseract_worker:
+    image: document_insight_system_image
+    restart: always
+    command: celery --app=app.tasks.celery worker -n tesseract@worker -Q tesseract-queue --loglevel=info
+    environment:
+      - CELERY_BROKER_URL=${REDIS_URL}
+      - CELERY_RESULT_BACKEND=${REDIS_URL}
+    depends_on:
+      - redis
+      - mongodb
+    volumes:
+      - presentation_files:/usr/src/project/files/
+      - "/etc/timezone:/etc/timezone:ro"
+      - "/etc/localtime:/etc/localtime:ro"
+    cpuset: ${CONTAINER_CPU:-0-1}
+    mem_limit: ${WORKER_MEMORY:-1G}
 
 volumes:
   flower_data:
diff --git a/requirements.txt b/requirements.txt
index ab64f26c..69ab0d78 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,5 @@ filetype==1.2.0
 language-tool-python==2.7.1
 markdown==3.4.4
 md2pdf==1.0.1
-opencv-python==4.5.5.64
\ No newline at end of file
+opencv-python==4.5.5.64
+pytesseract==0.3.10
\ No newline at end of file

From f645a68f78468709643cbc490ce56e8428906539 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Fri, 21 Mar 2025 02:17:17 +0300
Subject: [PATCH 08/20] TODO: Implement Tesseract-based text check

---
 app/db/db_methods.py                            | 1 +
 app/main/reports/docx_uploader/docx_uploader.py | 6 +++---
 docker-compose.yml                              | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 40f8ef3d..e01efd22 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -50,6 +50,7 @@ def save_image_to_db(check_id, image_data, caption, image_size, text=''):
 
 def update_image_text(image_id, new_text):
     try:
+        image_id = ObjectId(image_id)
         result = images_collection.update_one(
             {'_id': image_id},
             {'$set': {'text': new_text}}
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 2f963bf0..6968dbdc 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -295,15 +295,15 @@ def extract_images_with_captions(self, check_id):
                                 if not contains_image and next_paragraph_text:
                                     # Сохраняем изображение и его подпись
                                     image_id = save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
-                                    tesseract_recognize.delay(image_id, image_data)
+                                    tesseract_recognize.delay(str(image_id), image_data)
                                     break
                                 else:
                                     image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-                                    tesseract_recognize.delay(image_id, image_data)
+                                    tesseract_recognize.delay(str(image_id), image_data)
                                     break
                         else:
                             image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-                            tesseract_recognize.delay(image_id, image_data)
+                            tesseract_recognize.delay(str(image_id), image_data)
 
                         image_found = False  # Сброс флага, чтобы искать следующее изображение
                         image_data = None  # Очистка данных изображения
diff --git a/docker-compose.yml b/docker-compose.yml
index 27b5042d..d95e604e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -78,7 +78,7 @@ services:
   tesseract_worker:
     image: document_insight_system_image
     restart: always
-    command: celery --app=app.tasks.celery worker -n tesseract@worker -Q tesseract-queue --loglevel=info
+    command: celery --app=app.tesseract_tasks.celery worker -n tesseract@worker -Q tesseract-queue --loglevel=info
     environment:
       - CELERY_BROKER_URL=${REDIS_URL}
       - CELERY_RESULT_BACKEND=${REDIS_URL}

From 40cfc2d4a2a3cbcf0af2597c8dae83c223ef288e Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Wed, 2 Apr 2025 23:33:26 +0300
Subject: [PATCH 09/20] tesseract check v1

---
 app/db/db_methods.py                          | 38 ++++++---
 app/db/db_types.py                            |  2 +
 app/main/check_packs/pack_config.py           |  1 +
 app/main/checks/report_checks/__init__.py     |  3 +-
 .../checks/report_checks/image_text_check.py  | 77 +++++++++++++++++++
 .../reports/docx_uploader/docx_uploader.py    | 64 +++++----------
 app/tesseract_tasks.py                        | 21 ++---
 7 files changed, 140 insertions(+), 66 deletions(-)
 create mode 100644 app/main/checks/report_checks/image_text_check.py

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index e01efd22..09609bbc 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -37,27 +37,41 @@ def get_images(check_id):
     else:
         return None
 
-def save_image_to_db(check_id, image_data, caption, image_size, text=''):
+def save_image_to_db(check_id, image_data, caption, image_size, text=None, tesseract_task_id=None):
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
         'caption': caption,
         'image_size': image_size,
-        'text' : text
+        'text' : text,
+        'tesseract_task_id': tesseract_task_id
     })
     result = images_collection.insert_one(image.pack())
     return result.inserted_id 
 
-def update_image_text(image_id, new_text):
-    try:
-        image_id = ObjectId(image_id)
-        result = images_collection.update_one(
-            {'_id': image_id},
-            {'$set': {'text': new_text}}
-        )
-        return result.matched_count > 0
-    except Exception:
-        return False
+def add_image_text(tesseract_task_id, new_text):
+    result = images_collection.update_one(
+        {'tesseract_task_id': tesseract_task_id},
+        {'$set': {'text': new_text}}
+    )
+    return result.matched_count > 0
+    
+def add_tesseract_task_id(image_id, tesseract_task_id):
+    # image_id = ObjectId(image_id)
+    result = images_collection.update_one(
+        {'_id': image_id},
+        {'$set': {'tesseract_task_id': tesseract_task_id}}
+    )
+    return result.matched_count > 0
+
+def get_tesseract_task_id(image_id):
+    # image_id = ObjectId(image_id)
+    image = images_collection.find_one({'_id': image_id})
+    if image:
+        return image.get('tesseract_task_id')
+    else:
+        return None
+
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 3ece6f68..eb528748 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -155,6 +155,7 @@ def __init__(self, dictionary=None):
         self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
         self.image_size = dictionary.get('image_size')  # Размер изображения в сантимерах
         self.text = dictionary.get('text')
+        self.tesseract_task_id = dictionary.get('tesseract_task_id')
 
     def pack(self):
         package = super().pack()
@@ -163,4 +164,5 @@ def pack(self):
         package['image_data'] = self.image_data
         package['image_size'] = self.image_size
         package['text'] = self.text
+        package['tesseract_task_id'] = self.tesseract_task_id
         return package
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index c053ce0a..bb51c992 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -46,6 +46,7 @@
     ["theme_in_report_check"],
     ['key_words_report_check'],
     ["empty_task_page_check"],
+    ["image_text_check"],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 3bf4e228..3af7c4d2 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -25,4 +25,5 @@
 from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
-from .empty_task_page_check import EmptyTaskPageCheck
\ No newline at end of file
+from .empty_task_page_check import EmptyTaskPageCheck
+from .image_text_check import ImageTextCheck
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
new file mode 100644
index 00000000..69bfb36e
--- /dev/null
+++ b/app/main/checks/report_checks/image_text_check.py
@@ -0,0 +1,77 @@
+import re
+from ..base_check import BaseReportCriterion, answer
+import time
+from celery.result import AsyncResult
+
+class ImageTextCheck(BaseReportCriterion):
+    label = "Проверка текста, считанного с изображений"
+    description = ''
+    id = 'image_text_check'
+    # Подобрать значения для symbols_set, max_symbols_percentage, max_text_density
+    def __init__(self, file_info, symbols_set=['%', '1'], max_symbols_percentage=0, max_text_density=0, max_wait_time=30):
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.symbols_set = symbols_set
+        self.max_symbols_percentage = max_symbols_percentage
+        self.max_text_density = max_text_density
+        self.max_wait_time = max_wait_time
+
+    def check(self):
+        deny_list = []
+        if self.images:
+            for image in self.images:
+                if image.text == '':
+                    continue
+                recognized_text = self.wait_for_text_recognition(image)
+                width, height = image.image_size
+                if not recognized_text:
+                    continue
+                text_density = self.calculate_text_density(recognized_text, width, height)
+                if text_density > self.max_text_density:
+                    deny_list.append(
+                        f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
+                        f"{text_density:.4f} (максимум {self.max_text_density}). Это может означать, что текст нечитаем.<br>"
+                    )
+                symbols_count = self.count_symbols_in_text(recognized_text, self.symbols_set)
+                text_length = len(recognized_text)
+                symbols_percentage = (symbols_count / text_length) * 100
+                if symbols_percentage > self.max_symbols_percentage:
+                    deny_list.append(
+                        f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
+                        f"{symbols_percentage:.2f}% (максимум {self.max_symbols_percentage}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
+                    )
+        else:
+            return answer(False, 'Изображения не найдены!')
+        if deny_list:
+            return answer(False, f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}')
+        else:
+            return answer(True, 'Текст на изображениях корректен!')
+
+    def count_symbols_in_text(self, text, symbols_set):
+        return sum(1 for char in text if char in symbols_set)
+
+    def calculate_text_density(self, text, width, height):
+        text_without_spaces = ''.join(text.split())
+        image_area = width * height
+        if image_area == 0:
+            return 0
+        return len(text_without_spaces) / image_area
+
+    def wait_for_text_recognition(self, image):
+        from app.db.db_methods import add_image_text
+        start_time = time.time()
+        task_id = image.tesseract_task_id
+        if not task_id:
+            return None
+
+        while time.time() - start_time < self.max_wait_time:
+            task_result = AsyncResult(task_id)
+            if task_result.state == 'SUCCESS':
+                recognized_text = task_result.result
+                recognized_text = re.sub(r'\s+', ' ', recognized_text)
+                image.text = recognized_text
+                add_image_text(task_id, recognized_text)
+                return recognized_text.strip()
+            time.sleep(1)
+
+        return None
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 6968dbdc..3354c8a0 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -12,6 +12,7 @@
 from ..document_uploader import DocumentUploader
 
 
+
 class DocxUploader(DocumentUploader):
     def __init__(self):
         super().__init__()
@@ -243,22 +244,16 @@ def show_chapters(self, work_type):
         return chapters_str
 
     def extract_images_with_captions(self, check_id):
-        from app.db.db_methods import save_image_to_db, get_images
+        from app.db.db_methods import save_image_to_db, get_images, add_tesseract_task_id
         from app.tesseract_tasks import tesseract_recognize
         
         emu_to_cm  = 360000
         image_found = False
         image_data = None
         if not self.images:
-            # Проход по всем параграфам документа
             for i, paragraph in enumerate(self.file.paragraphs):
-                width_emu = None
-                height_emu = None
-                # Проверяем, есть ли в параграфе встроенные объекты
                 for run in paragraph.runs:
-                    if "graphic" in run._element.xml:  # может быть изображение
-
-                        # Извлечение бинарных данных изображения
+                    if "graphic" in run._element.xml:
                         image_streams = run._element.findall('.//a:blip', namespaces={
                             'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
                         for image_stream in image_streams:
@@ -270,46 +265,29 @@ def extract_images_with_captions(self, check_id):
                                 image_data = image_part.blob
                                 extent = run._element.find('.//wp:extent', namespaces={
                                 'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+                                width_cm = height_cm = None
                                 if extent is not None:
-                                    width_emu = int(extent.get('cx'))
-                                    height_emu = int(extent.get('cy'))
-                                    width_cm = width_emu / emu_to_cm
-                                    height_cm = height_emu / emu_to_cm
-                    # Если мы уже нашли изображение, ищем следующий непустой параграф для подписи
+                                    width_cm = int(extent.get('cx')) / emu_to_cm
+                                    height_cm = int(extent.get('cy')) / emu_to_cm
                     if image_found:
-                        # Переход к следующему параграфу
+                        caption = "picture without caption"
                         next_paragraph_index = i + 1
-
-                        # Проверяем, есть ли следующий параграф
-                        if next_paragraph_index < len(self.file.paragraphs):
-                            while next_paragraph_index < len(self.file.paragraphs):
-                                next_paragraph = self.file.paragraphs[next_paragraph_index]
-                                next_paragraph_text = next_paragraph.text.strip()
-
-                                # Проверка, не содержит ли следующий параграф также изображение
-                                contains_image = any(
-                                    "graphic" in run._element.xml for run in next_paragraph.runs
-                                )
-
-                                # Если параграф не содержит изображения и текст не пуст, то это подпись
-                                if not contains_image and next_paragraph_text:
-                                    # Сохраняем изображение и его подпись
-                                    image_id = save_image_to_db(check_id, image_data, next_paragraph_text, (width_cm, height_cm))
-                                    tesseract_recognize.delay(str(image_id), image_data)
-                                    break
-                                else:
-                                    image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-                                    tesseract_recognize.delay(str(image_id), image_data)
-                                    break
-                        else:
-                            image_id = save_image_to_db(check_id, image_data, "picture without caption", (width_cm, height_cm))
-                            tesseract_recognize.delay(str(image_id), image_data)
-
-                        image_found = False  # Сброс флага, чтобы искать следующее изображение
-                        image_data = None  # Очистка данных изображения
-            self.images = get_images(check_id)
+                        while next_paragraph_index < len(self.file.paragraphs):
+                            next_paragraph = self.file.paragraphs[next_paragraph_index]
+                            next_text = next_paragraph.text.strip()
+                            if next_text and not any("graphic" in r._element.xml for r in next_paragraph.runs):
+                                caption = next_text
+                                break
+                            next_paragraph_index += 1
                         
+                        image_id = save_image_to_db(check_id, image_data, caption, (width_cm, height_cm))
+                        task = tesseract_recognize.delay(str(image_id), image_data)
+                        add_tesseract_task_id(image_id, task.id)
+                        image_found = False
+                        image_data = None 
                 
+            self.images = get_images(check_id)
+                              
 
 
 def main(args):
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 848faf2f..c878055d 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -4,10 +4,11 @@
 import pytesseract
 import cv2
 import numpy as np
-from db import db_methods
 from root_logger import get_root_logger
 
 TASK_RETRY_COUNTDOWN = 60
+MAX_RETRIES = 2
+
 logger = get_root_logger('tesseract_tasks')
 
 celery = Celery(__name__)
@@ -17,7 +18,7 @@
 celery.conf.timezone = 'Europe/Moscow'
 
 TESSERACT_CONFIG = {
-    'lang': 'rus',
+    'lang': 'rus+eng',
     'config': '--psm 6',
 }
 
@@ -25,8 +26,7 @@
 def at_start(sender, **k):
     logger.info("Tesseract worker is ready!")
 
-
-@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True)
+@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES)
 def tesseract_recognize(self, image_id, image_data):
     try:
         image_array = np.frombuffer(image_data, dtype=np.uint8)
@@ -34,16 +34,17 @@ def tesseract_recognize(self, image_id, image_data):
         if img_cv is None:
             raise ValueError("Не удалось декодировать изображение из двоичных данных")
         text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
-        success = db_methods.update_image_text(image_id, text)
-        if not success:
-            logger.error(f"Не удалось записать текст для image_id: {image_id}")
-            raise Exception("Ошибка при обновлении текста изображения в базе данных")
-        logger.info(f"Текст успешно распознан и записан для image_id: {image_id}")
+        if text is None:
+            logger.warning(f"Tesseract вернул None для image_id: {image_id}.")
+            text = ""
+        logger.info(f"Текст успешно распознан для image_id: {image_id}")
         return text
 
     except Exception as e:
         logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
-        if self.request.retries == self.max_retries:
+        logger.info(f"Пустая строка записана для image_id: {image_id} из-за ошибки: {e}")
+        if self.request.retries >= self.max_retries:
             logger.error(f"Достигнуто максимальное количество попыток для image_id: {image_id}")
             return f"Ошибка: {e}"
+        logger.info(f"Повторная попытка распознавания для image_id: {image_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
\ No newline at end of file

From b7acfcd685182ae387d525812f9837da1a364e9a Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 3 Apr 2025 00:03:18 +0300
Subject: [PATCH 10/20] add TASK_SOFT_TIME_LIMIT

---
 app/db/db_methods.py   |  2 --
 app/tesseract_tasks.py | 12 ++++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 09609bbc..a52737fb 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -57,7 +57,6 @@ def add_image_text(tesseract_task_id, new_text):
     return result.matched_count > 0
     
 def add_tesseract_task_id(image_id, tesseract_task_id):
-    # image_id = ObjectId(image_id)
     result = images_collection.update_one(
         {'_id': image_id},
         {'$set': {'tesseract_task_id': tesseract_task_id}}
@@ -65,7 +64,6 @@ def add_tesseract_task_id(image_id, tesseract_task_id):
     return result.matched_count > 0
 
 def get_tesseract_task_id(image_id):
-    # image_id = ObjectId(image_id)
     image = images_collection.find_one({'_id': image_id})
     if image:
         return image.get('tesseract_task_id')
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index c878055d..30035bca 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -1,13 +1,15 @@
 import os
 from celery import Celery
+from celery.exceptions import SoftTimeLimitExceeded
 from celery.signals import worker_ready
 import pytesseract
 import cv2
 import numpy as np
 from root_logger import get_root_logger
 
-TASK_RETRY_COUNTDOWN = 60
+TASK_RETRY_COUNTDOWN = 30
 MAX_RETRIES = 2
+TASK_SOFT_TIME_LIMIT = 60
 
 logger = get_root_logger('tesseract_tasks')
 
@@ -26,7 +28,7 @@
 def at_start(sender, **k):
     logger.info("Tesseract worker is ready!")
 
-@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES)
+@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
 def tesseract_recognize(self, image_id, image_data):
     try:
         image_array = np.frombuffer(image_data, dtype=np.uint8)
@@ -39,7 +41,9 @@ def tesseract_recognize(self, image_id, image_data):
             text = ""
         logger.info(f"Текст успешно распознан для image_id: {image_id}")
         return text
-
+    except SoftTimeLimitExceeded:
+        logger.warning(f"Превышен мягкий лимит времени для image_id: {image_id}. Задача будет перезапущена.")
+        self.retry(countdown=TASK_RETRY_COUNTDOWN)
     except Exception as e:
         logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
         logger.info(f"Пустая строка записана для image_id: {image_id} из-за ошибки: {e}")
@@ -47,4 +51,4 @@ def tesseract_recognize(self, image_id, image_data):
             logger.error(f"Достигнуто максимальное количество попыток для image_id: {image_id}")
             return f"Ошибка: {e}"
         logger.info(f"Повторная попытка распознавания для image_id: {image_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
-        self.retry(countdown=TASK_RETRY_COUNTDOWN)
\ No newline at end of file
+        self.retry(countdown=TASK_RETRY_COUNTDOWN)

From 89ee03bd5945584b85798cf4679feed709db34ae Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Mon, 14 Apr 2025 22:25:54 +0300
Subject: [PATCH 11/20] first fix

---
 app/db/db_methods.py                          | 31 +++----
 app/db/db_types.py                            |  2 -
 app/main/check_packs/pack_config.py           |  1 +
 app/main/checks/report_checks/__init__.py     |  3 +-
 .../report_checks/image_quality_check.py      | 54 ++++++++++++
 .../checks/report_checks/image_text_check.py  | 68 ++-------------
 app/main/parser.py                            | 28 +-----
 .../reports/docx_uploader/docx_uploader.py    |  8 +-
 app/tasks.py                                  |  2 +-
 app/tesseract_tasks.py                        | 87 +++++++++++++++----
 10 files changed, 151 insertions(+), 133 deletions(-)
 create mode 100644 app/main/checks/report_checks/image_quality_check.py

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index a52737fb..f5713e85 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -27,6 +27,14 @@
 def get_client():
     return client
 
+def get_image(image_id):
+    image_id = ObjectId(image_id)
+    image = images_collection.find({'_id': image_id})
+    if image is not None:
+        return Image(image)
+    else:
+        return None
+
 def get_images(check_id):
     images = images_collection.find({'check_id': str(check_id)})
     if images is not None:
@@ -37,39 +45,24 @@ def get_images(check_id):
     else:
         return None
 
-def save_image_to_db(check_id, image_data, caption, image_size, text=None, tesseract_task_id=None):
+def save_image_to_db(check_id, image_data, caption, image_size, text=None):
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
         'caption': caption,
         'image_size': image_size,
-        'text' : text,
-        'tesseract_task_id': tesseract_task_id
+        'text' : text
     })
     result = images_collection.insert_one(image.pack())
     return result.inserted_id 
 
-def add_image_text(tesseract_task_id, new_text):
-    result = images_collection.update_one(
-        {'tesseract_task_id': tesseract_task_id},
-        {'$set': {'text': new_text}}
-    )
-    return result.matched_count > 0
-    
-def add_tesseract_task_id(image_id, tesseract_task_id):
+def add_image_text(image_id, new_text):
     result = images_collection.update_one(
         {'_id': image_id},
-        {'$set': {'tesseract_task_id': tesseract_task_id}}
+        {'$set': {'text': new_text}}
     )
     return result.matched_count > 0
 
-def get_tesseract_task_id(image_id):
-    image = images_collection.find_one({'_id': image_id})
-    if image:
-        return image.get('tesseract_task_id')
-    else:
-        return None
-
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
diff --git a/app/db/db_types.py b/app/db/db_types.py
index eb528748..3ece6f68 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -155,7 +155,6 @@ def __init__(self, dictionary=None):
         self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
         self.image_size = dictionary.get('image_size')  # Размер изображения в сантимерах
         self.text = dictionary.get('text')
-        self.tesseract_task_id = dictionary.get('tesseract_task_id')
 
     def pack(self):
         package = super().pack()
@@ -164,5 +163,4 @@ def pack(self):
         package['image_data'] = self.image_data
         package['image_size'] = self.image_size
         package['text'] = self.text
-        package['tesseract_task_id'] = self.tesseract_task_id
         return package
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index bb51c992..3d5e2834 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -47,6 +47,7 @@
     ['key_words_report_check'],
     ["empty_task_page_check"],
     ["image_text_check"],
+    ['image_quality_check'],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 3af7c4d2..bf1c3674 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -26,4 +26,5 @@
 from .template_name import ReportTemplateNameCheck
 from .key_words_check import KeyWordsReportCheck
 from .empty_task_page_check import EmptyTaskPageCheck
-from .image_text_check import ImageTextCheck
\ No newline at end of file
+from .image_text_check import ImageTextCheck
+from .image_quality_check import ImageQualityCheck
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
new file mode 100644
index 00000000..96df0f90
--- /dev/null
+++ b/app/main/checks/report_checks/image_quality_check.py
@@ -0,0 +1,95 @@
+from ..base_check import BaseReportCriterion, answer
+import cv2
+import numpy as np
+
+
+class ImageQualityCheck(BaseReportCriterion):
+    """Report criterion that flags blurry or low-information images.
+
+    Every image of the checked file is decoded with OpenCV and scored by the
+    variance of its Laplacian and by the Shannon entropy of its grayscale
+    histogram; an image scoring below either threshold is reported.
+    """
+
+    label = "Проверка качества изображений"
+    description = ''
+    id = 'image_quality_check'
+
+    # TODO: the default min_laplacian and min_entropy values still need tuning.
+    def __init__(self, file_info, min_laplacian=10, min_entropy=1):
+        """Remember the file's images and the acceptance thresholds.
+
+        :param file_info: forwarded unchanged to ``BaseReportCriterion``.
+        :param min_laplacian: lowest accepted variance of the Laplacian.
+        :param min_entropy: lowest accepted histogram entropy (log base 2).
+        """
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.min_laplacian = min_laplacian
+        self.min_entropy = min_entropy
+        # Scores of the most recently analysed image; None until one is scored.
+        self.laplacian_score = None
+        self.entropy_score = None
+
+    def check(self):
+        """Score every image and build the verdict for the whole file.
+
+        :return: ``answer(True, ...)`` when there are no images or every image
+            passes both thresholds, otherwise ``answer(False, ...)`` listing
+            one HTML line per problem found.
+        """
+        if not self.images:
+            return answer(True, 'Изображения не найдены!')
+
+        deny_list = []
+        for img in self.images:
+            unreadable = f"Изображение с подписью '{img.caption}' не может быть обработано.<br>"
+
+            # Missing or empty binary data cannot be decoded: report the image
+            # instead of letting np.frombuffer / cv2.imdecode raise.
+            if not img.image_data:
+                deny_list.append(unreadable)
+                continue
+
+            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
+            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+            # find_params copes with an undecodable (None) image by returning
+            # (None, None), so no stale score from a previous image is reused.
+            laplacian_score, entropy_score = self.find_params(img_cv)
+            if laplacian_score is None or entropy_score is None:
+                deny_list.append(unreadable)
+                continue
+
+            if laplacian_score < self.min_laplacian:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {laplacian_score} (минимум {self.min_laplacian}).<br>")
+
+            if entropy_score < self.min_entropy:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {entropy_score} (минимум {self.min_entropy}).<br>")
+
+        if deny_list:
+            return answer(False, f'Изображения нечитаемы! <br>Попробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.<br>{"".join(deny_list)}')
+        return answer(True, 'Изображения корректны!')
+
+    def find_params(self, image):
+        """Compute the sharpness and entropy scores of a decoded BGR image.
+
+        The scores are stored on the instance and also returned. Both are reset
+        first, so an image that cannot be scored never inherits the scores of
+        the image processed before it.
+
+        :param image: array returned by ``cv2.imdecode`` (may be None).
+        :return: ``(laplacian_score, entropy_score)``, or ``(None, None)`` when
+            the image is missing or empty.
+        """
+        self.laplacian_score = None
+        self.entropy_score = None
+        if image is None or image.size == 0:
+            return None, None
+
+        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
+        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
+        hist = hist / hist.sum()
+        # The tiny epsilon keeps log2 finite for empty histogram bins.
+        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
+        return self.laplacian_score, self.entropy_score
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
index 69bfb36e..bba1c6fe 100644
--- a/app/main/checks/report_checks/image_text_check.py
+++ b/app/main/checks/report_checks/image_text_check.py
@@ -1,77 +1,23 @@
-import re
 from ..base_check import BaseReportCriterion, answer
-import time
-from celery.result import AsyncResult
+
+SYMBOLS_SET = ['%', '1']
+MAX_SYMBOLS_PERCENTAGE = 0
+MAX_TEXT_DENSITY = 4
 
 class ImageTextCheck(BaseReportCriterion):
     label = "Проверка текста, считанного с изображений"
     description = ''
     id = 'image_text_check'
     # Подобрать значения для symbols_set, max_symbols_percentage, max_text_density
-    def __init__(self, file_info, symbols_set=['%', '1'], max_symbols_percentage=0, max_text_density=0, max_wait_time=30):
+    def __init__(self, file_info, symbols_set=['%', '1'], max_symbols_percentage=0, max_text_density=4):
         super().__init__(file_info)
         self.images = self.file.images
         self.symbols_set = symbols_set
         self.max_symbols_percentage = max_symbols_percentage
         self.max_text_density = max_text_density
-        self.max_wait_time = max_wait_time
 
     def check(self):
-        deny_list = []
         if self.images:
-            for image in self.images:
-                if image.text == '':
-                    continue
-                recognized_text = self.wait_for_text_recognition(image)
-                width, height = image.image_size
-                if not recognized_text:
-                    continue
-                text_density = self.calculate_text_density(recognized_text, width, height)
-                if text_density > self.max_text_density:
-                    deny_list.append(
-                        f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
-                        f"{text_density:.4f} (максимум {self.max_text_density}). Это может означать, что текст нечитаем.<br>"
-                    )
-                symbols_count = self.count_symbols_in_text(recognized_text, self.symbols_set)
-                text_length = len(recognized_text)
-                symbols_percentage = (symbols_count / text_length) * 100
-                if symbols_percentage > self.max_symbols_percentage:
-                    deny_list.append(
-                        f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
-                        f"{symbols_percentage:.2f}% (максимум {self.max_symbols_percentage}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
-                    )
-        else:
-            return answer(False, 'Изображения не найдены!')
-        if deny_list:
-            return answer(False, f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}')
+            return answer(True, 'Изображения проверяются!')
         else:
-            return answer(True, 'Текст на изображениях корректен!')
-
-    def count_symbols_in_text(self, text, symbols_set):
-        return sum(1 for char in text if char in symbols_set)
-
-    def calculate_text_density(self, text, width, height):
-        text_without_spaces = ''.join(text.split())
-        image_area = width * height
-        if image_area == 0:
-            return 0
-        return len(text_without_spaces) / image_area
-
-    def wait_for_text_recognition(self, image):
-        from app.db.db_methods import add_image_text
-        start_time = time.time()
-        task_id = image.tesseract_task_id
-        if not task_id:
-            return None
-
-        while time.time() - start_time < self.max_wait_time:
-            task_result = AsyncResult(task_id)
-            if task_result.state == 'SUCCESS':
-                recognized_text = task_result.result
-                recognized_text = re.sub(r'\s+', ' ', recognized_text)
-                image.text = recognized_text
-                add_image_text(task_id, recognized_text)
-                return recognized_text.strip()
-            time.sleep(1)
-
-        return None
+            return answer(True, 'Изображения не найдены!')
diff --git a/app/main/parser.py b/app/main/parser.py
index fb60a19d..dcb33b31 100644
--- a/app/main/parser.py
+++ b/app/main/parser.py
@@ -8,14 +8,10 @@
 from main.reports.md_uploader import MdUploader
 from utils import convert_to
 
-from os.path import basename
-from app.db.db_methods import add_check
-from app.db.db_types import Check
 
 logger = logging.getLogger('root_logger')
 
-def parse(filepath, pdf_filepath):
-    from app.db.db_methods import files_info_collection
+def parse(filepath, pdf_filepath, check_id):
 
     tmp_filepath = filepath.lower()
     try:
@@ -26,17 +22,6 @@ def parse(filepath, pdf_filepath):
                 new_filepath = convert_to(filepath, target_format='pptx')
 
             presentation = PresentationPPTX(new_filepath)
-
-            check = Check({
-                'filename': basename(new_filepath),
-            })
-
-            file_id = 0
-            file = files_info_collection.find_one({'name': basename(new_filepath)})
-            if file:
-                file_id = file['_id']
-
-            check_id = add_check(file_id, check)
             presentation.extract_images_with_captions(check_id)
             file_object = presentation
 
@@ -49,17 +34,6 @@ def parse(filepath, pdf_filepath):
 
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
-
-            check = Check({
-                'filename': basename(new_filepath),
-            })
-
-            file_id = 0
-            file = files_info_collection.find_one({'name': basename(new_filepath)})
-            if file:
-                file_id = file['_id']
-
-            check_id = add_check(file_id, check)
             docx.parse()
             docx.extract_images_with_captions(check_id)
             file_object = docx
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 3354c8a0..865c0d85 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -244,7 +244,7 @@ def show_chapters(self, work_type):
         return chapters_str
 
     def extract_images_with_captions(self, check_id):
-        from app.db.db_methods import save_image_to_db, get_images, add_tesseract_task_id
+        from app.db.db_methods import save_image_to_db, get_images
         from app.tesseract_tasks import tesseract_recognize
         
         emu_to_cm  = 360000
@@ -279,14 +279,12 @@ def extract_images_with_captions(self, check_id):
                                 caption = next_text
                                 break
                             next_paragraph_index += 1
-                        
-                        image_id = save_image_to_db(check_id, image_data, caption, (width_cm, height_cm))
-                        task = tesseract_recognize.delay(str(image_id), image_data)
-                        add_tesseract_task_id(image_id, task.id)
+                        save_image_to_db(check_id, image_data, caption, (width_cm, height_cm))
                         image_found = False
                         image_data = None 
                 
             self.images = get_images(check_id)
+            tesseract_recognize.delay(check_id)
                               
 
 
diff --git a/app/tasks.py b/app/tasks.py
index c7ba47df..10f1b275 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -52,7 +52,7 @@ def create_task(self, check_info):
     original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}")
     pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
     try:
-        updated_check = check(parse(original_filepath, pdf_filepath), check_obj)
+        updated_check = check(parse(original_filepath, pdf_filepath, check_id), check_obj)
         updated_check.is_ended = True
         updated_check.is_failed = False
         db_methods.update_check(updated_check)  # save to db
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 30035bca..126a4b0d 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -6,10 +6,13 @@
 import cv2
 import numpy as np
 from root_logger import get_root_logger
+from db import db_methods
+import re
+from .main.checks.report_checks.image_text_check import SYMBOLS_SET, MAX_SYMBOLS_PERCENTAGE, MAX_TEXT_DENSITY
 
-TASK_RETRY_COUNTDOWN = 30
+TASK_RETRY_COUNTDOWN = 60
 MAX_RETRIES = 2
-TASK_SOFT_TIME_LIMIT = 60
+TASK_SOFT_TIME_LIMIT = 120
 
 logger = get_root_logger('tesseract_tasks')
 
@@ -29,26 +32,76 @@ def at_start(sender, **k):
     logger.info("Tesseract worker is ready!")
 
 @celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
-def tesseract_recognize(self, image_id, image_data):
+def tesseract_recognize(self, check_id):
     try:
-        image_array = np.frombuffer(image_data, dtype=np.uint8)
-        img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
-        if img_cv is None:
-            raise ValueError("Не удалось декодировать изображение из двоичных данных")
-        text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
-        if text is None:
-            logger.warning(f"Tesseract вернул None для image_id: {image_id}.")
-            text = ""
-        logger.info(f"Текст успешно распознан для image_id: {image_id}")
-        return text
+        images = db_methods.get_images(check_id)
+        for image in images:
+            image_array = np.frombuffer(image.image_data, dtype=np.uint8)
+            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+            if img_cv is None:
+                raise ValueError("Не удалось декодировать изображение из двоичных данных")
+            text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
+            if text is None:
+                logger.warning(f"Tesseract вернул None для image_id: {image._id}.")
+                text = ""
+            logger.info(f"Текст успешно распознан для image_id: {image._id}")
+            
+            text = (re.sub(r'\s+', ' ', text)).strip()
+            db_methods.add_image_text(image._id, text)
+        update_ImageTextCheck(check_id)
+        
     except SoftTimeLimitExceeded:
-        logger.warning(f"Превышен мягкий лимит времени для image_id: {image_id}. Задача будет перезапущена.")
+        logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
     except Exception as e:
         logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
-        logger.info(f"Пустая строка записана для image_id: {image_id} из-за ошибки: {e}")
+        logger.info(f"Пустая строка записана для check_id: {check_id} из-за ошибки: {e}")
         if self.request.retries >= self.max_retries:
-            logger.error(f"Достигнуто максимальное количество попыток для image_id: {image_id}")
+            logger.error(f"Достигнуто максимальное количество попыток для check_id: {check_id}")
             return f"Ошибка: {e}"
-        logger.info(f"Повторная попытка распознавания для image_id: {image_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
+        logger.info(f"Повторная попытка распознавания для check_id: {check_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
+
+
+def update_ImageTextCheck(check_id):
+    updated_check = db_methods.get_check(check_id)
+    images = db_methods.get_images(check_id)
+    deny_list = []
+    for image in images:
+        width, height = image.image_size
+        text_density = calculate_text_density(image.text, width * height)
+        if text_density > MAX_TEXT_DENSITY:
+            deny_list.append(
+                f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
+                f"{text_density:.4f} (максимум {MAX_TEXT_DENSITY}). Это может означать, что текст нечитаем.<br>"
+            )
+        symbols_count = count_symbols_in_text(image.text)
+        text_length = len(image.text)
+        symbols_percentage = (symbols_count / text_length) * 100
+        if symbols_percentage > MAX_SYMBOLS_PERCENTAGE:
+            deny_list.append(
+                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
+                f"{symbols_percentage:.2f}% (максимум {MAX_SYMBOLS_PERCENTAGE}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
+            )
+    if deny_list:
+        update_criteria_result(updated_check, 'image_quality_check', [f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0)
+    else:
+        update_criteria_result(updated_check, 'image_quality_check', ['Текст на изображениях корректен!'], 1)
+    db_methods.update_check(updated_check)
+
+def update_criteria_result(check, criteria_id, new_verdict, new_score):
+    for criteria in check.enabled_checks:
+        if criteria["id"] == criteria_id:
+            criteria["verdict"] = new_verdict
+            criteria["score"] = new_score
+            return True
+    return False
+
+def count_symbols_in_text(text):
+    return sum(1 for char in text if char in SYMBOLS_SET)
+
+def calculate_text_density(text, image_area):
+    text_without_spaces = ''.join(text.split())
+    if image_area == 0:
+        return 0
+    return len(text_without_spaces) / image_area
\ No newline at end of file

From c59c4756857e3269d7a45dd9bd1106c1b268ebe9 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Wed, 16 Apr 2025 23:21:23 +0300
Subject: [PATCH 12/20] trial version

---
 app/db/db_methods.py   |  1 -
 app/db/db_types.py     |  1 +
 app/server.py          |  3 +-
 app/tasks.py           |  8 +++++-
 app/tesseract_tasks.py | 63 +++++++++++++++++++++++++-----------------
 5 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index f5713e85..be62c711 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -28,7 +28,6 @@ def get_client():
     return client
 
 def get_image(image_id):
-    image_id = ObjectId(image_id)
     image = images_collection.find({'_id': image_id})
     if image is not None:
         return Image(image)
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 3ece6f68..d4dd5d43 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -104,6 +104,7 @@ def __init__(self, dictionary=None):
         self.is_failed = dictionary.get('is_failed', None)
         self.is_ended = dictionary.get('is_ended', True)
         self.is_passed = dictionary.get('is_passed', int(self.score) == 1)
+        self.tesseract_result = dictionary.get('tesseract_result', -1)
 
     def calc_score(self):
         # check after implementation criterion pack
diff --git a/app/server.py b/app/server.py
index 366978d9..59365d29 100644
--- a/app/server.py
+++ b/app/server.py
@@ -236,7 +236,8 @@ def run_task():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback
+        'params_for_passback': current_user.params_for_passback,
+        'tesseract_result': -1
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
diff --git a/app/tasks.py b/app/tasks.py
index 10f1b275..d8a39a49 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -12,6 +12,7 @@
 from main.parser import parse
 from main.check_packs import BASE_PACKS
 from root_logger import get_root_logger
+from tesseract_tasks import update_tesseract_criteria_result
 
 config = ConfigParser()
 config.read('app/config.ini')
@@ -53,8 +54,13 @@ def create_task(self, check_info):
     pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
     try:
         updated_check = check(parse(original_filepath, pdf_filepath, check_id), check_obj)
-        updated_check.is_ended = True
         updated_check.is_failed = False
+        if updated_check.tesseract_result == -1:
+            updated_check.tesseract_result = 0
+            logger.info(f"Результат тессеракта{updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
+        else:
+            update_tesseract_criteria_result(updated_check)
+            logger.info(f"Результат тессеракта{updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
         db_methods.update_check(updated_check)  # save to db
         db_methods.mark_celery_task_as_finished(self.request.id)
 
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 126a4b0d..006dc397 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -8,7 +8,8 @@
 from root_logger import get_root_logger
 from db import db_methods
 import re
-from .main.checks.report_checks.image_text_check import SYMBOLS_SET, MAX_SYMBOLS_PERCENTAGE, MAX_TEXT_DENSITY
+from bson import ObjectId
+from main.checks.report_checks.image_text_check import SYMBOLS_SET, MAX_SYMBOLS_PERCENTAGE, MAX_TEXT_DENSITY
 
 TASK_RETRY_COUNTDOWN = 60
 MAX_RETRIES = 2
@@ -27,35 +28,35 @@
     'config': '--psm 6',
 }
 
-@worker_ready.connect
-def at_start(sender, **k):
-    logger.info("Tesseract worker is ready!")
-
 @celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
 def tesseract_recognize(self, check_id):
     try:
         images = db_methods.get_images(check_id)
-        for image in images:
-            image_array = np.frombuffer(image.image_data, dtype=np.uint8)
-            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
-            if img_cv is None:
-                raise ValueError("Не удалось декодировать изображение из двоичных данных")
-            text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
-            if text is None:
-                logger.warning(f"Tesseract вернул None для image_id: {image._id}.")
-                text = ""
-            logger.info(f"Текст успешно распознан для image_id: {image._id}")
-            
-            text = (re.sub(r'\s+', ' ', text)).strip()
-            db_methods.add_image_text(image._id, text)
-        update_ImageTextCheck(check_id)
-        
+        if images:
+            for image in images:
+                image_array = np.frombuffer(image.image_data, dtype=np.uint8)
+                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+                if img_cv is None:
+                    raise ValueError("Не удалось декодировать изображение из двоичных данных")
+                text = image.text
+                if not text:
+                    text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
+                if text.strip():
+                    logger.info(f"Текст успешно распознан для image_id: {image._id}")
+                else:
+                    logger.warning(f"Текст для image_id: {image._id} пустой.")
+                text = (re.sub(r'\s+', ' ', text)).strip()
+                try:
+                    db_methods.add_image_text(image._id, text)
+                except Exception as e:
+                    logger.error(f"Ошибка при сохранении текста для image_id: {image._id}: {e}", exc_info=True)
+                    raise
+            update_ImageTextCheck(check_id)
     except SoftTimeLimitExceeded:
         logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
     except Exception as e:
-        logger.error(f"Ошибка при распознавании текста: {e}", exc_info=True)
-        logger.info(f"Пустая строка записана для check_id: {check_id} из-за ошибки: {e}")
+        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
         if self.request.retries >= self.max_retries:
             logger.error(f"Достигнуто максимальное количество попыток для check_id: {check_id}")
             return f"Ошибка: {e}"
@@ -64,7 +65,6 @@ def tesseract_recognize(self, check_id):
 
 
 def update_ImageTextCheck(check_id):
-    updated_check = db_methods.get_check(check_id)
     images = db_methods.get_images(check_id)
     deny_list = []
     for image in images:
@@ -84,16 +84,27 @@ def update_ImageTextCheck(check_id):
                 f"{symbols_percentage:.2f}% (максимум {MAX_SYMBOLS_PERCENTAGE}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
             )
     if deny_list:
-        update_criteria_result(updated_check, 'image_quality_check', [f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0)
+        result = [[f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0]
+    else:
+        result = [['Текст на изображениях корректен!'], 1]
+    updated_check = db_methods.get_check(ObjectId(check_id))
+    if updated_check.tesseract_result == 0:
+        updated_check.tesseract_result = result
+        update_tesseract_criteria_result(updated_check)
     else:
-        update_criteria_result(updated_check, 'image_quality_check', ['Текст на изображениях корректен!'], 1)
+        updated_check.tesseract_result = result   
     db_methods.update_check(updated_check)
+    logger.info(f"Результат тессеракта мяу {updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
 
-def update_criteria_result(check, criteria_id, new_verdict, new_score):
+def update_tesseract_criteria_result(check):
+    criteria_id = 'image_quality_check'
+    new_verdict = check.tesseract_result[0]
+    new_score = check.tesseract_result[1]
     for criteria in check.enabled_checks:
         if criteria["id"] == criteria_id:
             criteria["verdict"] = new_verdict
             criteria["score"] = new_score
+            check.is_ended = True
             return True
     return False
 

From 3f254052d9d7d7749434bf7782c13d510d63f4e3 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 17 Apr 2025 01:38:07 +0300
Subject: [PATCH 13/20] correction of tesseract

---
 app/tasks.py           |  6 +-----
 app/tesseract_tasks.py | 48 ++++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/app/tasks.py b/app/tasks.py
index d8a39a49..8684c89e 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -55,12 +55,8 @@ def create_task(self, check_info):
     try:
         updated_check = check(parse(original_filepath, pdf_filepath, check_id), check_obj)
         updated_check.is_failed = False
-        if updated_check.tesseract_result == -1:
-            updated_check.tesseract_result = 0
-            logger.info(f"Результат тессеракта{updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
-        else:
+        if updated_check.tesseract_result != -1:
             update_tesseract_criteria_result(updated_check)
-            logger.info(f"Результат тессеракта{updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
         db_methods.update_check(updated_check)  # save to db
         db_methods.mark_celery_task_as_finished(self.request.id)
 
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 006dc397..099e1712 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -1,7 +1,6 @@
 import os
 from celery import Celery
 from celery.exceptions import SoftTimeLimitExceeded
-from celery.signals import worker_ready
 import pytesseract
 import cv2
 import numpy as np
@@ -10,9 +9,10 @@
 import re
 from bson import ObjectId
 from main.checks.report_checks.image_text_check import SYMBOLS_SET, MAX_SYMBOLS_PERCENTAGE, MAX_TEXT_DENSITY
+from main.check_packs.pack_config import BASE_REPORT_CRITERION
 
-TASK_RETRY_COUNTDOWN = 60
-MAX_RETRIES = 2
+TASK_RETRY_COUNTDOWN = 30
+MAX_RETRIES = 1
 TASK_SOFT_TIME_LIMIT = 120
 
 logger = get_root_logger('tesseract_tasks')
@@ -44,22 +44,22 @@ def tesseract_recognize(self, check_id):
                 if text.strip():
                     logger.info(f"Текст успешно распознан для image_id: {image._id}")
                 else:
-                    logger.warning(f"Текст для image_id: {image._id} пустой.")
-                text = (re.sub(r'\s+', ' ', text)).strip()
+                    logger.info(f"Текст для image_id: {image._id} пустой.")
                 try:
-                    db_methods.add_image_text(image._id, text)
+                    db_methods.add_image_text(image._id, (re.sub(r'\s+', ' ', text)).strip())
                 except Exception as e:
-                    logger.error(f"Ошибка при сохранении текста для image_id: {image._id}: {e}", exc_info=True)
-                    raise
-            update_ImageTextCheck(check_id)
+                    raise ValueError(f"Ошибка при сохранении текста для image_id: {image._id}: {e}")
+            try:
+                update_ImageTextCheck(check_id)
+            except Exception as e:
+                raise ValueError(f"Ошибка во время проверки текста: {e}")
     except SoftTimeLimitExceeded:
         logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
     except Exception as e:
-        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
         if self.request.retries >= self.max_retries:
-            logger.error(f"Достигнуто максимальное количество попыток для check_id: {check_id}")
-            return f"Ошибка: {e}"
+            add_tesseract_result(check_id,[[f"Ошибка при распознавании текста: {e}"], 0])
+        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
         logger.info(f"Повторная попытка распознавания для check_id: {check_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
         self.retry(countdown=TASK_RETRY_COUNTDOWN)
 
@@ -87,26 +87,24 @@ def update_ImageTextCheck(check_id):
         result = [[f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0]
     else:
         result = [['Текст на изображениях корректен!'], 1]
+    add_tesseract_result(check_id, result)
+
+
+def add_tesseract_result(check_id, result):
     updated_check = db_methods.get_check(ObjectId(check_id))
-    if updated_check.tesseract_result == 0:
-        updated_check.tesseract_result = result
+    updated_check.tesseract_result = result
+    if 'processing_time' in db_methods.get_celery_task_by_check(ObjectId(check_id)):
         update_tesseract_criteria_result(updated_check)
-    else:
-        updated_check.tesseract_result = result   
     db_methods.update_check(updated_check)
-    logger.info(f"Результат тессеракта мяу {updated_check.tesseract_result} записан, статус проверки {updated_check.is_ended}")
 
 def update_tesseract_criteria_result(check):
-    criteria_id = 'image_quality_check'
-    new_verdict = check.tesseract_result[0]
-    new_score = check.tesseract_result[1]
     for criteria in check.enabled_checks:
-        if criteria["id"] == criteria_id:
-            criteria["verdict"] = new_verdict
-            criteria["score"] = new_score
+        if criteria["id"] == 'image_text_check':
+            criteria["verdict"] = check.tesseract_result[0]
+            criteria["score"] = check.tesseract_result[1]
+            check.score = round(check.score - (1 - check.tesseract_result[1]) / len(BASE_REPORT_CRITERION), 3)
             check.is_ended = True
-            return True
-    return False
+            return
 
 def count_symbols_in_text(text):
     return sum(1 for char in text if char in SYMBOLS_SET)

From 7c195c80e6066fc202897041f1f8a900d2ccc07f Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 17 Apr 2025 15:50:04 +0300
Subject: [PATCH 14/20] fix update_tesseract_criteria_result

---
 app/routes/tasks.py | 6 ++++--
 app/tasks.py        | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/app/routes/tasks.py b/app/routes/tasks.py
index 2d7f48f2..0b8cc343 100644
--- a/app/routes/tasks.py
+++ b/app/routes/tasks.py
@@ -79,7 +79,8 @@ def run_task():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback
+        'params_for_passback': current_user.params_for_passback,
+        'tesseract_result': -1
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
@@ -138,7 +139,8 @@ def run_md_task_by_api():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': None
+        'params_for_passback': current_user.params_for_passback,
+        'tesseract_result': -1
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
diff --git a/app/tasks.py b/app/tasks.py
index 8684c89e..3a901b66 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -55,6 +55,7 @@ def create_task(self, check_info):
     try:
         updated_check = check(parse(original_filepath, pdf_filepath, check_id), check_obj)
         updated_check.is_failed = False
+        updated_check.tesseract_result = db_methods.get_check(check_obj._id).tesseract_result
         if updated_check.tesseract_result != -1:
             update_tesseract_criteria_result(updated_check)
         db_methods.update_check(updated_check)  # save to db

From 40f51beb5861171a0ce2a99ed29a9b2482ee7e9c Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Fri, 18 Apr 2025 01:18:12 +0300
Subject: [PATCH 15/20] update 469_extend_data_storage_model

---
 app/db/db_methods.py                          | 21 +++++++++--
 app/db/db_types.py                            | 11 +++++-
 .../reports/docx_uploader/docx_uploader.py    |  6 +++-
 app/main/reports/pasre_file/parse_file.py     | 36 +++++++++++++++++++
 app/tasks.py                                  | 13 +++++--
 app/tesseract_tasks.py                        |  6 ++--
 6 files changed, 84 insertions(+), 9 deletions(-)
 create mode 100644 app/main/reports/pasre_file/parse_file.py

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index b3eb9e31..320be1be 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -18,6 +18,7 @@
 checks_collection = db['checks']
 consumers_collection = db['consumers']
 criteria_pack_collection = db['criteria_pack']
+parsed_texts_collection = db['parsed_texts']
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
@@ -44,17 +45,21 @@ def get_images(check_id):
     else:
         return None
 
-def save_image_to_db(check_id, image_data, caption, image_size, text=None):
+def save_image_to_db(check_id, image_data, caption, image_size, text=None, page=None):
     image = Image({
         'check_id': check_id,
         'image_data': image_data,
         'caption': caption,
         'image_size': image_size,
-        'text' : text
+        'text' : text,
+        'page' : page,
     })
     result = images_collection.insert_one(image.pack())
     return result.inserted_id 
 
+def update_image(image):
+    return bool(images_collection.find_one_and_replace({'_id': image._id}, image.pack()))
+
 def add_image_text(image_id, new_text):
     result = images_collection.update_one(
         {'_id': image_id},
@@ -62,6 +67,12 @@ def add_image_text(image_id, new_text):
     )
     return result.matched_count > 0
 
+def add_image_page(image_id, page):
+    result = images_collection.update_one(
+        {'_id': image_id},
+        {'$set': {'page': page}}
+    )
+    return result.matched_count > 0
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
@@ -181,6 +192,12 @@ def add_check(file_id, check):
 def update_check(check):
     return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))
 
+def add_parsed_text(check_id, parsed_text):
+    result = parsed_texts_collection.update_one({'filename': parsed_text.filename}, {'$set': parsed_text.pack()}, upsert=True)
+    if result.upserted_id: parsed_texts_id = result.upserted_id
+    else: parsed_texts_id = parsed_texts_collection.find_one({'filename': parsed_text.filename})['_id']
+    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
+    return parsed_texts_id
 
 def write_pdf(filename, filepath):
     converted_filepath = convert_to(filepath, target_format='pdf')
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 6d409434..8d40bf00 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -160,7 +160,8 @@ def __init__(self, dictionary=None):
         self.caption = dictionary.get('caption', '')  # Подпись к изображению
         self.image_data = dictionary.get('image_data')  # Файл изображения в формате bindata
         self.image_size = dictionary.get('image_size')  # Размер изображения в сантимерах
-        self.text = dictionary.get('text')
+        self.text = dictionary.get('text', None)
+        self.page = dictionary.get('page', None)
 
     def pack(self):
         package = super().pack()
@@ -169,4 +170,12 @@ def pack(self):
         package['image_data'] = self.image_data
         package['image_size'] = self.image_size
         package['text'] = self.text
+        package['page'] = self.page
         return package
+
+class ParsedText(PackableWithId):
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        dictionary = dictionary or {}
+        self.filename = dictionary.get('filename', '')
+        self.parsed_chapters = dictionary.get('parsed_chapters', [])
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 865c0d85..b98553c8 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -250,6 +250,7 @@ def extract_images_with_captions(self, check_id):
         emu_to_cm  = 360000
         image_found = False
         image_data = None
+        image_style="ВКР_Подпись для рисунков"
         if not self.images:
             for i, paragraph in enumerate(self.file.paragraphs):
                 for run in paragraph.runs:
@@ -274,8 +275,11 @@ def extract_images_with_captions(self, check_id):
                         next_paragraph_index = i + 1
                         while next_paragraph_index < len(self.file.paragraphs):
                             next_paragraph = self.file.paragraphs[next_paragraph_index]
+                            style_name = next_paragraph.style.name.lower()
                             next_text = next_paragraph.text.strip()
-                            if next_text and not any("graphic" in r._element.xml for r in next_paragraph.runs):
+                            if any("graphic" in r._element.xml for r in next_paragraph.runs):
+                                break
+                            elif next_text and style_name == image_style.lower() and 'Рисунок' in next_text:
                                 caption = next_text
                                 break
                             next_paragraph_index += 1
diff --git a/app/main/reports/pasre_file/parse_file.py b/app/main/reports/pasre_file/parse_file.py
new file mode 100644
index 00000000..d6f272f6
--- /dev/null
+++ b/app/main/reports/pasre_file/parse_file.py
@@ -0,0 +1,36 @@
+import re
+from db import db_methods
+
+def parse_headers_and_pages_and_images(chapters, docx):
+    text_on_page = docx.pdf_file.get_text_on_page()
+    images = docx.images
+    for page, text in text_on_page.items():
+        text = re.sub(r"(-\n)", "", text)
+        text = re.sub(r"\s\n", " ", text)
+        if "СОДЕРЖАНИЕ" in text:
+            continue
+        for chapter in chapters:
+            if chapter["header"] in text:
+                chapter["start_page"] = page
+        for image in images:
+            if image.caption in text:
+                db_methods.add_image_page(image._id, page)
+    for chapter in chapters:
+        for image in images:
+            if image.caption in chapter["text"]:
+                chapter["images"].append(image._id)
+    return chapters
+
+
+def parse_chapters(docx):
+    chapters = []
+    for chapter in docx.chapters:
+        head = chapter["styled_text"]["text"]
+        if "ПРИЛОЖЕНИЕ" in head:
+            head = head.split(".")[0]
+        if chapter["child"] != [] and "heading" in chapter["style"]:
+            temp_text = ""
+            for i in range(len(chapter["child"])):
+                temp_text += chapter["child"][i]["styled_text"]["text"]
+            chapters.append({"header": head, "start_page": 0, "text": temp_text, "images": []})
+    return chapters
\ No newline at end of file
diff --git a/app/tasks.py b/app/tasks.py
index 3a901b66..8b9db706 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -6,8 +6,9 @@
 from celery.signals import worker_ready
 
 from passback_grades import run_passback
+from main.reports.pasre_file import parse_file
 from db import db_methods
-from db.db_types import Check
+from db.db_types import Check, ParsedText
 from main.checker import check
 from main.parser import parse
 from main.check_packs import BASE_PACKS
@@ -53,11 +54,19 @@ def create_task(self, check_info):
     original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}")
     pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
     try:
-        updated_check = check(parse(original_filepath, pdf_filepath, check_id), check_obj)
+        parsed_file_object = parse(original_filepath, pdf_filepath, check_id)
+        parsed_file_object.make_chapters(check_obj.file_type['report_type'])
+        parsed_file_object.make_headers(check_obj.file_type['report_type'])
+        chapters = parse_file.parse_chapters(parsed_file_object)
+        
+        updated_check = check(parsed_file_object, check_obj)
         updated_check.is_failed = False
         updated_check.tesseract_result = db_methods.get_check(check_obj._id).tesseract_result
         if updated_check.tesseract_result != -1:
             update_tesseract_criteria_result(updated_check)
+        parsed_text = ParsedText(dict(filename=check_info['filename']))
+        parsed_text.parsed_chapters = parse_file.parse_headers_and_pages_and_images(chapters, parsed_file_object)
+        db_methods.add_parsed_text(check_id, parsed_text)
         db_methods.update_check(updated_check)  # save to db
         db_methods.mark_celery_task_as_finished(self.request.id)
 
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 099e1712..c9080980 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -42,13 +42,13 @@ def tesseract_recognize(self, check_id):
                 if not text:
                     text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
                 if text.strip():
-                    logger.info(f"Текст успешно распознан для image_id: {image._id}")
+                    logger.info(f"Текст успешно распознан для изображения с подписью: {image.caption}")
                 else:
-                    logger.info(f"Текст для image_id: {image._id} пустой.")
+                    logger.info(f"Текст для изображения с подписью: {image.caption} пустой.")
                 try:
                     db_methods.add_image_text(image._id, (re.sub(r'\s+', ' ', text)).strip())
                 except Exception as e:
-                    raise ValueError(f"Ошибка при сохранении текста для image_id: {image._id}: {e}")
+                    raise ValueError(f"Ошибка при сохранении текста для изображения с подписью: {image.caption}: {e}")
             try:
                 update_ImageTextCheck(check_id)
             except Exception as e:

From 24eb092fcb0d4bfb2e8e4b14f8dc9ae637c16aa1 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <darcenrall@gmail.com>
Date: Tue, 22 Apr 2025 23:29:23 +0300
Subject: [PATCH 16/20] update docker base tag

---
 Dockerfile      | 4 ++--
 Dockerfile_base | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 00795845..a4182c9e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,10 +7,10 @@ RUN npm install && npm install webpack
 ADD ./assets ./assets
 RUN npm run build
 
-FROM dvivanov/dis-base:v0.4
+FROM dvivanov/dis-base:v0.5
 
 LABEL project='dis'
-LABEL version='0.4'
+LABEL version='0.5'
 
 WORKDIR /usr/src/project
 
diff --git a/Dockerfile_base b/Dockerfile_base
index 12a02028..d1724abe 100644
--- a/Dockerfile_base
+++ b/Dockerfile_base
@@ -1,7 +1,7 @@
 FROM python:3.10-slim-bullseye
 
 LABEL project='dis'
-LABEL version='0.4-base'
+LABEL version='0.5-base'
 
 ENV LANG en_US.UTF-8
 ENV TZ=Europe/Moscow

From 57bee013b89f8496554288ab39bcde85fcb00203 Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 24 Apr 2025 21:52:54 +0300
Subject: [PATCH 17/20] address review remarks on tesseract task handling

---
 app/db/db_methods.py                          | 38 +++++++++
 app/db/db_types.py                            |  1 -
 app/main/check_packs/pack_config.py           |  4 +-
 .../report_checks/image_quality_check.py      |  6 +-
 .../checks/report_checks/image_text_check.py  | 11 ++-
 .../reports/docx_uploader/docx_uploader.py    |  2 -
 app/routes/tasks.py                           |  6 +-
 app/tasks.py                                  |  5 +-
 app/tesseract_tasks.py                        | 81 +++++++++++++------
 docker-compose.yml                            |  2 +-
 10 files changed, 111 insertions(+), 45 deletions(-)

diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 320be1be..3b797104 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -22,6 +22,7 @@
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
+celery_tesseract_collection = db['celery_tesseract']
 images_collection = db['images']  # коллекция для хранения изображений
 
 
@@ -496,3 +497,40 @@ def get_celery_task(celery_task_id):
 
 def get_celery_task_by_check(check_id):
     return celery_check_collection.find_one({'check_id': check_id})
+
+
+def get_celery_task_status_by_check(check_id):
+    celery_task = get_celery_task_by_check(check_id)
+    if celery_task and 'finished_at' in celery_task:
+        return True
+    return False
+
+
+def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
+    return celery_tesseract_collection.insert_one(
+        {'celery_tesseract_task_id': celery_tesseract_task_id, 'check_id': check_id, 'started_at': datetime.now()}).inserted_id
+    
+    
+def get_celery_tesseract_task_status_by_check(check_id):
+    celery_tesseract_task = get_celery_tesseract_task_by_check(check_id)
+    if celery_tesseract_task and 'finished_at' in celery_tesseract_task:
+        return True
+    return False
+
+
+def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
+    celery_tesseract_task = get_celery_tesseract_task_by_check(check_id)
+    if not celery_tesseract_task: return
+    if finished_time is None: finished_time = datetime.now()
+    return celery_tesseract_collection.update_one({'check_id': check_id}, {
+        '$set': {'finished_at': finished_time,
+                 'tesseract_result': tesseract_result,
+                 'processing_time': (finished_time - celery_tesseract_task['started_at']).total_seconds()}})
+
+
+def get_celery_tesseract_task(celery_tesseract_task_id):
+    return celery_tesseract_collection.find_one({'celery_tesseract_task_id': celery_tesseract_task_id})
+
+
+def get_celery_tesseract_task_by_check(check_id):
+    return celery_tesseract_collection.find_one({'check_id': check_id})
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 8d40bf00..53d3a07f 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -104,7 +104,6 @@ def __init__(self, dictionary=None):
         self.is_failed = dictionary.get('is_failed', None)
         self.is_ended = dictionary.get('is_ended', True)
         self.is_passed = dictionary.get('is_passed', int(self.score) == 1)
-        self.tesseract_result = dictionary.get('tesseract_result', -1)
 
     def calc_score(self):
         # check after implementation criterion pack
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 93e65436..89b1af43 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -22,6 +22,8 @@
 ]
 BASE_REPORT_CRITERION = [
     ["simple_check"],
+    ["image_text_check"],
+    ['image_quality_check'],
     ["banned_words_in_literature"],
     ["page_counter"],
     ["image_share_check"],
@@ -45,8 +47,6 @@
     ["max_abstract_size_check"],
     ["theme_in_report_check"],
     ["empty_task_page_check"],
-    ["image_text_check"],
-    ['image_quality_check'],
     ["water_in_the_text_check"],
 ]
 
diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
index 96df0f90..d069fe94 100644
--- a/app/main/checks/report_checks/image_quality_check.py
+++ b/app/main/checks/report_checks/image_quality_check.py
@@ -7,7 +7,7 @@ class ImageQualityCheck(BaseReportCriterion):
     description = ''
     id = 'image_quality_check'
     # необходимо подобрать min_laplacian и min_entropy
-    def __init__(self, file_info, min_laplacian=10, min_entropy=1):
+    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
         super().__init__(file_info)
         self.images = self.file.images
         self.min_laplacian = min_laplacian
@@ -33,10 +33,10 @@ def check(self):
                     continue
                 
                 if self.laplacian_score < self.min_laplacian:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score} (минимум {self.min_laplacian}).<br>")
+                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).<br>")
                 
                 if self.entropy_score < self.min_entropy:
-                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score} (минимум {self.min_entropy}).<br>")
+                    deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).<br>")
         else: 
             return answer(True, 'Изображения не найдены!')
         if deny_list:
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
index bba1c6fe..0c5add85 100644
--- a/app/main/checks/report_checks/image_text_check.py
+++ b/app/main/checks/report_checks/image_text_check.py
@@ -1,8 +1,5 @@
 from ..base_check import BaseReportCriterion, answer
 
-SYMBOLS_SET = ['%', '1']
-MAX_SYMBOLS_PERCENTAGE = 0
-MAX_TEXT_DENSITY = 4
 
 class ImageTextCheck(BaseReportCriterion):
     label = "Проверка текста, считанного с изображений"
@@ -17,7 +14,15 @@ def __init__(self, file_info, symbols_set=['%', '1'], max_symbols_percentage=0,
         self.max_text_density = max_text_density
 
     def check(self):
+        from app.tesseract_tasks import tesseract_recognize, callback_task
+        from db.db_methods import add_celery_tesseract_task
         if self.images:
+            tesseract_task = tesseract_recognize.apply_async(
+                args=[self.images[0].check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
+                link=callback_task.s(self.images[0].check_id),
+                link_error=callback_task.s(self.images[0].check_id)
+            )
+            add_celery_tesseract_task(tesseract_task.id, self.images[0].check_id)
             return answer(True, 'Изображения проверяются!')
         else:
             return answer(True, 'Изображения не найдены!')
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index b98553c8..1c52295c 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -245,7 +245,6 @@ def show_chapters(self, work_type):
 
     def extract_images_with_captions(self, check_id):
         from app.db.db_methods import save_image_to_db, get_images
-        from app.tesseract_tasks import tesseract_recognize
         
         emu_to_cm  = 360000
         image_found = False
@@ -288,7 +287,6 @@ def extract_images_with_captions(self, check_id):
                         image_data = None 
                 
             self.images = get_images(check_id)
-            tesseract_recognize.delay(check_id)
                               
 
 
diff --git a/app/routes/tasks.py b/app/routes/tasks.py
index 0b8cc343..75a19481 100644
--- a/app/routes/tasks.py
+++ b/app/routes/tasks.py
@@ -79,8 +79,7 @@ def run_task():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback,
-        'tesseract_result': -1
+        'params_for_passback': current_user.params_for_passback
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
@@ -139,8 +138,7 @@ def run_md_task_by_api():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback,
-        'tesseract_result': -1
+        'params_for_passback': current_user.params_for_passback
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
diff --git a/app/tasks.py b/app/tasks.py
index 8b9db706..e4510ce4 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -61,12 +61,11 @@ def create_task(self, check_info):
         
         updated_check = check(parsed_file_object, check_obj)
         updated_check.is_failed = False
-        updated_check.tesseract_result = db_methods.get_check(check_obj._id).tesseract_result
-        if updated_check.tesseract_result != -1:
-            update_tesseract_criteria_result(updated_check)
         parsed_text = ParsedText(dict(filename=check_info['filename']))
         parsed_text.parsed_chapters = parse_file.parse_headers_and_pages_and_images(chapters, parsed_file_object)
         db_methods.add_parsed_text(check_id, parsed_text)
+        if db_methods.get_celery_tesseract_task_status_by_check(check_id):
+            update_tesseract_criteria_result(updated_check)
         db_methods.update_check(updated_check)  # save to db
         db_methods.mark_celery_task_as_finished(self.request.id)
 
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index c9080980..53ec7b27 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -1,6 +1,7 @@
 import os
+import time
 from celery import Celery
-from celery.exceptions import SoftTimeLimitExceeded
+from celery.exceptions import SoftTimeLimitExceeded, MaxRetriesExceededError
 import pytesseract
 import cv2
 import numpy as np
@@ -8,10 +9,10 @@
 from db import db_methods
 import re
 from bson import ObjectId
-from main.checks.report_checks.image_text_check import SYMBOLS_SET, MAX_SYMBOLS_PERCENTAGE, MAX_TEXT_DENSITY
 from main.check_packs.pack_config import BASE_REPORT_CRITERION
 
 TASK_RETRY_COUNTDOWN = 30
+SOFT_TIME_LIMIT_FOR_CALLBACK = 30
 MAX_RETRIES = 1
 TASK_SOFT_TIME_LIMIT = 120
 
@@ -29,7 +30,7 @@
 }
 
 @celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
-def tesseract_recognize(self, check_id):
+def tesseract_recognize(self, check_id, symbols_set, max_symbols_percentage, max_text_density):
     try:
         images = db_methods.get_images(check_id)
         if images:
@@ -37,51 +38,78 @@ def tesseract_recognize(self, check_id):
                 image_array = np.frombuffer(image.image_data, dtype=np.uint8)
                 img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
                 if img_cv is None:
-                    raise ValueError("Не удалось декодировать изображение из двоичных данных")
+                    raise ValueError(f"Не удалось декодировать изображение с подписью '{image.caption}' из двоичных данных")
                 text = image.text
                 if not text:
                     text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
                 if text.strip():
-                    logger.info(f"Текст успешно распознан для изображения с подписью: {image.caption}")
+                    logger.info(f"Текст успешно распознан для изображения с подписью '{image.caption}'")
                 else:
-                    logger.info(f"Текст для изображения с подписью: {image.caption} пустой.")
+                    logger.info(f"Текст для изображения с подписью '{image.caption}' пустой.")
                 try:
                     db_methods.add_image_text(image._id, (re.sub(r'\s+', ' ', text)).strip())
                 except Exception as e:
-                    raise ValueError(f"Ошибка при сохранении текста для изображения с подписью: {image.caption}: {e}")
+                    raise ValueError(f"Ошибка при сохранении текста для изображения с подписью '{image.caption}': {e}")
             try:
-                update_ImageTextCheck(check_id)
+                update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density)
             except Exception as e:
                 raise ValueError(f"Ошибка во время проверки текста: {e}")
     except SoftTimeLimitExceeded:
         logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
-        self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        try:
+            self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        except MaxRetriesExceededError:
+            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
+            add_tesseract_result(check_id, [[f"Превышен лимит времени и попыток"], 0])
     except Exception as e:
-        if self.request.retries >= self.max_retries:
-            add_tesseract_result(check_id,[[f"Ошибка при распознавании текста: {e}"], 0])
         logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
-        logger.info(f"Повторная попытка распознавания для check_id: {check_id}. Попытка {self.request.retries + 1} из {self.max_retries}.")
-        self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        try:
+            self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        except MaxRetriesExceededError:
+            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
+            add_tesseract_result(check_id,[[f"Ошибка при распознавании текста: {e}"], 0])
+
+
+@celery.task(name="callback_task", queue='callback-queue', soft_time_limit=SOFT_TIME_LIMIT_FOR_CALLBACK)
+def callback_task(result, check_id):
+    try:
+        time.sleep(10)
+        check = db_methods.get_check(ObjectId(check_id))
+        if db_methods.get_celery_task_status_by_check(check_id):
+            if check.is_ended:
+                logger.info(f"Проверка успешно завершена для check_id: {check_id}")
+                return
+            update_tesseract_criteria_result(check)
+            db_methods.update_check(check)
+            logger.info(f"Проверка успешно обновлена для check_id: {check_id}")
+            return
+        else:
+            logger.info(f"Задачи create_task и tesseract_recognize для check_id: {check_id} обрабатываются корректно. Состояние гонки исключено.")
+            return
+    except SoftTimeLimitExceeded:
+        logger.warning(f"Превышен мягкий лимит времени для callback_task с check_id: {check_id}.")
+    except Exception as e:
+        logger.error(f"Ошибка в callback_task для check_id: {check_id}: {e}")
 
 
-def update_ImageTextCheck(check_id):
+def update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density):
     images = db_methods.get_images(check_id)
     deny_list = []
     for image in images:
         width, height = image.image_size
         text_density = calculate_text_density(image.text, width * height)
-        if text_density > MAX_TEXT_DENSITY:
+        if text_density > max_text_density:
             deny_list.append(
                 f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
-                f"{text_density:.4f} (максимум {MAX_TEXT_DENSITY}). Это может означать, что текст нечитаем.<br>"
+                f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.<br>"
             )
-        symbols_count = count_symbols_in_text(image.text)
+        symbols_count = count_symbols_in_text(image.text, symbols_set)
         text_length = len(image.text)
         symbols_percentage = (symbols_count / text_length) * 100
-        if symbols_percentage > MAX_SYMBOLS_PERCENTAGE:
+        if symbols_percentage > max_symbols_percentage:
             deny_list.append(
                 f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
-                f"{symbols_percentage:.2f}% (максимум {MAX_SYMBOLS_PERCENTAGE}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
+                f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
             )
     if deny_list:
         result = [[f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0]
@@ -92,22 +120,23 @@ def update_ImageTextCheck(check_id):
 
 def add_tesseract_result(check_id, result):
     updated_check = db_methods.get_check(ObjectId(check_id))
-    updated_check.tesseract_result = result
-    if 'processing_time' in db_methods.get_celery_task_by_check(ObjectId(check_id)):
+    db_methods.mark_celery_tesseract_task_as_finished_by_check(check_id, result)
+    if db_methods.get_celery_task_status_by_check(check_id):
         update_tesseract_criteria_result(updated_check)
     db_methods.update_check(updated_check)
 
 def update_tesseract_criteria_result(check):
+    tesseract_task = db_methods.get_celery_tesseract_task_by_check(str(check._id))
     for criteria in check.enabled_checks:
         if criteria["id"] == 'image_text_check':
-            criteria["verdict"] = check.tesseract_result[0]
-            criteria["score"] = check.tesseract_result[1]
-            check.score = round(check.score - (1 - check.tesseract_result[1]) / len(BASE_REPORT_CRITERION), 3)
+            criteria["verdict"] = tesseract_task['tesseract_result'][0]
+            criteria["score"] = tesseract_task['tesseract_result'][1]
+            check.score = round(check.score - (1 - tesseract_task['tesseract_result'][1]) / len(BASE_REPORT_CRITERION), 3)
             check.is_ended = True
             return
 
-def count_symbols_in_text(text):
-    return sum(1 for char in text if char in SYMBOLS_SET)
+def count_symbols_in_text(text, symbols_set):
+    return sum(1 for char in text if char in symbols_set)
 
 def calculate_text_density(text, image_area):
     text_without_spaces = ''.join(text.split())
diff --git a/docker-compose.yml b/docker-compose.yml
index d5fc98f9..5c1cf6d3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -78,7 +78,7 @@ services:
   tesseract_worker:
     image: document_insight_system_image
     restart: always
-    command: celery --app=app.tesseract_tasks.celery worker -n tesseract@worker -Q tesseract-queue --loglevel=info
+    command: celery --app=app.tesseract_tasks.celery worker -n tesseract@worker -Q tesseract-queue,callback-queue --loglevel=info
     environment:
       - CELERY_BROKER_URL=${REDIS_URL}
       - CELERY_RESULT_BACKEND=${REDIS_URL}

From 3b18e36d700bb847ba9012fc725ec22ae750fa2b Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Thu, 24 Apr 2025 22:23:04 +0300
Subject: [PATCH 18/20] revert params_for_passback typo in run_md_task_by_api

---
 app/routes/tasks.py    | 2 +-
 app/tesseract_tasks.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/app/routes/tasks.py b/app/routes/tasks.py
index 75a19481..2d7f48f2 100644
--- a/app/routes/tasks.py
+++ b/app/routes/tasks.py
@@ -138,7 +138,7 @@ def run_md_task_by_api():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback
+        'params_for_passback': None
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 53ec7b27..15118d7c 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -125,6 +125,7 @@ def add_tesseract_result(check_id, result):
         update_tesseract_criteria_result(updated_check)
     db_methods.update_check(updated_check)
 
+
 def update_tesseract_criteria_result(check):
     tesseract_task = db_methods.get_celery_tesseract_task_by_check(str(check._id))
     for criteria in check.enabled_checks:
@@ -135,9 +136,11 @@ def update_tesseract_criteria_result(check):
             check.is_ended = True
             return
 
+
 def count_symbols_in_text(text, symbols_set):
     return sum(1 for char in text if char in symbols_set)
 
+
 def calculate_text_density(text, image_area):
     text_without_spaces = ''.join(text.split())
     if image_area == 0:

From 050163a998cb4900cf5ad519353946f4892e040a Mon Sep 17 00:00:00 2001
From: Dariiiii <usadariaa@yandex.ru>
Date: Sat, 10 May 2025 17:22:07 +0300
Subject: [PATCH 19/20] fix tesseract checks: pass ObjectId to status lookup, skip images without text, clamp score at 0

---
 app/tesseract_tasks.py | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
index 15118d7c..e6175243 100644
--- a/app/tesseract_tasks.py
+++ b/app/tesseract_tasks.py
@@ -75,7 +75,7 @@ def callback_task(result, check_id):
     try:
         time.sleep(10)
         check = db_methods.get_check(ObjectId(check_id))
-        if db_methods.get_celery_task_status_by_check(check_id):
+        if db_methods.get_celery_task_status_by_check(ObjectId(check_id)):
             if check.is_ended:
                 logger.info(f"Проверка успешно завершена для check_id: {check_id}")
                 return
@@ -96,21 +96,22 @@ def update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_tex
     images = db_methods.get_images(check_id)
     deny_list = []
     for image in images:
-        width, height = image.image_size
-        text_density = calculate_text_density(image.text, width * height)
-        if text_density > max_text_density:
-            deny_list.append(
-                f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
-                f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.<br>"
-            )
-        symbols_count = count_symbols_in_text(image.text, symbols_set)
-        text_length = len(image.text)
-        symbols_percentage = (symbols_count / text_length) * 100
-        if symbols_percentage > max_symbols_percentage:
-            deny_list.append(
-                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
-                f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
-            )
+        if image.text:
+            width, height = image.image_size
+            text_density = calculate_text_density(image.text, width * height)
+            if text_density > max_text_density:
+                deny_list.append(
+                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
+                    f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.<br>"
+                )
+            symbols_count = count_symbols_in_text(image.text, symbols_set)
+            text_length = len(image.text)
+            symbols_percentage = (symbols_count / text_length) * 100
+            if symbols_percentage > max_symbols_percentage:
+                deny_list.append(
+                    f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
+                    f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.<br>"
+                )
     if deny_list:
         result = [[f'Проблемы с текстом на изображениях! <br>{"".join(deny_list)}'], 0]
     else:
@@ -132,7 +133,7 @@ def update_tesseract_criteria_result(check):
         if criteria["id"] == 'image_text_check':
             criteria["verdict"] = tesseract_task['tesseract_result'][0]
             criteria["score"] = tesseract_task['tesseract_result'][1]
-            check.score = round(check.score - (1 - tesseract_task['tesseract_result'][1]) / len(BASE_REPORT_CRITERION), 3)
+            check.score = max(0, round(check.score - (1 - tesseract_task['tesseract_result'][1]) / len(BASE_REPORT_CRITERION), 3))
             check.is_ended = True
             return
 

From 5796e5f5f234fbb5f957f8d296e15d6e6b7708f4 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <darcenrall@gmail.com>
Date: Mon, 10 Nov 2025 22:50:09 +0300
Subject: [PATCH 20/20] update tesseract_worker volume

---
 docker-compose.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 998aa6e8..c8795414 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -84,7 +84,7 @@ services:
       - redis
       - mongodb
     volumes:
-      - presentation_files:/usr/src/project/files/
+      - files:/usr/src/project/files/
       - "/etc/timezone:/etc/timezone:ro"
       - "/etc/localtime:/etc/localtime:ro"
     cpuset: ${CONTAINER_CPU:-0-1}