diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 0000000..528f30c
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 0000000..e69de29
diff --git a/.dvcignore b/.dvcignore
new file mode 100644
index 0000000..5197305
--- /dev/null
+++ b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/.gitignore b/.gitignore
index 6c10c35..0e991d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,10 +55,9 @@ MANIFEST
 mlruns/
 
 # Data
-data
 data/*
 
 # Cookiecutter
 hack_digital_transformation/
 uv.lock
-.coverage
\ No newline at end of file
+.coverage
diff --git a/job_config.yaml b/job_config.yaml
index ae1851d..59111d0 100644
--- a/job_config.yaml
+++ b/job_config.yaml
@@ -1,20 +1,24 @@
-name: digital_hack_ml_job  # Уникальное имя задания
-desc: "Джоба для проведения экспериментов по созданию CV модели для определения местоположения по фотографии" # Описание
+name: digital_hack_ml_job
+desc: "Джоба для проведения экспериментов по созданию CV модели для определения местоположения по фотографии"
 
-# Команда для запуска. ${VARIABLE} - шаблоны, заменяемые на значения из `inputs`
-cmd: python src/engine/main.py #--input ${INPUT_FILE}
+# Launch command. ${ALIAS} is replaced with the path of the matching entry declared under `inputs`.
+cmd: python src/engine/main.py --csv-path ${csv_file_mob} --images-dir ${images_dir_dzk}/
 
-# передать в DataSphere код и зависимости pip
+# python: auto — DataSphere автоматически определяет версию Python и переносит в облако код и зависимости pip
 env:
   python: auto
 
-# Входные данные (файлы или каталоги)
-#inputs:
-#  - input_data.txt: INPUT_FILE  
+# Входные данные (обновите алиасы)
+inputs:
+  - data/processed_data/merged_data.csv: csv_file_mob
+  - data/raw_data/data/metadata/INC/united_image: images_dir_dzk
+  - src/models: models
+  - src/utils: utils
 
-# Выходные данные (файлы или каталоги, которые вернутся с облака)
-#outputs:
-#  - output_results.zip
-#  - logs.txt
+# Выходные данные
+outputs:
+  - optuna_study.pkl: optuna_study
+  - ocr_model_params.json: ocr_model_params
+  - test_results.json: test_results
 
 cloud-instance-type: gt4.1
\ No newline at end of file
diff --git a/notebooks/1_data_exploration/1_1_download_data.ipynb b/notebooks/1_data_exploration/1_1_download_data.ipynb
index 430accf..2f15849 100644
--- a/notebooks/1_data_exploration/1_1_download_data.ipynb
+++ b/notebooks/1_data_exploration/1_1_download_data.ipynb
@@ -615,6 +615,56 @@
     "               file_s3_dst='processed_data/merge_data.csv',\n",
     "               bucket_name='s3-dvc',)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64d2d0d0",
+   "metadata": {},
+   "source": [
+    "# Перемещаем все фотки в единую папку "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1f56653",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.mkdir(ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35417e5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "move_and_remove_files(source_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25', \n",
+    "                      destination_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image',\n",
+    "                      remove_after_move=True,)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b123a07e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "move_and_remove_files(source_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/19-001_gin_garbage_echd_19.08.25', \n",
+    "                      destination_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image',\n",
+    "                      remove_after_move=True,)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b193a00",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/notebooks/1_data_exploration/1_2_prepare_data.ipynb b/notebooks/1_data_exploration/1_2_prepare_data.ipynb
index 605b691..7956bd0 100644
--- a/notebooks/1_data_exploration/1_2_prepare_data.ipynb
+++ b/notebooks/1_data_exploration/1_2_prepare_data.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "id": "97feaebc",
    "metadata": {},
    "outputs": [],
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "cd10fe31",
    "metadata": {},
    "outputs": [],
@@ -73,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "id": "e7ef1558",
    "metadata": {},
    "outputs": [],
@@ -100,8 +100,8 @@
    "source": [
     "# Создание датасета с автоматическим разделением\n",
     "dataset = PrepareData(\n",
-    "    excel_path=ROOT_DIR / \"data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25.xlsx\",\n",
-    "    images_dir=ROOT_DIR / \"data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25/\",\n",
+    "    csv_path=ROOT_DIR / \"data/processed_data/merged_data.csv\",\n",
+    "    images_dir=ROOT_DIR / \"data/raw_data/data/metadata/INC/united_image/\",\n",
     "    test_size=0.2,\n",
     "    random_state=42\n",
     ")\n",
@@ -128,7 +128,18 @@
    "execution_count": null,
    "id": "5fb26062",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best ROI: left_bottom | conf=0.93\n",
+      "joined: MMChdUZAO_112301\n",
+      "norm  : MMC_hd_UZAO_112301\n",
+      "final : MMC_hd_UZAO_1123_0_1\n"
+     ]
+    }
+   ],
    "source": [
     "import cv2\n",
     "import numpy as np\n",
@@ -329,7 +340,7 @@
     "# ===================== пример использования =====================\n",
     "\n",
     "if __name__ == \"__main__\":\n",
-    "    IMG = r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/19-001_gin_garbage_echd_19.08.25/ffccf36c-075a-43d9-8570-a01f3afcaf76.jpg'\n",
+    "    IMG = r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/united_image/0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'\n",
     "\n",
     "    ocr = OverlayOCR(\n",
     "        langs=['en'],          # ['en','ru'] если нужна кириллица\n",
@@ -353,63 +364,284 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d56cc01f",
+   "id": "99b46a62",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "299610c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2025-09-25 20:53:37,368] A new study created in memory with name: no-name-ad852449-0235-45d1-a60b-e535dcbb5227\n",
+      "[I 2025-09-25 20:53:47,455] Trial 0 finished with value: 3.0 and parameters: {'gap_mult': 2.8068367308375892, 'canvas_size': 3600, 'mag_ratio': 2.8965996021592537, 'add_margin': 0.48089093342192746, 'text_threshold': 0.30768847754184525, 'low_text': 0.5186617904372725, 'link_threshold': 0.3295188034326136}. Best is trial 0 with value: 3.0.\n",
+      "[I 2025-09-25 20:53:56,705] Trial 1 finished with value: 3.0 and parameters: {'gap_mult': 0.4616970821705757, 'canvas_size': 7200, 'mag_ratio': 3.9151446432090062, 'add_margin': 0.29848539656963197, 'text_threshold': 0.7381783711907166, 'low_text': 0.6797473138005795, 'link_threshold': 0.31919910593867973}. Best is trial 0 with value: 3.0.\n",
+      "[I 2025-09-25 20:54:00,930] Trial 2 finished with value: 1.0 and parameters: {'gap_mult': 2.95068046948933, 'canvas_size': 1800, 'mag_ratio': 1.6335501479294163, 'add_margin': 0.23019075051561422, 'text_threshold': 0.5364046793357755, 'low_text': 0.14346526109875232, 'link_threshold': 0.4549901859491947}. Best is trial 2 with value: 1.0.\n",
+      "[I 2025-09-25 20:54:03,561] Trial 3 finished with value: 2.0 and parameters: {'gap_mult': 0.7098592466029646, 'canvas_size': 1800, 'mag_ratio': 4.337665270375808, 'add_margin': 0.2092324992312398, 'text_threshold': 0.5999943848050431, 'low_text': 0.5505604402067924, 'link_threshold': 0.3594102577222953}. Best is trial 2 with value: 1.0.\n",
+      "[I 2025-09-25 20:54:11,619] Trial 4 finished with value: 8.0 and parameters: {'gap_mult': 2.6082037341148663, 'canvas_size': 10000, 'mag_ratio': 3.858049154855571, 'add_margin': 0.023438922614506454, 'text_threshold': 0.7258377412928896, 'low_text': 0.3718230280903212, 'link_threshold': 0.41824734965498844}. Best is trial 2 with value: 1.0.\n",
+      "[I 2025-09-25 20:54:14,502] Trial 5 finished with value: 6.0 and parameters: {'gap_mult': 2.6706992390768276, 'canvas_size': 3600, 'mag_ratio': 3.235255263782191, 'add_margin': 0.26475654378958785, 'text_threshold': 0.4536236374494962, 'low_text': 0.5543332082573351, 'link_threshold': 0.28821655261683743}. Best is trial 2 with value: 1.0.\n",
+      "[I 2025-09-25 20:54:17,267] Trial 6 finished with value: 2.0 and parameters: {'gap_mult': 0.582283392877384, 'canvas_size': 7200, 'mag_ratio': 2.7611406531446723, 'add_margin': 0.38970235970730355, 'text_threshold': 0.7295818110214943, 'low_text': 0.6309958719665154, 'link_threshold': 0.1740186779693806}. Best is trial 2 with value: 1.0.\n",
+      "[I 2025-09-25 20:54:19,088] Trial 7 finished with value: 0.0 and parameters: {'gap_mult': 0.1148899697656891, 'canvas_size': 1800, 'mag_ratio': 2.0909813860745077, 'add_margin': 0.38880953861561235, 'text_threshold': 0.5046870997550761, 'low_text': 0.2758043304152779, 'link_threshold': 0.4121455607297143}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:21,489] Trial 8 finished with value: 6.0 and parameters: {'gap_mult': 0.306489544901438, 'canvas_size': 3600, 'mag_ratio': 2.431735944829139, 'add_margin': 0.12216310769525666, 'text_threshold': 0.4190839052600482, 'low_text': 0.27219395882445296, 'link_threshold': 0.37805487357277634}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:23,559] Trial 9 finished with value: 6.0 and parameters: {'gap_mult': 0.7405645718677436, 'canvas_size': 1800, 'mag_ratio': 2.7070172489585387, 'add_margin': 0.4140575014567742, 'text_threshold': 0.6300710361851376, 'low_text': 0.33516922393069803, 'link_threshold': 0.17953131386104623}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:25,982] Trial 10 finished with value: 0.0 and parameters: {'gap_mult': 1.4093994989267604, 'canvas_size': 10000, 'mag_ratio': 1.711106955826645, 'add_margin': 0.34762496219028105, 'text_threshold': 0.3909935170351837, 'low_text': 0.11283459845938454, 'link_threshold': 0.6099373327834569}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:27,935] Trial 11 finished with value: 0.0 and parameters: {'gap_mult': 1.4792872699039796, 'canvas_size': 10000, 'mag_ratio': 1.5223010354107902, 'add_margin': 0.33563162563951027, 'text_threshold': 0.37565955475152524, 'low_text': 0.1168980126968841, 'link_threshold': 0.6325641683059166}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:30,647] Trial 12 finished with value: 0.0 and parameters: {'gap_mult': 1.363569314546833, 'canvas_size': 10000, 'mag_ratio': 2.1897337958053624, 'add_margin': 0.469594695826354, 'text_threshold': 0.48388464532737874, 'low_text': 0.21658316372857112, 'link_threshold': 0.5597509755711151}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:32,654] Trial 13 finished with value: 0.0 and parameters: {'gap_mult': 2.0469309820631105, 'canvas_size': 1800, 'mag_ratio': 2.006596797142189, 'add_margin': 0.3626671511083453, 'text_threshold': 0.36655356940398864, 'low_text': 0.22865266846485077, 'link_threshold': 0.5216438095511898}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:37,208] Trial 14 finished with value: 6.0 and parameters: {'gap_mult': 1.1069357731698148, 'canvas_size': 10000, 'mag_ratio': 1.8969889704811436, 'add_margin': 0.42390499933071407, 'text_threshold': 0.5157168099224722, 'low_text': 0.10164163381677017, 'link_threshold': 0.6704962998573097}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:40,128] Trial 15 finished with value: 2.0 and parameters: {'gap_mult': 1.8996243187385848, 'canvas_size': 1800, 'mag_ratio': 4.870066126775307, 'add_margin': 0.32483866469189737, 'text_threshold': 0.585824466151432, 'low_text': 0.4191998906943226, 'link_threshold': 0.5162837033151318}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:42,649] Trial 16 finished with value: 6.0 and parameters: {'gap_mult': 0.15078914260503726, 'canvas_size': 10000, 'mag_ratio': 2.336137777441351, 'add_margin': 0.15215492650074974, 'text_threshold': 0.41400283625495554, 'low_text': 0.18781709662258905, 'link_threshold': 0.5934394365460154}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:44,730] Trial 17 finished with value: 8.0 and parameters: {'gap_mult': 2.013170127993129, 'canvas_size': 7200, 'mag_ratio': 1.8268150446726152, 'add_margin': 0.448717840682068, 'text_threshold': 0.31527561348760785, 'low_text': 0.30206312727673806, 'link_threshold': 0.4580136822784667}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:50,832] Trial 18 finished with value: 7.0 and parameters: {'gap_mult': 1.059265015642122, 'canvas_size': 10000, 'mag_ratio': 3.336471276803885, 'add_margin': 0.282447622846455, 'text_threshold': 0.45135572176395305, 'low_text': 0.4253467296918087, 'link_threshold': 0.26399052554383395}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:53,151] Trial 19 finished with value: 6.0 and parameters: {'gap_mult': 2.381889433799428, 'canvas_size': 1800, 'mag_ratio': 2.455352248219552, 'add_margin': 0.3617385824967006, 'text_threshold': 0.5375199556741124, 'low_text': 0.17247623825345915, 'link_threshold': 0.5205447782602558}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:54:58,417] Trial 20 finished with value: 0.0 and parameters: {'gap_mult': 1.7314693373323178, 'canvas_size': 10000, 'mag_ratio': 3.1603467746819867, 'add_margin': 0.17358331235688695, 'text_threshold': 0.6662549037510657, 'low_text': 0.27133518926858646, 'link_threshold': 0.6857614766409486}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:00,525] Trial 21 finished with value: 0.0 and parameters: {'gap_mult': 1.4760274400451803, 'canvas_size': 10000, 'mag_ratio': 1.505120282497312, 'add_margin': 0.33500238184214015, 'text_threshold': 0.3640023294357934, 'low_text': 0.10303266997568411, 'link_threshold': 0.6226069867707704}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:02,569] Trial 22 finished with value: 3.0 and parameters: {'gap_mult': 1.0719868998063475, 'canvas_size': 10000, 'mag_ratio': 1.5514445085936461, 'add_margin': 0.38165340541644804, 'text_threshold': 0.36735195851134994, 'low_text': 0.15146302969794537, 'link_threshold': 0.6318307720851756}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:04,977] Trial 23 finished with value: 0.0 and parameters: {'gap_mult': 1.2827856065009335, 'canvas_size': 10000, 'mag_ratio': 2.0522093394602043, 'add_margin': 0.33364441087307467, 'text_threshold': 0.4009411598080816, 'low_text': 0.22636549446097454, 'link_threshold': 0.6997776020712903}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:07,408] Trial 24 finished with value: 6.0 and parameters: {'gap_mult': 1.688073858124835, 'canvas_size': 10000, 'mag_ratio': 1.8147929044605347, 'add_margin': 0.41187921882338346, 'text_threshold': 0.48018628231507815, 'low_text': 0.14505741985334714, 'link_threshold': 0.5725158142189293}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:12,225] Trial 25 finished with value: 3.0 and parameters: {'gap_mult': 2.2273244859507426, 'canvas_size': 1800, 'mag_ratio': 2.1619788064541146, 'add_margin': 0.29680817784938923, 'text_threshold': 0.34280209793757394, 'low_text': 0.3598483802527688, 'link_threshold': 0.47532698356188297}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:14,578] Trial 26 finished with value: 2.0 and parameters: {'gap_mult': 1.6266005039888947, 'canvas_size': 7200, 'mag_ratio': 1.8054129677884196, 'add_margin': 0.4967591135107159, 'text_threshold': 0.40062455240630057, 'low_text': 0.27142466964041806, 'link_threshold': 0.6268694818093563}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:17,799] Trial 27 finished with value: 0.0 and parameters: {'gap_mult': 0.8716046775322024, 'canvas_size': 3600, 'mag_ratio': 2.591535568961233, 'add_margin': 0.23398228930290385, 'text_threshold': 0.45887982369469826, 'low_text': 0.4700297698310453, 'link_threshold': 0.40555964970795816}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:20,157] Trial 28 finished with value: 3.0 and parameters: {'gap_mult': 0.12634067763172063, 'canvas_size': 10000, 'mag_ratio': 1.687707077278028, 'add_margin': 0.44168182474903117, 'text_threshold': 0.5678991209026768, 'low_text': 0.19168263433456684, 'link_threshold': 0.4849722942730099}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:22,844] Trial 29 finished with value: 0.0 and parameters: {'gap_mult': 0.917887107858529, 'canvas_size': 3600, 'mag_ratio': 2.982888945913079, 'add_margin': 0.3144487437683528, 'text_threshold': 0.32134529399419554, 'low_text': 0.11131753239257092, 'link_threshold': 0.6478445771500324}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:24,929] Trial 30 finished with value: 6.0 and parameters: {'gap_mult': 1.2354572298871604, 'canvas_size': 1800, 'mag_ratio': 2.192116830092118, 'add_margin': 0.3752636196109105, 'text_threshold': 0.49763505040728695, 'low_text': 0.3099171209105759, 'link_threshold': 0.5630321790923893}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:27,347] Trial 31 finished with value: 0.0 and parameters: {'gap_mult': 1.4152985716317703, 'canvas_size': 10000, 'mag_ratio': 2.271704485180925, 'add_margin': 0.46402968781748494, 'text_threshold': 0.43744132250758155, 'low_text': 0.22150449012133064, 'link_threshold': 0.5866215734802223}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:29,694] Trial 32 finished with value: 3.0 and parameters: {'gap_mult': 1.3470046288290496, 'canvas_size': 10000, 'mag_ratio': 2.019964977098176, 'add_margin': 0.4809199347109301, 'text_threshold': 0.48927782801688746, 'low_text': 0.2061179495820208, 'link_threshold': 0.5576719211378318}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:31,811] Trial 33 finished with value: 3.0 and parameters: {'gap_mult': 0.42628313834266474, 'canvas_size': 10000, 'mag_ratio': 1.5404390532162884, 'add_margin': 0.40748830793442736, 'text_threshold': 0.38476933858540535, 'low_text': 0.1395917836080438, 'link_threshold': 0.6056158927267469}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:34,106] Trial 34 finished with value: 6.0 and parameters: {'gap_mult': 1.5326447367463771, 'canvas_size': 10000, 'mag_ratio': 1.72870248378863, 'add_margin': 0.34528521463794626, 'text_threshold': 0.5150794337111221, 'low_text': 0.24899664426731927, 'link_threshold': 0.6577784494576011}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:46,792] Trial 35 finished with value: 3.0 and parameters: {'gap_mult': 1.8110834294493454, 'canvas_size': 7200, 'mag_ratio': 3.6702375016738, 'add_margin': 0.4364732116125293, 'text_threshold': 0.537895085676295, 'low_text': 0.16136650600797486, 'link_threshold': 0.5402750834526241}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:50,265] Trial 36 finished with value: 6.0 and parameters: {'gap_mult': 1.2219988072354946, 'canvas_size': 10000, 'mag_ratio': 2.482952487867, 'add_margin': 0.26693338583603277, 'text_threshold': 0.4302885444872687, 'low_text': 0.13132228009588398, 'link_threshold': 0.32944012665242317}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:55:52,689] Trial 37 finished with value: 0.0 and parameters: {'gap_mult': 2.38129567347274, 'canvas_size': 1800, 'mag_ratio': 2.8968484899989493, 'add_margin': 0.4686449723543468, 'text_threshold': 0.302028768728002, 'low_text': 0.19115299871272867, 'link_threshold': 0.37876224726940994}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:09,392] Trial 38 finished with value: 6.0 and parameters: {'gap_mult': 2.8763678218532216, 'canvas_size': 10000, 'mag_ratio': 4.438316910484946, 'add_margin': 0.29829477938301685, 'text_threshold': 0.4700875822689006, 'low_text': 0.25353183963236203, 'link_threshold': 0.4213216239998583}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:12,401] Trial 39 finished with value: 2.0 and parameters: {'gap_mult': 0.6026962750957594, 'canvas_size': 3600, 'mag_ratio': 2.0123631514680036, 'add_margin': 0.3923298820411264, 'text_threshold': 0.5656381172012847, 'low_text': 0.6985243561080292, 'link_threshold': 0.44513454191041124}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:15,008] Trial 40 finished with value: 6.0 and parameters: {'gap_mult': 1.552605334791691, 'canvas_size': 1800, 'mag_ratio': 2.1836186583312256, 'add_margin': 0.24118715165087418, 'text_threshold': 0.34225003730198067, 'low_text': 0.30669141039365844, 'link_threshold': 0.29277314219252115}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:19,676] Trial 41 finished with value: 0.0 and parameters: {'gap_mult': 2.032777215476117, 'canvas_size': 1800, 'mag_ratio': 1.9646027862079287, 'add_margin': 0.36009363571123254, 'text_threshold': 0.36392934369845953, 'low_text': 0.22829120356983643, 'link_threshold': 0.5138704932472115}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:21,678] Trial 42 finished with value: 1.0 and parameters: {'gap_mult': 2.176387089985841, 'canvas_size': 1800, 'mag_ratio': 1.7019410486508, 'add_margin': 0.3540410973587066, 'text_threshold': 0.38904691450406964, 'low_text': 0.173755568533145, 'link_threshold': 0.5359395965294611}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:24,049] Trial 43 finished with value: 0.0 and parameters: {'gap_mult': 1.9814160602074264, 'canvas_size': 1800, 'mag_ratio': 2.6618241803684515, 'add_margin': 0.0866096719455687, 'text_threshold': 0.3297646130497309, 'low_text': 0.12249114396162103, 'link_threshold': 0.4813419467114378}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:26,529] Trial 44 finished with value: 6.0 and parameters: {'gap_mult': 2.413259320694561, 'canvas_size': 1800, 'mag_ratio': 1.9177687920847541, 'add_margin': 0.31469920463221435, 'text_threshold': 0.4377968866124756, 'low_text': 0.3696136534062048, 'link_threshold': 0.607201945666187}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:29,283] Trial 45 finished with value: 0.0 and parameters: {'gap_mult': 2.64463349486065, 'canvas_size': 7200, 'mag_ratio': 2.3460275763329173, 'add_margin': 0.39700866313058947, 'text_threshold': 0.3774153417241256, 'low_text': 0.24211527857587653, 'link_threshold': 0.3648129541396672}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:31,494] Trial 46 finished with value: 3.0 and parameters: {'gap_mult': 1.836878721870324, 'canvas_size': 1800, 'mag_ratio': 2.126890360324983, 'add_margin': 0.20726513840245164, 'text_threshold': 0.4137371364867357, 'low_text': 0.3342805619353377, 'link_threshold': 0.3978205666085285}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:33,911] Trial 47 finished with value: 2.0 and parameters: {'gap_mult': 0.9236009105572118, 'canvas_size': 10000, 'mag_ratio': 1.7057803405365435, 'add_margin': 0.36889361538736837, 'text_threshold': 0.35131987552000854, 'low_text': 0.6331189497946297, 'link_threshold': 0.5031830624015114}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:36,036] Trial 48 finished with value: 0.0 and parameters: {'gap_mult': 0.7075348045987095, 'canvas_size': 1800, 'mag_ratio': 1.8754699231214058, 'add_margin': 0.4201108898173849, 'text_threshold': 0.6401314045523575, 'low_text': 0.20134221483850864, 'link_threshold': 0.5473098961171385}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:43,775] Trial 49 finished with value: 6.0 and parameters: {'gap_mult': 1.4055293711987713, 'canvas_size': 10000, 'mag_ratio': 3.4829554961756273, 'add_margin': 0.2727130551864318, 'text_threshold': 0.5107363738699169, 'low_text': 0.2931375550010872, 'link_threshold': 0.5858997098239191}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:46,294] Trial 50 finished with value: 16.0 and parameters: {'gap_mult': 0.28271106449673233, 'canvas_size': 3600, 'mag_ratio': 1.6064784759584878, 'add_margin': 0.49696348828068293, 'text_threshold': 0.5542307215159156, 'low_text': 0.1614448436114824, 'link_threshold': 0.4435283721263239}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:56:54,785] Trial 51 finished with value: 6.0 and parameters: {'gap_mult': 1.6865773792850522, 'canvas_size': 10000, 'mag_ratio': 3.0544204754949122, 'add_margin': 0.1930166327695941, 'text_threshold': 0.6943934752754893, 'low_text': 0.2687102769555533, 'link_threshold': 0.6833448732377796}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:05,363] Trial 52 finished with value: 6.0 and parameters: {'gap_mult': 1.7335815942306727, 'canvas_size': 10000, 'mag_ratio': 4.0263794400950585, 'add_margin': 0.16928223850489543, 'text_threshold': 0.7483832969099119, 'low_text': 0.3418904310684578, 'link_threshold': 0.6450346895704159}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:24,354] Trial 53 finished with value: 9.0 and parameters: {'gap_mult': 2.0730357287645598, 'canvas_size': 10000, 'mag_ratio': 4.9661237759392405, 'add_margin': 0.06628189141235036, 'text_threshold': 0.6993694651671535, 'low_text': 0.39641751411068, 'link_threshold': 0.6913533032142294}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:29,650] Trial 54 finished with value: 6.0 and parameters: {'gap_mult': 1.8378208995218732, 'canvas_size': 10000, 'mag_ratio': 2.8058557292492172, 'add_margin': 0.14877145505589487, 'text_threshold': 0.6067678630149258, 'low_text': 0.28406293177241143, 'link_threshold': 0.6084234792788263}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:36,053] Trial 55 finished with value: 6.0 and parameters: {'gap_mult': 1.6077956791294892, 'canvas_size': 10000, 'mag_ratio': 3.184685723255018, 'add_margin': 0.33362136342078796, 'text_threshold': 0.6586094330040534, 'low_text': 0.47617920380466766, 'link_threshold': 0.6612731611178574}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:39,722] Trial 56 finished with value: 6.0 and parameters: {'gap_mult': 1.140093656057085, 'canvas_size': 10000, 'mag_ratio': 2.5390512016510867, 'add_margin': 0.43015222762309546, 'text_threshold': 0.45677922363281287, 'low_text': 0.22243443589468803, 'link_threshold': 0.6776769331383322}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:42,584] Trial 57 finished with value: 0.0 and parameters: {'gap_mult': 2.2173003288074886, 'canvas_size': 7200, 'mag_ratio': 2.30125078762444, 'add_margin': 0.31399758400455013, 'text_threshold': 0.407354067246686, 'low_text': 0.1741364232105632, 'link_threshold': 0.5703705499828124}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:57:45,367] Trial 58 finished with value: 6.0 and parameters: {'gap_mult': 1.4348361111451478, 'canvas_size': 1800, 'mag_ratio': 4.119267428393518, 'add_margin': 0.3921988549011919, 'text_threshold': 0.6108450546952728, 'low_text': 0.10297918293515232, 'link_threshold': 0.6423078633822412}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:05,731] Trial 59 finished with value: 19.0 and parameters: {'gap_mult': 1.3146190886260487, 'canvas_size': 10000, 'mag_ratio': 4.517561455513468, 'add_margin': 0.46035628899869246, 'text_threshold': 0.4989918282450292, 'low_text': 0.13611074435464954, 'link_threshold': 0.2370550150876778}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:08,652] Trial 60 finished with value: 0.0 and parameters: {'gap_mult': 2.4903957108815717, 'canvas_size': 10000, 'mag_ratio': 1.7991544479501766, 'add_margin': 0.28447853167616255, 'text_threshold': 0.43034949021994956, 'low_text': 0.32262981212166886, 'link_threshold': 0.6261847736969334}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:11,166] Trial 61 finished with value: 0.0 and parameters: {'gap_mult': 1.5420067502330739, 'canvas_size': 10000, 'mag_ratio': 1.6027903475171934, 'add_margin': 0.3483023224854118, 'text_threshold': 0.36393439736200833, 'low_text': 0.10422663391107893, 'link_threshold': 0.6208477423703296}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:13,672] Trial 62 finished with value: 3.0 and parameters: {'gap_mult': 1.9141280029919703, 'canvas_size': 10000, 'mag_ratio': 1.5213430371729078, 'add_margin': 0.3347560723493002, 'text_threshold': 0.3330972928489088, 'low_text': 0.12091536219081386, 'link_threshold': 0.6690028802757306}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:15,964] Trial 63 finished with value: 0.0 and parameters: {'gap_mult': 1.4423427725511269, 'canvas_size': 10000, 'mag_ratio': 1.5108889381954187, 'add_margin': 0.3806898775360643, 'text_threshold': 0.38740713862944387, 'low_text': 0.15382441727496243, 'link_threshold': 0.5866630175683626}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:18,364] Trial 64 finished with value: 6.0 and parameters: {'gap_mult': 1.734875836674064, 'canvas_size': 10000, 'mag_ratio': 1.7948695199875504, 'add_margin': 0.011307090053004093, 'text_threshold': 0.3507411762015381, 'low_text': 0.20246618057597243, 'link_threshold': 0.601242330820969}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:20,854] Trial 65 finished with value: 2.0 and parameters: {'gap_mult': 1.1757405169069743, 'canvas_size': 10000, 'mag_ratio': 2.086165783438007, 'add_margin': 0.2456882382996693, 'text_threshold': 0.4741634337377318, 'low_text': 0.2684493145468628, 'link_threshold': 0.6333830867101429}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:23,446] Trial 66 finished with value: 0.0 and parameters: {'gap_mult': 1.0583138290250695, 'canvas_size': 3600, 'mag_ratio': 1.9833204433007152, 'add_margin': 0.4078906425246266, 'text_threshold': 0.37507193446364606, 'low_text': 0.17773740261098023, 'link_threshold': 0.6550365183841861}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:25,700] Trial 67 finished with value: 0.0 and parameters: {'gap_mult': 2.1113828186904646, 'canvas_size': 1800, 'mag_ratio': 1.627737129670728, 'add_margin': 0.31184312655078883, 'text_threshold': 0.5238607673015194, 'low_text': 0.24788455119303154, 'link_threshold': 0.6997993963347023}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:28,419] Trial 68 finished with value: 0.0 and parameters: {'gap_mult': 0.9773717590997882, 'canvas_size': 10000, 'mag_ratio': 2.4010170219452776, 'add_margin': 0.21829639411230875, 'text_threshold': 0.30988261906742653, 'low_text': 0.1398668395141619, 'link_threshold': 0.5298214659650317}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:31,028] Trial 69 finished with value: 0.0 and parameters: {'gap_mult': 1.619967386716393, 'canvas_size': 7200, 'mag_ratio': 1.8845116539294495, 'add_margin': 0.37112199407447244, 'text_threshold': 0.42276720376960203, 'low_text': 0.21533980924102802, 'link_threshold': 0.5734968318657384}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:33,749] Trial 70 finished with value: 3.0 and parameters: {'gap_mult': 2.751326216417243, 'canvas_size': 10000, 'mag_ratio': 2.2278059789258613, 'add_margin': 0.4494222828148234, 'text_threshold': 0.4452982706384931, 'low_text': 0.11598893420937323, 'link_threshold': 0.49921300401957514}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:38,611] Trial 71 finished with value: 0.0 and parameters: {'gap_mult': 1.2993737247556747, 'canvas_size': 10000, 'mag_ratio': 2.043840714353784, 'add_margin': 0.3350083247295932, 'text_threshold': 0.35748588299055994, 'low_text': 0.2265551804542083, 'link_threshold': 0.6748472701551019}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:40,711] Trial 72 finished with value: 0.0 and parameters: {'gap_mult': 1.487951326924774, 'canvas_size': 10000, 'mag_ratio': 1.7554868354126691, 'add_margin': 0.28706797466207484, 'text_threshold': 0.4008671166219604, 'low_text': 0.23519385817676244, 'link_threshold': 0.6174831773618225}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:43,253] Trial 73 finished with value: 0.0 and parameters: {'gap_mult': 1.287024339527743, 'canvas_size': 10000, 'mag_ratio': 1.6619019790647513, 'add_margin': 0.35253473628188264, 'text_threshold': 0.3995679960639146, 'low_text': 0.18665460608037232, 'link_threshold': 0.6997429613770489}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:49,743] Trial 74 finished with value: 0.0 and parameters: {'gap_mult': 0.7917739839237641, 'canvas_size': 1800, 'mag_ratio': 3.691191180464256, 'add_margin': 0.256227708458774, 'text_threshold': 0.588628250867445, 'low_text': 0.15526564029661177, 'link_threshold': 0.6388558339492125}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:52,332] Trial 75 finished with value: 6.0 and parameters: {'gap_mult': 1.3836209109248103, 'canvas_size': 10000, 'mag_ratio': 1.9303637543024748, 'add_margin': 0.3266009581459893, 'text_threshold': 0.32642898614871146, 'low_text': 0.26056379346676267, 'link_threshold': 0.5581534261963809}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:54,961] Trial 76 finished with value: 2.0 and parameters: {'gap_mult': 1.2096224319499855, 'canvas_size': 1800, 'mag_ratio': 2.1238326363224926, 'add_margin': 0.3017380262152115, 'text_threshold': 0.37394001822801004, 'low_text': 0.28678593358242277, 'link_threshold': 0.6599286178042858}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:58:57,430] Trial 77 finished with value: 6.0 and parameters: {'gap_mult': 1.7802378509775458, 'canvas_size': 10000, 'mag_ratio': 1.8816946343259815, 'add_margin': 0.36388919602950753, 'text_threshold': 0.34090268515882893, 'low_text': 0.5479980945347005, 'link_threshold': 0.3892473209789098}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:00,036] Trial 78 finished with value: 6.0 and parameters: {'gap_mult': 1.6665504567091305, 'canvas_size': 3600, 'mag_ratio': 2.054080103730055, 'add_margin': 0.3410899908917761, 'text_threshold': 0.5498926207140523, 'low_text': 0.2100280220064789, 'link_threshold': 0.35186590062085316}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:02,646] Trial 79 finished with value: 6.0 and parameters: {'gap_mult': 2.2858125900796873, 'canvas_size': 10000, 'mag_ratio': 2.598654169175222, 'add_margin': 0.1096485806039654, 'text_threshold': 0.39632565711047857, 'low_text': 0.13232194942047554, 'link_threshold': 0.46478059695144425}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:04,819] Trial 80 finished with value: 6.0 and parameters: {'gap_mult': 1.9478604127774308, 'canvas_size': 1800, 'mag_ratio': 1.7354733093971937, 'add_margin': 0.3808447412146811, 'text_threshold': 0.48479170584830067, 'low_text': 0.3556634621163809, 'link_threshold': 0.5922711205439385}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:07,509] Trial 81 finished with value: 0.0 and parameters: {'gap_mult': 0.23671399010645872, 'canvas_size': 3600, 'mag_ratio': 2.2341864972807914, 'add_margin': 0.1768337152281278, 'text_threshold': 0.5006948515280124, 'low_text': 0.43160854679965444, 'link_threshold': 0.44315830511907794}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:12,406] Trial 82 finished with value: 0.0 and parameters: {'gap_mult': 0.48961947690267943, 'canvas_size': 3600, 'mag_ratio': 2.4105107256619434, 'add_margin': 0.22217208323937157, 'text_threshold': 0.4656720325400188, 'low_text': 0.4844647385891675, 'link_threshold': 0.4065883472444202}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:15,689] Trial 83 finished with value: 3.0 and parameters: {'gap_mult': 0.7997340915545504, 'canvas_size': 3600, 'mag_ratio': 3.329512876745448, 'add_margin': 0.47893630067320886, 'text_threshold': 0.42471000078598803, 'low_text': 0.4466149722453414, 'link_threshold': 0.3563209303791829}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:19,726] Trial 84 finished with value: 3.0 and parameters: {'gap_mult': 1.486979265948101, 'canvas_size': 3600, 'mag_ratio': 3.0996875437037343, 'add_margin': 0.1406713123567832, 'text_threshold': 0.41231281178387974, 'low_text': 0.3988143194327536, 'link_threshold': 0.6880342966933964}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:22,964] Trial 85 finished with value: 6.0 and parameters: {'gap_mult': 1.0932715433629328, 'canvas_size': 10000, 'mag_ratio': 2.7965070007470225, 'add_margin': 0.3037215646374164, 'text_threshold': 0.46391666668681963, 'low_text': 0.5906737765487706, 'link_threshold': 0.43021720939102975}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:25,413] Trial 86 finished with value: 6.0 and parameters: {'gap_mult': 1.5809283981134732, 'canvas_size': 1800, 'mag_ratio': 2.903098445629802, 'add_margin': 0.323204629547763, 'text_threshold': 0.4475691348897395, 'low_text': 0.4917204876920026, 'link_threshold': 0.409723948089749}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:28,208] Trial 87 finished with value: 7.0 and parameters: {'gap_mult': 1.0241737060926785, 'canvas_size': 7200, 'mag_ratio': 2.170019590734165, 'add_margin': 0.39982023963456487, 'text_threshold': 0.38733220577207395, 'low_text': 0.5104734664626996, 'link_threshold': 0.4278163892556177}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:30,772] Trial 88 finished with value: 6.0 and parameters: {'gap_mult': 1.268613828376473, 'canvas_size': 10000, 'mag_ratio': 2.501792093226784, 'add_margin': 0.19746104473498233, 'text_threshold': 0.533720206705651, 'low_text': 0.3001198513836001, 'link_threshold': 0.547632988263816}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:32,882] Trial 89 finished with value: 3.0 and parameters: {'gap_mult': 1.3478705357481844, 'canvas_size': 3600, 'mag_ratio': 1.5703952619051842, 'add_margin': 0.2383571152444115, 'text_threshold': 0.4911376231475963, 'low_text': 0.3204541870778804, 'link_threshold': 0.6127908600439828}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:35,365] Trial 90 finished with value: 0.0 and parameters: {'gap_mult': 1.490753514396979, 'canvas_size': 10000, 'mag_ratio': 2.6871785603774363, 'add_margin': 0.2705280840013191, 'text_threshold': 0.35332971931310364, 'low_text': 0.19192155927759122, 'link_threshold': 0.650409989571328}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:37,984] Trial 91 finished with value: 18.0 and parameters: {'gap_mult': 0.6059944269888327, 'canvas_size': 3600, 'mag_ratio': 3.057057959857495, 'add_margin': 0.35778874256558385, 'text_threshold': 0.3212941201056074, 'low_text': 0.1174286089721853, 'link_threshold': 0.6696455061067778}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:40,618] Trial 92 finished with value: 6.0 and parameters: {'gap_mult': 0.3775662373815719, 'canvas_size': 3600, 'mag_ratio': 2.925896099570541, 'add_margin': 0.34378187458078385, 'text_threshold': 0.30183448603810026, 'low_text': 0.16525888651370374, 'link_threshold': 0.6300161950690972}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:43,262] Trial 93 finished with value: 0.0 and parameters: {'gap_mult': 0.8521897751266599, 'canvas_size': 3600, 'mag_ratio': 3.4865557187596905, 'add_margin': 0.2894311604971045, 'text_threshold': 0.3717456107531875, 'low_text': 0.11135697155385867, 'link_threshold': 0.601458513802612}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:47,807] Trial 94 finished with value: 0.0 and parameters: {'gap_mult': 0.6891387968747075, 'canvas_size': 3600, 'mag_ratio': 2.3574090634090874, 'add_margin': 0.3717055471560436, 'text_threshold': 0.5113421563825482, 'low_text': 0.1454440500741882, 'link_threshold': 0.5800254192403381}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:50,163] Trial 95 finished with value: 0.0 and parameters: {'gap_mult': 0.9404696206232311, 'canvas_size': 10000, 'mag_ratio': 2.5879800085298745, 'add_margin': 0.3178008713854836, 'text_threshold': 0.3188847847129784, 'low_text': 0.10302789858767883, 'link_threshold': 0.6469583229932883}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:52,175] Trial 96 finished with value: 2.0 and parameters: {'gap_mult': 1.1576708263622413, 'canvas_size': 1800, 'mag_ratio': 3.16407741524424, 'add_margin': 0.32888131722555225, 'text_threshold': 0.3388924348605037, 'low_text': 0.24394707724107206, 'link_threshold': 0.6660346756545124}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:54,166] Trial 97 finished with value: 6.0 and parameters: {'gap_mult': 1.3644247440046287, 'canvas_size': 10000, 'mag_ratio': 1.8352085571892742, 'add_margin': 0.42619636062008526, 'text_threshold': 0.3800261620480223, 'low_text': 0.1296581027202471, 'link_threshold': 0.6787481334559138}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:56,295] Trial 98 finished with value: 0.0 and parameters: {'gap_mult': 0.5284987720929559, 'canvas_size': 10000, 'mag_ratio': 1.967725610612323, 'add_margin': 0.3076465681849014, 'text_threshold': 0.3605602907368438, 'low_text': 0.14851815752665998, 'link_threshold': 0.6373355528450428}. Best is trial 7 with value: 0.0.\n",
+      "[I 2025-09-25 20:59:58,253] Trial 99 finished with value: 0.0 and parameters: {'gap_mult': 1.8840728444738277, 'canvas_size': 1800, 'mag_ratio': 3.44988791521599, 'add_margin': 0.17183901821211403, 'text_threshold': 0.4381693314913302, 'low_text': 0.18140874209964664, 'link_threshold': 0.4916185895177267}. Best is trial 7 with value: 0.0.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Лучшие параметры:\n",
+      "{'gap_mult': 0.1148899697656891, 'canvas_size': 1800, 'mag_ratio': 2.0909813860745077, 'add_margin': 0.38880953861561235, 'text_threshold': 0.5046870997550761, 'low_text': 0.2758043304152779, 'link_threshold': 0.4121455607297143}\n",
+      "Лучшее расстояние: 0.0\n"
+     ]
+    }
+   ],
    "source": [
-    "import requests\n",
-    "\n",
-    "# Базовый URL API портала открытых данных Москвы\n",
-    "base_url = \"https://api.data.mos.ru/v1/1498/rows\"\n",
+    "import optuna \n",
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "import itertools\n",
+    "import collections\n",
+    "\n",
+    "# Position-wise mismatch count with '-' padding (Hamming-style); despite the name, NOT a true Levenshtein edit distance\n",
+    "def levenshtein_distance(string1, string2):\n",
+    "    \"\"\"\n",
+    "    >>> levenshtein_distance('AATZ', 'AAAZ')\n",
+    "    1\n",
+    "    >>> levenshtein_distance('AATZZZ', 'AAAZ')\n",
+    "    3\n",
+    "    \"\"\"\n",
+    "    distance = 0\n",
+    "    if len(string1) < len(string2):\n",
+    "        string1, string2 = string2, string1\n",
+    "    \n",
+    "    # Заменяем itertools.izip_longest на zip_longest для Python 3\n",
+    "    from itertools import zip_longest\n",
+    "    for i, v in zip_longest(string1, string2, fillvalue='-'):\n",
+    "        if i != v:\n",
+    "            distance += 1\n",
+    "    return distance\n",
+    "\n",
+    "# Определите ROOT_DIR (добавьте ваш путь)\n",
+    "ROOT_DIR = Path('/home/lizardapn/Hack_digital/hack_digital_transformation')\n",
+    "\n",
+    "data = pd.read_csv(filepath_or_buffer=ROOT_DIR / 'data/processed_data/merged_data.csv')\n",
+    "\n",
+    "def objective(trial):\n",
+    "    params = {\n",
+    "        'gap_mult': trial.suggest_float('gap_mult', 0.1, 3.0),\n",
+    "        'canvas_size': trial.suggest_categorical('canvas_size', [1800, 3600, 7200, 10000]),\n",
+    "        'mag_ratio': trial.suggest_float('mag_ratio', 1.5, 5.0),\n",
+    "        'add_margin': trial.suggest_float('add_margin', 0.01, 0.5),\n",
+    "        'text_threshold': trial.suggest_float('text_threshold', 0.3, 0.75),  # Исправлено: text_treshold -> text_threshold\n",
+    "        'low_text': trial.suggest_float('low_text', 0.1, 0.7),\n",
+    "        'link_threshold': trial.suggest_float('link_threshold', 0.15, 0.7),\n",
+    "        'langs': ['en'], \n",
+    "        'gpu': True,\n",
+    "    }\n",
+    "    \n",
+    "    ocr = OverlayOCR(**params)\n",
     "\n",
-    "# Если требуется API-KEY, его нужно добавить в заголовки\n",
-    "headers = {\n",
-    "     \"api-key\": os.getenv('API_KEY_DATA_MOS') \n",
-    "}\n",
+    "    IMG_PATH = ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image/0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'\n",
     "\n",
-    "try:\n",
-    "    print('Начали')\n",
-    "    # Выполняем GET-запрос к API для получения списка датасетов\n",
-    "    response = requests.get(base_url, headers=headers)\n",
-    "    print(response)\n",
-    "    response.raise_for_status()  # Проверяем на ошибки HTTP\n",
-    "    print(response)\n",
-    "    # Парсим JSON-ответ\n",
-    "    datasets = response.json()\n",
+    "    # Получаем реальные ID для этого изображения\n",
+    "    real_ids = data[data['camera_id'] == '0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'].filename.values\n",
     "    \n",
-    "    # Выводим информацию о первых нескольких датасетах\n",
-    "    for i, dataset in enumerate(datasets[:5]):  # Ограничиваем вывод первыми 5\n",
-    "        print(f\"Датасет {i+1}: ID - {dataset.get('Id', 'N/A')}, Название - {dataset.get('Caption', 'N/A')}\")\n",
-    "\n",
-    "except requests.exceptions.RequestException as e:\n",
-    "    print(f\"Ошибка при выполнении запроса: {e}\")\n",
-    "except ValueError as e:\n",
-    "    print(f\"Ошибка при парсинге JSON: {e}\")"
+    "    # Распознаем текст с текущими параметрами\n",
+    "    final, norm, joined, conf, roi_name = ocr.run_on_image(str(IMG_PATH))\n",
+    "    \n",
+    "    # Вычисляем расстояния до всех реальных ID\n",
+    "    distances = []\n",
+    "    for real_id in real_ids:\n",
+    "        distance = levenshtein_distance(str(real_id), final)\n",
+    "        distances.append(distance)\n",
+    "    \n",
+    "    # Находим минимальное расстояние (наиболее близкий ID)\n",
+    "    if distances:\n",
+    "        min_distance = min(distances)\n",
+    "    else:\n",
+    "        min_distance = float('inf')  # Если нет реальных ID для сравнения\n",
+    "    \n",
+    "    return min_distance\n",
+    "\n",
+    "# Создаем и запускаем исследование\n",
+    "study = optuna.create_study(direction='minimize')\n",
+    "study.optimize(objective, n_trials=100)  # Можно указать количество trials\n",
+    "\n",
+    "# Выводим лучшие параметры\n",
+    "print(\"Лучшие параметры:\")\n",
+    "print(study.best_params)\n",
+    "print(f\"Лучшее расстояние: {study.best_value}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "d85e9c25",
+   "execution_count": 41,
+   "id": "6f887d35",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "best_params = study.best_params\n",
+    "\n",
+    "best_params.update(\n",
+    "            langs=['en'], \n",
+    "            gpu=True,\n",
+    "            )"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "7934bd0c",
+   "execution_count": 42,
+   "id": "a549736d",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'gap_mult': 0.1148899697656891,\n",
+       " 'canvas_size': 1800,\n",
+       " 'mag_ratio': 2.0909813860745077,\n",
+       " 'add_margin': 0.38880953861561235,\n",
+       " 'text_threshold': 0.5046870997550761,\n",
+       " 'low_text': 0.2758043304152779,\n",
+       " 'link_threshold': 0.4121455607297143,\n",
+       " 'langs': ['en'],\n",
+       " 'gpu': True}"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "best_params"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "33b6f8f8",
+   "execution_count": 46,
+   "id": "e0b6bb75",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best ROI: left_bottom | conf=0.57\n",
+      "joined: DUN_Ka_Vao_2_50151\n",
+      "norm  : DUN_Ka_Vao_2_50151\n",
+      "final : DUN_Ka_Vao_2_50151\n"
+     ]
+    }
+   ],
+   "source": [
+    "IMG = r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/united_image/ffeeecc1-407c-4976-a625-280e8c987f66.jpg'\n",
+    "\n",
+    "ocr = OverlayOCR(**best_params)\n",
+    "\n",
+    "final, norm, joined, conf, roi_name = ocr.run_on_image(IMG)\n",
+    "print(f\"Best ROI: {roi_name} | conf={conf:.2f}\")\n",
+    "print(\"joined:\", joined)\n",
+    "print(\"norm  :\", norm)\n",
+    "print(\"final :\", final)\n"
+   ]
   }
  ],
  "metadata": {
diff --git a/requirements.txt b/requirements.txt
index 0e60daa..8784e57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,110 +1,47 @@
-arrow==1.2.3
-attrs==23.2.0
-Automat==22.10.0
-Babel==2.10.3
-bcrypt==3.2.2
-binaryornot==0.4.4
-blinker==1.7.0
-build==1.0.3
-CacheControl==0.14.0
-certifi==2023.11.17
-chardet==5.2.0
-cleo==2.1.0
-click==8.1.6
-cloud-init==25.1.4
+Mako==1.3.10
+MarkupSafe==3.0.2
+PyYAML==6.0.2
+Pygments==2.19.2
+SQLAlchemy==2.0.43
+alembic==1.16.5
+charset-normalizer==3.4.3
+cmaes==0.12.0
 colorama==0.4.6
-command-not-found==0.3
-configobj==5.0.8
-constantly==23.10.4
-cookiecutter==2.6.0
-crashtest==0.4.1
-cryptography==41.0.7
-dbus-python==1.3.2
-distlib==0.3.8
-distro==1.9.0
-distro-info==1.7+build1
-dulwich==0.21.6
-fastimport==0.9.14
-fastjsonschema==2.19.0
-filelock==3.13.1
-h11==0.14.0
-httplib2==0.20.4
-hyperlink==21.0.0
-idna==3.6
-importlib-metadata==4.12.0
-incremental==22.10.0
-installer==0.7.0
-jaraco.classes==3.2.1
-jeepney==0.8.0
-Jinja2==3.1.2
-jsonpatch==1.32
-jsonpointer==2.0
-jsonschema==4.10.3
-keyring==24.3.1
-launchpadlib==1.11.0
-lazr.restfulclient==0.14.6
-lazr.uri==1.0.6
-lockfile==0.12.2
-markdown-it-py==3.0.0
-MarkupSafe==2.1.5
-mdurl==0.1.2
-more-itertools==10.2.0
-msgpack==1.0.3
-netifaces==0.11.0
-oauthlib==3.2.2
-packaging==24.0
-pexpect==4.9.0
-pkginfo==1.9.6
-platformdirs==4.2.0
-poetry==1.8.2
-poetry-core==1.9.0
-poetry-plugin-export==1.6.0
-ptyprocess==0.7.0
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pycurl==7.45.3
-Pygments==2.17.2
-PyGObject==3.48.2
-PyHamcrest==2.1.0
-PyJWT==2.7.0
-pylev==1.4.0
-pyOpenSSL==23.2.0
-pyparsing==3.1.1
-pyproject_hooks==1.0.0
-pyrsistent==0.20.0
-pyserial==3.5
-python-apt==2.7.7+ubuntu5
-python-dateutil==2.8.2
-python-magic==0.4.27
-python-slugify==8.0.4
-pytz==2024.1
-PyYAML==6.0.1
-requests==2.31.0
-requests-toolbelt==1.0.0
-rich==13.7.1
-ruamel.yaml==0.17.21
-ruamel.yaml.clib==0.2.8
-s3cmd==2.4.0
-SecretStorage==3.3.3
-service-identity==24.1.0
-setuptools==68.1.2
-shellingham==1.5.4
-six==1.16.0
-systemd-python==235
-toml==0.10.2
-tomlkit==0.12.4
-trove-classifiers==2024.1.31
-Twisted==24.3.0
-typing_extensions==4.10.0
-ubuntu-pro-client==8001
-unattended-upgrades==0.1
-Unidecode==1.3.8
-urllib3==2.0.7
-uvicorn==0.27.1
-uvloop==0.19.0
-virtualenv==20.25.0+ds
-wadllib==1.3.6
-wheel==0.42.0
-wsproto==1.2.0
-zipp==1.0.0
-zope.interface==6.1
+colorlog==6.9.0
+defusedxml==0.7.1
+easyocr==1.7.2
+googleapis-common-protos==1.70.0
+greenlet==3.2.4
+grpcio==1.75.0
+imageio==2.37.0
+importlib-metadata==6.11.0
+joblib==1.5.2
+mpmath==1.3.0
+networkx==3.5
+numpy==2.2.6
+opencv-python==4.12.0.88
+optuna==4.5.0
+packaging==23.2
+pandas==2.3.2
+pillow==11.3.0
+platformdirs==4.4.0
+protobuf==6.32.1
+psutil==7.1.0
+pyarrow==21.0.0
+pydot==4.0.1
+python-bidi==0.6.6
+python-dateutil==2.9.0.post0
+pytz==2025.2
+scikit-image==0.25.2
+scikit-learn==1.7.2
+scipy==1.16.2
+six==1.17.0
+sympy==1.14.0
+threadpoolctl==3.6.0
+tifffile==2025.9.20
+torch==2.8.0
+torchvision==0.23.0
+tqdm==4.67.1
+triton==3.4.0
+typing-extensions==4.15.0
+zipp==3.23.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index aa8aee6..22b1805 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,6 @@
         "scikit-learn>=1.0.0",
         "matplotlib>=3.5.0",
         "seaborn>=0.11.0",
-        "dvc>=2.0.0",
     ],
     extras_require={
         "dev": [
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..076a757
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,3 @@
+from . import utils
+
+__all__ = ['utils']
\ No newline at end of file
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
deleted file mode 100644
index 13fb721..0000000
--- a/src/data/make_dataset.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import logging
-import os
-import zipfile
-from io import BytesIO, StringIO
-
-import boto3
-import numpy as np
-import pandas as pd
-import yaml
-from sklearn.model_selection import train_test_split
-
-# Настройка логирования
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def load_data_from_s3(
-    bucket_name,
-    file_key,
-    endpoint_url=None,
-    aws_access_key_id=None,
-    aws_secret_access_key=None,
-    region_name="ru-central1",
-):
-    """Загрузка данных из S3-совместимого хранилища"""
-    logger.info(f"Загрузка данных из s3://{bucket_name}/{file_key}")
-
-    try:
-        s3 = boto3.client(
-            "s3",
-            endpoint_url=endpoint_url or "https://storage.yandexcloud.net",
-            aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
-            region_name=region_name,
-        )
-
-        # Загрузка данных
-        s3.download_file(bucket_name, file_s3_src, file_local)
-
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке данных из S3: {str(e)}")
-        raise
-
-
-def load_data(file_path):
-    """Загрузка данных из CSV файла (локально)"""
-    logger.info(f"Загрузка данных из {file_path}")
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"Файл данных не найден: {file_path}")
-    return pd.read_csv(file_path)
-
-
-def preprocess_data(df, numeric_features, categorical_features):
-    """Предобработка данных путем обработки пропущенных значений и кодирования категориальных признаков"""
-    logger.info("Предобработка данных")
-
-    # Обработка пропущенных значений
-    for col in numeric_features:
-        if col in df.columns:
-            df[col].fillna(df[col].median(), inplace=True)
-
-    for col in categorical_features:
-        if col in df.columns:
-            df[col].fillna("Неизвестно", inplace=True)
-
-    # Кодирование категориальных переменных
-    df_processed = pd.get_dummies(df, columns=categorical_features, drop_first=True)
-
-    return df_processed
-
-
-def split_data(df, target_column, test_size=0.2, random_state=42):
-    """Разделение данных на обучающую и тестовую выборки"""
-    logger.info("Разделение данных на обучающую и тестовую выборки")
-
-    if target_column not in df.columns:
-        raise ValueError(f"Целевой столбец '{target_column}' не найден в датафрейме")
-
-    X = df.drop(columns=[target_column])
-    y = df[target_column]
-
-    return train_test_split(X, y, test_size=test_size, random_state=random_state)
-
-
-def save_data(X_train, X_test, y_train, y_test, train_path, test_path):
-    """Сохранение обучающей и тестовой выборок в CSV файлы"""
-    logger.info(f"Сохранение обучающих данных в {train_path}")
-    logger.info(f"Сохранение тестовых данных в {test_path}")
-
-    # Создание директорий, если они не существуют
-    os.makedirs(os.path.dirname(train_path), exist_ok=True)
-    os.makedirs(os.path.dirname(test_path), exist_ok=True)
-
-    # Сохранение данных
-    X_train.to_csv(train_path, index=False)
-    X_test.to_csv(test_path, index=False)
-
-    # Сохранение целевых переменных
-    pd.DataFrame({"target": y_train}).to_csv(train_path.replace(".csv", "_target.csv"), index=False)
-    pd.DataFrame({"target": y_test}).to_csv(test_path.replace(".csv", "_target.csv"), index=False)
-
-
-def upload_to_s3(
-    local_file_path,
-    bucket_name,
-    s3_key,
-    endpoint_url=None,
-    aws_access_key_id=None,
-    aws_secret_access_key=None,
-    region_name="ru-central1",
-):
-    """Загрузка файла в S3-совместимое хранилище"""
-    logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}")
-
-    try:
-        # Создание клиента S3
-        s3 = boto3.client(
-            "s3",
-            endpoint_url=endpoint_url or f"https://storage.yandexcloud.net",
-            aws_access_key_id=aws_access_key_id,
-            aws_secret_access_key=aws_secret_access_key,
-            region_name=region_name,
-        )
-
-        # Загрузка файла
-        s3.upload_file(local_file_path, bucket_name, s3_key)
-        logger.info(f"Файл успешно загружен в S3: s3://{bucket_name}/{s3_key}")
-
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке файла в S3: {str(e)}")
-        raise
-
-
-def main():
-    """Главная функция для подготовки набора данных"""
-    # Загрузка конфигурации
-    with open("configs/config.yaml", "r") as f:
-        config = yaml.safe_load(f)
-
-    # Параметры из конфигурации
-    params = config
-
-    # Параметры S3 из переменных окружения
-    s3_bucket = os.getenv("S3_BUCKET", "s3-dvc")
-    s3_raw_data_key = os.getenv("S3_RAW_DATA_KEY", "Датасет.zip")
-    s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/")
-    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
-    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
-    endpoint_url = os.getenv("AWS_ENDPOINT_URL", "https://storage.yandexcloud.net")
-    region_name = os.getenv("AWS_REGION", "ru-central1")
-
-    try:
-        # Загрузка сырых данных из S3 или локально
-        if s3_bucket and aws_access_key_id and aws_secret_access_key:
-            df = load_data_from_s3(
-                s3_bucket, s3_raw_data_key, endpoint_url, aws_access_key_id, aws_secret_access_key, region_name
-            )
-        else:
-            raw_data_path = os.path.join(config["data"]["raw_path"], "data.csv")
-            df = load_data(raw_data_path)
-
-        # Предобработка данных
-        df_processed = preprocess_data(
-            df, params["features"]["numeric_features"], params["features"]["categorical_features"]
-        )
-
-        # Разделение данных
-        X_train, X_test, y_train, y_test = split_data(
-            df_processed, config["data"]["target_column"], params["data"]["test_size"], params["data"]["random_state"]
-        )
-
-        # Сохранение обработанных данных локально
-        train_path = os.path.join(config["data"]["processed_path"], config["data"]["train_file"])
-        test_path = os.path.join(config["data"]["processed_path"], config["data"]["test_file"])
-
-        save_data(X_train, X_test, y_train, y_test, train_path, test_path)
-
-        # Загрузка обработанных данных в S3, если указаны credentials
-        if s3_bucket and aws_access_key_id and aws_secret_access_key:
-            upload_to_s3(
-                train_path,
-                s3_bucket,
-                s3_processed_prefix + config["data"]["train_file"],
-                endpoint_url,
-                aws_access_key_id,
-                aws_secret_access_key,
-                region_name,
-            )
-            upload_to_s3(
-                test_path,
-                s3_bucket,
-                s3_processed_prefix + config["data"]["test_file"],
-                endpoint_url,
-                aws_access_key_id,
-                aws_secret_access_key,
-                region_name,
-            )
-            upload_to_s3(
-                train_path.replace(".csv", "_target.csv"),
-                s3_bucket,
-                s3_processed_prefix + config["data"]["train_file"].replace(".csv", "_target.csv"),
-                endpoint_url,
-                aws_access_key_id,
-                aws_secret_access_key,
-                region_name,
-            )
-            upload_to_s3(
-                test_path.replace(".csv", "_target.csv"),
-                s3_bucket,
-                s3_processed_prefix + config["data"]["test_file"].replace(".csv", "_target.csv"),
-                endpoint_url,
-                aws_access_key_id,
-                aws_secret_access_key,
-                region_name,
-            )
-
-        logger.info("Подготовка данных успешно завершена")
-
-    except Exception as e:
-        logger.error(f"Ошибка в подготовке данных: {str(e)}")
-        raise
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/data/prepare_data.py b/src/data/prepare_data.py
new file mode 100644
index 0000000..01ff08c
--- /dev/null
+++ b/src/data/prepare_data.py
@@ -0,0 +1,183 @@
+from pathlib import Path
+
+import pandas as pd
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader, Dataset
+
+
+class PrepareData(Dataset):
+    """
+    Dataset of images paired with their geographic coordinates.
+    Splits the rows into train and test index lists on construction; both subsets share the same transform.
+
+    Attributes
+    ----------
+    images_dir : pathlib.Path
+        Directory containing the image files.
+    transform : callable or None
+        Transform applied to each image (resize to 224x224, then ToTensor).
+    df : pandas.DataFrame
+        Rows loaded from the CSV file, filtered to those whose image file exists.
+    train_indices : list
+        Positional indices of the training split.
+    test_indices : list
+        Positional indices of the test split.
+    """
+
+    def __init__(self, csv_path, images_dir, test_size=0.2, random_state=42):
+        """
+        Load the CSV, drop rows without an image file, and split; raises ValueError if no rows remain.
+
+        Parameters
+        ----------
+        csv_path : str or Path
+            Path to the CSV file with the records.
+        images_dir : str or Path
+            Path to the directory with the images.
+        test_size : float, optional
+            Fraction of records assigned to the test split, by default 0.2
+        random_state : int, optional
+            Random state for reproducibility, by default 42
+        """
+        self.images_dir = Path(images_dir)
+        self.transform = transforms.Compose(
+            [
+                transforms.Resize((224, 224)),
+                transforms.ToTensor(),
+            ]
+        )
+
+        # Load the records from the CSV file
+        self.df = pd.read_csv(csv_path)
+        print(f"Загружено {len(self.df)} записей из CSV-файла")
+
+        # Keep only rows whose image file is present in images_dir
+        self._filter_by_images()
+        print(f"После фильтрации по изображениям осталось {len(self.df)} записей")
+
+        # Split into train/test; an empty frame is treated as an error
+        if len(self.df) > 0:
+            self._split_data(test_size, random_state)
+        else:
+            raise ValueError("После фильтрации не осталось ни одной записи")
+
+    def _filter_by_images(self):
+        """
+        Drop rows whose ``camera_id`` is not the name of a .jpg/.jpeg/.png file in ``images_dir``.
+        """
+        # Collect the names of the image files that actually exist on disk
+        image_files = set(
+            f.name for f in self.images_dir.iterdir() if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
+        )
+        self.df = self.df[self.df["camera_id"].isin(image_files)]
+        self.df = self.df.reset_index(drop=True)
+
+    def _split_data(self, test_size, random_state):
+        """
+        Split the row positions into train and test index lists.
+
+        Parameters
+        ----------
+        test_size : float
+            Fraction of records assigned to the test split.
+        random_state : int
+            Random state for reproducibility.
+        """
+        if len(self.df) == 0:
+            self.train_indices = []
+            self.test_indices = []
+            return
+
+        # Split positional indices rather than the rows themselves
+        indices = list(range(len(self.df)))
+        self.train_indices, self.test_indices = train_test_split(
+            indices, test_size=test_size, random_state=random_state
+        )
+        print(f"Разделение данных: {len(self.train_indices)} тренировочных, {len(self.test_indices)} тестовых")
+
+    def get_train_dataset(self):
+        """
+        Return the training subset.
+
+        Returns
+        -------
+        PrepareData
+            Training dataset.
+        """
+        return self._create_subset(self.train_indices)
+
+    def get_test_dataset(self):
+        """
+        Return the test subset.
+
+        Returns
+        -------
+        PrepareData
+            Test dataset.
+        """
+        return self._create_subset(self.test_indices)
+
+    def _create_subset(self, indices):
+        """
+        Build a dataset restricted to the given row positions.
+
+        Parameters
+        ----------
+        indices : list
+            List of positional indices.
+
+        Returns
+        -------
+        PrepareData
+            Subset of the dataset.
+        """
+        # Allocate an instance without running __init__ (no CSV re-read, no re-split)
+        subset = PrepareData.__new__(PrepareData)
+        subset.images_dir = self.images_dir
+        subset.transform = self.transform
+        subset.df = self.df.iloc[indices].reset_index(drop=True)
+        # A subset is not split again: every row is listed as train
+        subset.train_indices = list(range(len(subset.df)))
+        subset.test_indices = []
+        return subset
+
+    def __len__(self):
+        """
+        Return the number of records.
+
+        Returns
+        -------
+        int
+            Dataset length.
+        """
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        """
+        Return the item at the given index.
+
+        Parameters
+        ----------
+        idx : int
+            Item index.
+
+        Returns
+        -------
+        tuple
+            (image, coordinates): the transformed RGB image and a float32 tensor [lat_real, lon_real].
+        """
+        row = self.df.iloc[idx]
+        image_path = self.images_dir / row["camera_id"]
+
+        # Load the image and force 3-channel RGB
+        image = Image.open(image_path).convert("RGB")
+        if self.transform:
+            image = self.transform(image)
+
+        # Build the target coordinates
+        coordinates = torch.tensor([row["lat_real"], row["lon_real"]], dtype=torch.float32)
+
+        return image, coordinates
diff --git a/src/engine/main.py b/src/engine/main.py
index 378fa8f..e51e21d 100644
--- a/src/engine/main.py
+++ b/src/engine/main.py
@@ -1,2 +1,686 @@
+import argparse
+import json
+import os
+import sys
+import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import cv2
+import joblib
+import numpy as np
+import optuna
+import pandas as pd
+import torch
+import torch.nn as nn
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader, Dataset
+
+# =============================================================================
+# CRITICAL FIXES FOR DATASPHERE
+# =============================================================================
+
+
+# Resolve paths dynamically so the script runs both locally and in DataSphere
+current_file = Path(__file__).resolve()
+project_root_in_cloud = Path("/job")  # Root directory inside the cloud job, set explicitly
+local_project_root = current_file.parent.parent  # NOTE(review): two levels above this file; for src/engine/main.py that is src/, not the repo root — confirm
+
+# Pick the root depending on the environment:
+# DataSphere is detected by the presence of the /job directory
+if project_root_in_cloud.exists():
+    ROOT_DIR = project_root_in_cloud
+    print("✓ Обнаружена среда DataSphere. Используем путь /job")
+else:
+    ROOT_DIR = local_project_root
+    print("✓ Обнаружена локальная среда. Используем локальный путь")
+
+# Candidate module locations to put on sys.path
+possible_paths_to_models = [
+    ROOT_DIR / "models",  # models folder in the root
+    ROOT_DIR / "src" / "models",  # models folder inside src
+    ROOT_DIR,  # The project root itself
+    ROOT_DIR / "src",  # src folder
+    ROOT_DIR / "utils",  # utils folder in the root
+    ROOT_DIR / "src" / "utils",  # utils folder inside src
+]
+
+for path in possible_paths_to_models:
+    path_str = str(path)
+    if path.exists() and path_str not in sys.path:
+        sys.path.insert(0, path_str)
+        print(f"✓ Добавлен путь: {path}")
+
+# Also add the directory that contains this file
+current_parent = str(current_file.parent)
+if current_parent not in sys.path:
+    sys.path.insert(0, current_parent)
+
+print("=" * 60)
+print("FINAL ENVIRONMENT INFO:")
+print(f"Current file: {current_file}")
+print(f"ROOT_DIR: {ROOT_DIR}")
+print(f"Current working directory: {Path.cwd()}")
+print(f"Python will look for modules in:")
+for i, path in enumerate(sys.path[:10]):  # Show the first 10 entries
+    print(f"  {i+1}. {path}")
+print("=" * 60)
+
+# Diagnostics: list what is actually present in the environment
+print("\nCHECKING CLOUD ENVIRONMENT STRUCTURE:")
+check_paths = [ROOT_DIR, Path(".")]
+for path in check_paths:
+    if path.exists():
+        print(f"\nСодержимое {path}:")
+        try:
+            items = list(path.iterdir())
+            if not items:
+                print("  [EMPTY]")
+            for item in items:
+                item_type = "DIR" if item.is_dir() else "FILE"
+                print(f"  [{item_type}] {item.name}")
+        except Exception as e:
+            print(f"  Ошибка доступа: {e}")
+print("=" * 60)
+
+# Now try the imports (they rely on the sys.path entries added above)
+try:
+    from models.OCR_model import OverlayOCR
+
+    print("✓ Модуль models.OCR_model успешно импортирован")
+except ImportError as e:
+    print(f"✗ Ошибка импорта models.OCR_model: {e}")
+    # Try the alternative location
+    try:
+        # Case where the module sits on sys.path directly (e.g. next to main.py)
+        from OCR_model import OverlayOCR
+
+        print("✓ Модуль OCR_model успешно импортирован из текущей директории")
+    except ImportError as e2:
+        print(f"✗ Ошибка импорта OCR_model: {e2}")
+        raise
+
+try:
+    from utils.useful_functions import levenshtein_distance
+
+    print("✓ Модуль utils.useful_functions успешно импортирован")
+except ImportError as e:
+    print(f"✗ Ошибка импорта utils.useful_functions: {e}")
+    # Try the alternative location
+    try:
+        from useful_functions import levenshtein_distance
+
+        print("✓ Модуль useful_functions успешно импортирован из текущей директории")
+    except ImportError as e2:
+        print(f"✗ Ошибка импорта useful_functions: {e2}")
+
+        # Define a stand-in when the real function cannot be imported
+        def levenshtein_distance(s1, s2):  # NOTE(review): returns only the length difference, not an edit distance
+            print(f"WARNING: Using dummy levenshtein_distance for '{s1}' and '{s2}'")
+            return abs(len(s1) - len(s2))
+
+        print("✓ Создана заглушка для levenshtein_distance")
+
+warnings.filterwarnings("ignore")  # silences every warning for the rest of the process
+
+# Directory for saved results (created at import time)
+save_dir = ROOT_DIR / "models" / "ocr_model"
+save_dir.mkdir(parents=True, exist_ok=True)
+print(f"Save directory: {save_dir}")
+
+
+def parse_args():
+    """
+    Parse command-line options for the OCR tuning run.
+
+    Returns the ``argparse.Namespace`` produced from ``sys.argv``.
+    """
+    # (flag, type, default, help) — registered in the order shown by --help
+    option_specs = (
+        ("--csv-path", str, "data/processed_data/merged_data.csv", "Путь к CSV файлу с данными"),
+        ("--images-dir", str, "data/raw_data/data/metadata/INC/united_image/", "Директория с изображениями"),
+        ("--optuna-study", str, "optuna_study.pkl", "Имя файла для сохранения study"),
+        ("--ocr-model-params", str, "ocr_model_params.json", "Имя файла для сохранения параметров"),
+        ("--n-trials", int, 50, "Количество trials для Optuna"),
+        ("--max-samples", int, 200, "Максимальное количество образцов для оценки"),
+    )
+    cli = argparse.ArgumentParser(description="OCR Model Training")
+    for flag, value_type, fallback, help_text in option_specs:
+        cli.add_argument(flag, type=value_type, default=fallback, help=help_text)
+    return cli.parse_args()
+
+
+class OCRDataset(Dataset):
+    """Single-label OCR dataset: yields ``(image, label, path)`` with the image as a float32 CHW tensor scaled to [0, 1]."""
+
+    def __init__(self, image_paths: List[Path], labels: List[str], transform=None):
+        self.image_paths = image_paths
+        self.labels = labels
+        self.transform = transform  # optional callable applied to the image tensor
+
+    def __len__(self):
+        return len(self.image_paths)
+
+    def __getitem__(self, idx):
+        """Load sample ``idx``; an unreadable file is replaced by a black 1000x1000 image."""
+        image_path = self.image_paths[idx]
+        image = cv2.imread(str(image_path))
+        if image is None:  # imread signals failure with None rather than an exception
+            image = np.zeros((1000, 1000, 3), dtype=np.uint8)
+        # HWC uint8 -> CHW float32 in [0, 1]; channel order is left as OpenCV returned it
+        image = torch.from_numpy(image.astype(np.float32) / 255.0).permute(2, 0, 1)
+        if self.transform is not None:  # the transform used to be stored but never applied
+            image = self.transform(image)
+        return image, self.labels[idx], str(image_path)
+
+
+class PrepareData:
+    """Load the CSV, map every image found on disk to its labels, and split the images into train/test."""
+
+    def __init__(self, csv_path: Path, images_dir: Path, test_size: float = 0.2, random_state: int = 42):
+        self.csv_path = csv_path
+        self.images_dir = images_dir
+        self.test_size = test_size
+        self.random_state = random_state
+        self.data = None
+        self._prepare_data()
+
+    def _resolve_image_path(self, img_name: str):
+        """Return the first existing path for ``img_name`` (as-is, then with each known suffix), else ``None``."""
+        extensions = (".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG")
+        # A suffix the name already ends with is skipped: that candidate equals the bare name
+        candidates = [img_name] + [img_name + ext for ext in extensions if not img_name.endswith(ext)]
+        for candidate in candidates:
+            img_path = self.images_dir / candidate
+            if img_path.exists():
+                return img_path
+        return None
+
+    def _prepare_data(self):
+        """
+        Read the CSV, group labels per existing image and split into train/test.
+
+        Populates ``train_paths``/``test_paths`` (path strings) and
+        ``train_labels``/``test_labels`` (one list of labels per image).
+
+        Raises
+        ------
+        ValueError
+            If none of the rows refers to an existing image file.
+        """
+        print(f"Загрузка данных из {self.csv_path}")
+        self.data = pd.read_csv(self.csv_path)
+        print(f"Загружено {len(self.data)} записей")
+
+        # image path -> all labels ("filename" column) recorded for that image
+        image_to_labels = {}
+        missing_count = 0
+
+        for img_name, label in zip(self.data["camera_id"], self.data["filename"]):
+            # Non-string ids (numbers, NaN) would otherwise break the suffix check
+            img_name = str(img_name)
+            img_path = self._resolve_image_path(img_name)
+            if img_path is None:
+                missing_count += 1
+                print(f"Изображение не найдено: {img_name}")
+                continue
+            image_to_labels.setdefault(str(img_path), []).append(label)
+
+        print(
+            f"Найдено {len(image_to_labels)} уникальных изображений с {sum(len(labels) for labels in image_to_labels.values())} метками"
+        )
+        print(f"Пропущено {missing_count} изображений")
+
+        if not image_to_labels:
+            raise ValueError("Не найдено ни одного валидного изображения!")
+
+        # Parallel lists for train_test_split
+        image_paths = list(image_to_labels.keys())
+        labels_lists = list(image_to_labels.values())
+
+        # Stratify on the first label of each image
+        first_labels = [labels[0] for labels in labels_lists]
+
+        # Stratification is only attempted when every class has at least two images
+        from collections import Counter
+
+        label_counts = Counter(first_labels)
+        min_samples = min(label_counts.values()) if label_counts else 0
+
+        if min_samples < 2 or len(set(first_labels)) == len(first_labels):
+            stratify = None
+            print("Стратификация отключена (уникальные метки или недостаточно образцов)")
+        else:
+            stratify = first_labels
+            print(f"Стратификация включена, классов: {len(set(first_labels))}")
+
+        # Split images and their label lists together
+        (self.train_paths, self.test_paths, self.train_labels, self.test_labels) = train_test_split(
+            image_paths, labels_lists, test_size=self.test_size, random_state=self.random_state, stratify=stratify
+        )
+
+        print(f"Тренировочный набор: {len(self.train_paths)} изображений")
+        print(f"Тестовый набор: {len(self.test_paths)} изображений")
+
+    def get_train_dataset(self):
+        """Return the train split as an ``OCRDataset`` that keeps only the first label of each image."""
+        train_single_labels = [labels[0] for labels in self.train_labels]
+        return OCRDataset([Path(p) for p in self.train_paths], train_single_labels)
+
+    def get_test_dataset(self):
+        """Return the test split as an ``OCRDataset`` that keeps only the first label of each image."""
+        test_single_labels = [labels[0] for labels in self.test_labels]
+        return OCRDataset([Path(p) for p in self.test_paths], test_single_labels)
+
+    def get_train_dataset_with_all_labels(self):
+        """Return the train split as a ``MultiLabelDataset`` with every label of each image."""
+        return MultiLabelDataset([Path(p) for p in self.train_paths], self.train_labels)
+
+    def get_test_dataset_with_all_labels(self):
+        """Return the test split as a ``MultiLabelDataset`` with every label of each image."""
+        return MultiLabelDataset([Path(p) for p in self.test_paths], self.test_labels)
+
+
+class MultiLabelDataset(Dataset):
+    """Multi-label OCR dataset: yields ``(image, labels, path)`` where ``labels`` is the list of all labels for the image."""
+
+    def __init__(self, image_paths: List[Path], labels_lists: List[List[str]], transform=None):
+        self.image_paths = image_paths
+        self.labels_lists = labels_lists
+        self.transform = transform  # optional callable applied to the image tensor
+
+    def __len__(self):
+        return len(self.image_paths)
+
+    def __getitem__(self, idx):
+        """Load sample ``idx``; an unreadable file is replaced by a black 1000x1000 image."""
+        image_path = self.image_paths[idx]
+        image = cv2.imread(str(image_path))
+        if image is None:  # imread signals failure with None rather than an exception
+            image = np.zeros((1000, 1000, 3), dtype=np.uint8)
+        # HWC uint8 -> CHW float32 in [0, 1]; channel order is left as OpenCV returned it
+        image = torch.from_numpy(image.astype(np.float32) / 255.0).permute(2, 0, 1)
+        if self.transform is not None:  # the transform used to be stored but never applied
+            image = self.transform(image)
+        return image, self.labels_lists[idx], str(image_path)
+
+
+class OCRModel:
+    """Hyper-parameter tuner for ``OverlayOCR``: searches settings with Optuna and saves/loads the best ones."""
+
+    def __init__(self):
+        self.best_params = None
+        self.best_score = float("inf")
+        self.study = None
+
+    @staticmethod
+    def check_data_quality(dataset, num_samples=5):
+        """Print labels and image size for the first samples and save a bottom-left crop of each as ``debug_sample_<i>.jpg``."""
+        print("=== ДИАГНОСТИКА ДАННЫХ ===")
+
+        for i in range(min(num_samples, len(dataset))):
+            if isinstance(dataset, MultiLabelDataset):
+                _, labels, path = dataset[i]
+                print(f"{i+1}. {Path(path).name}")
+                print(f"   Метки ({len(labels)}): {labels}")
+            else:
+                _, label, path = dataset[i]
+                print(f"{i+1}. {Path(path).name}")
+                print(f"   Метка: '{label}'")
+
+            original = cv2.imread(str(path))
+            print(f"   Размер оригинала: {original.shape if original is not None else 'N/A'}")
+
+            if original is not None:
+                h, w = original.shape[:2]
+                roi_height = min(150, h // 5)
+                roi = original[h - roi_height : h, : min(600, w)]
+
+                debug_path = f"debug_sample_{i}.jpg"
+                cv2.imwrite(debug_path, roi)
+                print(f"   Превью сохранено: {debug_path}")
+            print("---")
+
+    def find_best_match_distance(self, predicted_text, true_labels):
+        """Return ``(min_distance, label)`` for the label closest to ``predicted_text``; ``(inf, None)`` when there are no labels."""
+        if not true_labels:
+            return float("inf"), None
+
+        min_distance = float("inf")
+        best_match = None
+
+        for true_label in true_labels:
+            distance = levenshtein_distance(str(true_label), predicted_text)
+            if distance < min_distance:
+                min_distance = distance
+                best_match = true_label
+
+        return min_distance, best_match
+
+    def evaluate_params(self, params: Dict[str, Any], dataset, max_samples: int = 200, seed: int = 42) -> float:
+        """Return the mean best-match distance of ``OverlayOCR(**params)`` on a seeded subsample; ``inf`` if nothing could be scored."""
+        try:
+            ocr = OverlayOCR(**params)
+            total_distance = 0
+            count = 0
+
+            # Fixed-seed subsample: every trial is scored on the same images, so scores are comparable
+            n_samples = min(max_samples, len(dataset))
+            indices = np.random.default_rng(seed).choice(len(dataset), n_samples, replace=False)
+
+            print(f"Оценка параметров на {n_samples} образцах...")
+
+            for i, idx in enumerate(indices):
+                if isinstance(dataset, MultiLabelDataset):
+                    _, true_labels, image_path = dataset[idx]
+                else:
+                    _, true_label, image_path = dataset[idx]
+                    true_labels = [true_label]  # Wrap in a list so both cases are handled alike
+
+                # OCR always reads the original file from disk, not the tensor from the dataset
+                try:
+                    final = ocr.run_on_image(str(image_path))[0]
+
+                    # Score against the closest of the image's labels
+                    distance, best_match = self.find_best_match_distance(final, true_labels)
+                    total_distance += distance
+                    count += 1
+
+                    if i % 20 == 0:  # Log every 20th sample
+                        print(
+                            f"  [{i+1}/{n_samples}] Лучшее соответствие: '{best_match}' -> '{final}', dist: {distance}"
+                        )
+
+                except Exception as e:
+                    print(f"  Ошибка при обработке {image_path}: {e}")
+                    continue
+
+            avg_distance = total_distance / count if count > 0 else float("inf")
+            print(f"  Среднее минимальное расстояние Левенштейна: {avg_distance:.2f}")
+            return avg_distance
+
+        except Exception as e:
+            print(f"Ошибка оценки параметров: {e}")
+            return float("inf")
+
+    def objective(self, trial, train_dataset, max_samples: int):
+        """Optuna objective: sample OCR settings, score them, and remember the best parameter set seen so far."""
+        params = {
+            "gap_mult": trial.suggest_float("gap_mult", 1.0, 2.0),
+            "canvas_size": trial.suggest_categorical("canvas_size", [3600, 4800, 6000]),
+            "mag_ratio": trial.suggest_float("mag_ratio", 2.0, 4.0),
+            "add_margin": trial.suggest_float("add_margin", 0.05, 0.2),
+            "text_threshold": trial.suggest_float("text_threshold", 0.45, 0.7),
+            "low_text": trial.suggest_float("low_text", 0.2, 0.4),
+            "link_threshold": trial.suggest_float("link_threshold", 0.3, 0.5),
+            "langs": ["en"],
+            "gpu": torch.cuda.is_available(),
+        }
+
+        score = self.evaluate_params(params, train_dataset, max_samples)
+
+        if score < self.best_score:
+            self.best_score = score
+            self.best_params = params.copy()
+            print(f"  Новый лучший результат: {score:.2f}")
+
+        return score
+
+    def train(self, train_dataset, n_trials: int = 50, max_samples: int = 200):
+        """Run the Optuna search and return ``study.best_params`` (sampled values only, without ``langs``/``gpu``)."""
+        print(f"Запуск оптимизации гиперпараметров ({n_trials} trials)...")
+
+        self.study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
+
+        self.study.optimize(
+            lambda trial: self.objective(trial, train_dataset, max_samples), n_trials=n_trials, show_progress_bar=True
+        )
+
+        print("Оптимизация завершена!")
+        print(f"Лучшие параметры: {self.study.best_params}")
+        print(f"Лучшее расстояние: {self.study.best_value:.2f}")
+
+        return self.study.best_params
+
+    def save_model(self, save_path: Path, args=None):
+        """Write the best parameters (JSON) and the study (joblib) into ``save_path``; raises ValueError if nothing was tuned."""
+        if self.best_params is None:
+            raise ValueError("Модель не обучена. Сначала вызовите train()")
+
+        save_path.mkdir(parents=True, exist_ok=True)
+
+        if args is not None:
+            params_filename = args.ocr_model_params
+            study_filename = args.optuna_study
+        else:
+            params_filename = "ocr_model_params.json"
+            study_filename = "optuna_study.pkl"
+
+        model_info = {
+            "best_params": self.best_params,
+            "best_score": self.best_score,
+            "study_trials": len(self.study.trials) if self.study else 0,
+            "timestamp": pd.Timestamp.now().isoformat(),
+        }
+
+        with open(save_path / params_filename, "w", encoding="utf-8") as f:  # explicit encoding: ensure_ascii=False may emit non-ASCII
+            json.dump(model_info, f, indent=2, ensure_ascii=False)
+
+        if self.study:
+            joblib.dump(self.study, save_path / study_filename)
+
+        print(f"Модель сохранена в: {save_path}")
+
+    def load_model(self, load_path: Path, args=None):
+        """Restore parameters (and the study, if its file exists) from ``load_path`` and return a configured ``OverlayOCR``."""
+        if args is not None:
+            params_filename = args.ocr_model_params
+            study_filename = args.optuna_study
+        else:
+            params_filename = "ocr_model_params.json"
+            study_filename = "optuna_study.pkl"
+
+        with open(load_path / params_filename, "r", encoding="utf-8") as f:
+            model_info = json.load(f)
+
+        self.best_params = model_info["best_params"]
+        self.best_score = model_info["best_score"]
+
+        study_path = load_path / study_filename
+        if study_path.exists():
+            self.study = joblib.load(study_path)
+
+        return OverlayOCR(**self.best_params)
+
+
+def find_file_by_pattern(directory, pattern):
+    """
+    Return the first regular file directly inside ``directory`` whose name
+    contains ``pattern``; ``None`` when nothing matches or the path is missing.
+    """
+    root = Path(directory)
+    if root.exists():
+        # Substring match on the bare file name, in Path.iterdir() order
+        for entry in root.iterdir():
+            if entry.is_file() and pattern in entry.name:
+                return entry
+    return None
+
+
+def find_dir_by_pattern(directory, pattern):
+    """
+    Return the first sub-directory directly inside ``directory`` whose name
+    contains ``pattern``; ``None`` when nothing matches or the path is missing.
+    """
+    root = Path(directory)
+    if root.exists():
+        # Substring match on the bare directory name, in Path.iterdir() order
+        for entry in root.iterdir():
+            if entry.is_dir() and pattern in entry.name:
+                return entry
+    return None
+
+
+def main():
+    """Entry point: resolve the input paths, tune OCR parameters on the train split, save them, and evaluate on the test split."""
+    args = parse_args()
+
+    try:
+        # Path diagnostics for DataSphere
+        print("\n" + "=" * 60)
+        print("DATASPHERE PATH DIAGNOSTICS:")
+        print(f"Original CSV path: {args.csv_path}")
+        print(f"Original Images dir: {args.images_dir}")
+
+        # 1. Root directory used for the fallback searches
+        search_root = ROOT_DIR
+
+        # 2. Flexible CSV lookup
+        csv_path = Path(args.csv_path)
+        if not csv_path.exists():
+            # Look for a file whose name contains the keyword
+            possible_csv = find_file_by_pattern(search_root, "csv_file")
+            if possible_csv:
+                csv_path = possible_csv
+                print(f"Найден CSV-файл по шаблону: {csv_path}")
+            else:
+                # Nothing matched the pattern: take the first file in the root with a .csv extension
+                for item in search_root.iterdir():
+                    if item.is_file() and item.suffix.lower() == ".csv":
+                        csv_path = item
+                        print(f"Найден CSV-файл по расширению: {csv_path}")
+                        break
+                else:  # for-else: runs only when the loop finished without break
+                    raise FileNotFoundError(
+                        f"CSV файл не найден: {args.csv_path}. Доступные файлы в {search_root}: {list(search_root.iterdir())}"
+                    )
+
+        # 3. Flexible lookup of the images directory
+        images_dir = Path(args.images_dir)
+        if not images_dir.exists():
+            # Look for a directory whose name contains the keyword
+            possible_images_dir = find_dir_by_pattern(search_root, "images_dir")
+            if possible_images_dir:
+                images_dir = possible_images_dir
+                print(f"Найдена директория с изображениями по шаблону: {images_dir}")
+            else:
+                raise FileNotFoundError(
+                    f"Директория с изображениями не найдена: {args.images_dir}. Доступные директории в {search_root}: {[d.name for d in search_root.iterdir() if d.is_dir()]}"
+                )
+
+        print(f"Final CSV path: {csv_path}")
+        print(f"Final Images dir: {images_dir}")
+        print(f"CSV exists: {csv_path.exists()}")
+        print(f"Images dir exists: {images_dir.exists()}")
+        print("=" * 60 + "\n")
+
+        # Build the dataset from the resolved paths
+        print("Подготовка данных...")
+        dataset = PrepareData(csv_path=csv_path, images_dir=images_dir, test_size=0.2, random_state=42)
+
+        # Datasets that keep every label of each image
+        train_dataset = dataset.get_train_dataset_with_all_labels()
+        test_dataset = dataset.get_test_dataset_with_all_labels()
+
+        print(f"Размер тренировочного датасета: {len(train_dataset)}")
+        print(f"Размер тестового датасета: {len(test_dataset)}")
+
+        # Data diagnostics before tuning
+        OCRModel.check_data_quality(train_dataset, num_samples=3)
+
+        # Smoke run on a single image
+        if len(train_dataset) > 0:
+            _, test_labels, test_path = train_dataset[0]
+            print(f"\nТестовый прогон на первом изображении:")
+            print(f"Путь: {test_path}")
+            print(f"Возможные метки ({len(test_labels)}): {test_labels}")
+
+            # Check OCR with default parameters
+            ocr = OverlayOCR()
+            try:
+                final, norm, joined, conf, roi_name = ocr.run_on_image(str(test_path))
+                distance, best_match = OCRModel().find_best_match_distance(final, test_labels)
+                print(f"Результат OCR: '{final}'")
+                print(f"Лучшее соответствие: '{best_match}', расстояние: {distance}")
+
+                if distance > 10:
+                    print("ВНИМАНИЕ: Большая ошибка на тестовом изображении!")
+            except Exception as e:
+                print(f"Ошибка при тестовом OCR: {e}")
+
+        # Create and tune the model
+        model = OCRModel()
+
+        # Tune on the training data with all labels
+        best_params = model.train(train_dataset, n_trials=args.n_trials, max_samples=args.max_samples)
+
+        # Save the model into ROOT_DIR (for DataSphere)
+        model.save_model(ROOT_DIR, args)
+
+        # Evaluate on the test data with all labels
+        print("\nТестирование на тестовом наборе...")
+        test_ocr = OverlayOCR(**best_params)
+        test_distances = []
+        best_matches = []
+        test_samples = min(50, len(test_dataset))  # at most the first 50 test images are evaluated
+
+        for i in range(test_samples):
+            _, true_labels, image_path = test_dataset[i]
+
+            try:
+                final, norm, joined, conf, roi_name = test_ocr.run_on_image(str(image_path))
+                distance, best_match = model.find_best_match_distance(final, true_labels)
+                test_distances.append(distance)
+                best_matches.append(best_match)
+
+                if i % 10 == 0:
+                    print(f"Тест [{i+1}/{test_samples}]: '{best_match}' -> '{final}', dist: {distance}")
+            except Exception as e:
+                print(f"Ошибка при тестировании {image_path}: {e}")
+                continue
+
+        if test_distances:
+            avg_test_distance = np.mean(test_distances)
+            std_test_distance = np.std(test_distances)
+
+            # Summarize the results
+            perfect_matches = sum(1 for d in test_distances if d == 0)
+            good_matches = sum(1 for d in test_distances if d <= 2)
+
+            print(f"\nРезультаты тестирования:")
+            print(f"Среднее минимальное расстояние: {avg_test_distance:.2f}")
+            print(f"Стандартное отклонение: {std_test_distance:.2f}")
+            print(f"Идеальные совпадения (расстояние=0): {perfect_matches}/{len(test_distances)}")
+            print(f"Хорошие совпадения (расстояние≤2): {good_matches}/{len(test_distances)}")
+            print(f"Минимальное расстояние: {np.min(test_distances):.2f}")
+            print(f"Максимальное расстояние: {np.max(test_distances):.2f}")
+
+            # Persist the test results
+            results = {
+                "test_avg_distance": avg_test_distance,
+                "test_std_distance": std_test_distance,
+                "perfect_matches": perfect_matches,
+                "good_matches": good_matches,
+                "test_samples_evaluated": len(test_distances),
+                "train_dataset_size": len(train_dataset),
+                "test_dataset_size": len(test_dataset),
+                "best_params": best_params,
+            }
+
+            with open(ROOT_DIR / "test_results.json", "w") as f:
+                json.dump(results, f, indent=2, ensure_ascii=False)
+        else:
+            print("Не удалось получить результаты тестирования!")
+
+        print(f"\nОбучение завершено! Результаты сохранены в: {ROOT_DIR.absolute()}")
+
+    except Exception as e:  # top-level boundary: report the failure and exit with status 1
+        print(f"Критическая ошибка в main: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+
 if __name__ == "__main__":
-    print("Hello world")
+    main()
diff --git a/src/models/OCR_model.py b/src/models/OCR_model.py
new file mode 100644
index 0000000..3a797c5
--- /dev/null
+++ b/src/models/OCR_model.py
@@ -0,0 +1,195 @@
+import re
+from typing import List, Optional, Tuple
+
+import cv2
+import easyocr
+import numpy as np
+
+
+class OverlayOCR:  # wrapper around easyocr.Reader that reads and normalizes overlay text
+    WHITELIST_RE = re.compile(r"[A-Za-z0-9_]+")
+
+    def __init__(
+        self,
+        langs: Optional[List[str]] = None,
+        gpu: bool = False,
+        verbose: bool = False,
+        gap_mult: float = 1.6,
+        canvas_size: int = 3600,
+        mag_ratio: float = 3.0,
+        add_margin: float = 0.10,
+        text_threshold: float = 0.55,
+        low_text: float = 0.30,
+        link_threshold: float = 0.30,
+    ):
+        """
+        langs: easyocr languages, e.g. ['en'] or ['en','ru'] (defaults to ['en'])
+        gap_mult: sensitivity to horizontal gaps (smaller -> more '_' separators)
+        canvas_size/mag_ratio and the thresholds are forwarded to easyocr ``readtext``
+        """
+        self.langs = langs or ["en"]
+        self.reader = easyocr.Reader(self.langs, gpu=gpu, verbose=verbose)
+        self.gap_mult = gap_mult
+        self.canvas_size = canvas_size
+        self.mag_ratio = mag_ratio
+        self.add_margin = add_margin
+        self.text_threshold = text_threshold
+        self.low_text = low_text
+        self.link_threshold = link_threshold
+
+    # ---------- helpers ----------
+    @staticmethod
+    def _clean_token(t: str) -> str:
+        return "".join(OverlayOCR.WHITELIST_RE.findall(t))
+
+    @staticmethod
+    def _alnum_class(ch: str) -> str:
+        return "D" if ch.isdigit() else ("A" if ch.isalpha() else "_")
+
+    def _join_with_gaps(self, results, sep="_") -> Tuple[str, float, list]:
+        """
+        Join tokens left to right and return ``(text, mean_confidence, items)``:
+        - ``sep`` is inserted where the horizontal gap exceeds ``gap_mult`` x the median gap,
+        - and at letter<->digit boundaries between neighbouring tokens.
+        """
+        items = []
+        for bbox, text, conf in results:
+            t = self._clean_token(text)
+            if not t:
+                continue
+            x0 = min(p[0] for p in bbox)
+            x1 = max(p[0] for p in bbox)
+            items.append((x0, x1, t, float(conf)))
+        if not items:
+            return "", 0.0, []
+
+        items.sort(key=lambda z: z[0])
+        gaps = []
+        for i in range(1, len(items)):
+            gaps.append(items[i][0] - items[i - 1][1])
+        med_gap = np.median(gaps) if gaps else 0
+
+        out = []
+        confs = []
+        prev = None
+        for x0, x1, t, c in items:
+            if prev is not None:
+                gap = x0 - prev[1]
+                need_sep = med_gap > 0 and gap > self.gap_mult * med_gap
+                # a letter<->digit transition is also worth separating
+                if not need_sep:
+                    prev_last = out[-1][-1] if out else ""
+                    if prev_last and t:
+                        need_sep = self._alnum_class(prev_last) != self._alnum_class(t[0])
+                if need_sep and (not out or out[-1] != sep):
+                    out.append(sep)
+            out.append(t)
+            confs.append(c)
+            prev = (x0, x1)
+
+        text = "".join(out)
+        text = re.sub(r"_+", "_", text).strip("_")
+        avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
+        return text, avg_conf, items
+
+    @staticmethod
+    def _normalize_overlays(s: str) -> str:
+        """Force the ``MMC_hd_`` prefix form and put ``_`` between adjacent letters and digits."""
+        s = re.sub(r"^MMC(?:_)?h(?:d)?", "MMC_hd", s, flags=re.IGNORECASE)
+        s = re.sub(r"^MMC_?hd_?", "MMC_hd_", s, flags=re.IGNORECASE)
+        s = re.sub(r"([A-Za-z])([0-9])", r"\1_\2", s)
+        s = re.sub(r"([0-9])([A-Za-z])", r"\1_\2", s)
+        s = re.sub(r"_+", "_", s).strip("_")
+        return s
+
+    @staticmethod
+    def _snap_digits_tail(s: str) -> str:
+        """
+        If the string ends in a run of 6+ digits, split it as 4-1-rest (typically 4-1-1).
+        Example: ...229221 -> ...2292_2_1
+        Strings without such a tail are returned unchanged.
+        """
+        m = re.search(r"^(.*?)(\d{6,})$", s)
+        if not m:
+            return s
+        head, digits = m.group(1), m.group(2)
+        s = f"{head}{digits[:4]}_{digits[4:5]}_{digits[5:]}"  # the regex already guarantees 6+ digits
+        return re.sub(r"_+", "_", s).strip("_")
+
+    # ---------- EasyOCR run on a ROI ----------
+    def run_on_roi(self, roi_bgr) -> Tuple[str, str, str, float]:  # returns (joined, norm, final, conf)
+        params = dict(
+            decoder="greedy",
+            detail=1,
+            paragraph=False,
+            contrast_ths=0.05,
+            adjust_contrast=0.7,
+            text_threshold=self.text_threshold,
+            low_text=self.low_text,
+            link_threshold=self.link_threshold,
+            canvas_size=self.canvas_size,
+            mag_ratio=self.mag_ratio,
+            add_margin=self.add_margin,
+        )
+        results = self.reader.readtext(roi_bgr, **params)
+        joined, conf, _ = self._join_with_gaps(results, sep="_")
+        norm = self._normalize_overlays(joined)
+        final = self._snap_digits_tail(norm)
+        final = re.sub(r"^MMC_?hd_?", "MMC_hd_", final, flags=re.IGNORECASE)
+        final = re.sub(r"_+", "_", final).strip("_")
+        return joined, norm, final, conf
+
+    # ---------- ROI generators ----------
+    @staticmethod
+    def roi_left_bottom(img, w_frac=1 / 3, h_frac=1 / 4):  # bottom-left corner crop
+        H, W = img.shape[:2]
+        return img[H - int(H * h_frac) : H, 0 : int(W * w_frac)]
+
+    @staticmethod
+    def roi_bottom_band(img, h_frac=1 / 3):  # full-width strip at the bottom
+        H, _ = img.shape[:2]
+        y0 = H - int(H * h_frac)
+        return img[y0:H, :]
+
+    @staticmethod
+    def roi_auto_band(img):  # full-width strip around the busiest row of the lower half
+        g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        _, b = cv2.threshold(cv2.GaussianBlur(g, (5, 5), 0), 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        row = (b > 0).sum(axis=1).astype(np.float32)
+        k = max(3, (img.shape[0] // 100) * 2 + 1)
+        row = cv2.GaussianBlur(row.reshape(-1, 1), (1, k), 0).ravel()
+        start = img.shape[0] // 2
+        idx = start + int(np.argmax(row[start:]))
+        band_half = max(img.shape[0] // 12, 20)
+        y0, y1 = max(0, idx - band_half), min(img.shape[0], idx + band_half)
+        return img[y0:y1, :]
+
+    # ---------- main entry point ----------
+    def run_on_image(self, image_path: str) -> Tuple[str, str, str, float, str]:
+        """
+        Run OCR on three candidate ROIs and return the best-ranked result as
+        ``(final, norm, joined, conf, best_roi_name)``; raises ValueError if the image cannot be read.
+        """
+        img = cv2.imread(image_path)
+        if img is None:  # explicit check: an assert would be stripped under ``python -O``
+            raise ValueError(f"Не удалось загрузить изображение: {image_path}")
+
+        rois = [
+            ("left_bottom", self.roi_left_bottom(img, 1 / 3, 1 / 4)),
+            ("bottom_band", self.roi_bottom_band(img, 1 / 3)),
+            ("auto_band", self.roi_auto_band(img)),
+        ]
+
+        best = None
+        best_name = ""
+        best_pack = ("", "", "", 0.0)
+
+        for name, roi in rois:
+            joined, norm, final, conf = self.run_on_roi(roi)
+            cand = (conf, len(final), (final, norm, joined, conf), name)  # ranked by confidence, then text length
+            if (best is None) or (cand > best):
+                best = cand
+                best_pack = (final, norm, joined, conf)
+                best_name = name
+
+        return (*best_pack, best_name)
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..2261379
--- /dev/null
+++ b/src/models/__init__.py
@@ -0,0 +1,3 @@
+from .OCR_model import OverlayOCR
+
+__all__ = ["OverlayOCR"]
diff --git a/src/models/evaluate.py b/src/models/evaluate.py
deleted file mode 100644
index c0bcfb4..0000000
--- a/src/models/evaluate.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import json
-import logging
-import os
-from io import StringIO
-
-import boto3
-import joblib
-import matplotlib.pyplot as plt
-import mlflow
-import mlflow.sklearn
-import numpy as np
-import pandas as pd
-import seaborn as sns
-import yaml
-from sklearn.metrics import (
-    accuracy_score,
-    confusion_matrix,
-    f1_score,
-    precision_score,
-    recall_score,
-)
-
-# Настройка логирования
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def load_processed_data_from_s3(bucket_name, data_key, target_key, aws_access_key_id=None, aws_secret_access_key=None):
-    """Загрузка обработанных данных и целевых переменных из S3"""
-    logger.info(f"Загрузка обработанных данных из s3://{bucket_name}/{data_key} и s3://{bucket_name}/{target_key}")
-
-    # Создание клиента S3
-    if aws_access_key_id and aws_secret_access_key:
-        s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
-    else:
-        # Используем IAM роли или credentials из окружения
-        s3 = boto3.client("s3")
-
-    try:
-        # Загрузка данных
-        data_response = s3.get_object(Bucket=bucket_name, Key=data_key)
-        data_content = data_response["Body"].read().decode("utf-8")
-        X = pd.read_csv(StringIO(data_content))
-
-        # Загрузка целевых переменных
-        target_response = s3.get_object(Bucket=bucket_name, Key=target_key)
-        target_content = target_response["Body"].read().decode("utf-8")
-        y = pd.read_csv(StringIO(target_content))["target"]
-
-        return X, y
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке данных из S3: {str(e)}")
-        raise
-
-
-def load_processed_data(data_path, target_path):
-    """Загрузка обработанных данных и целевых переменных (локально)"""
-    logger.info(f"Загрузка обработанных данных из {data_path}")
-    X = pd.read_csv(data_path)
-    y = pd.read_csv(target_path)["target"]
-    return X, y
-
-
-def load_model_from_s3(bucket_name, model_key, aws_access_key_id=None, aws_secret_access_key=None):
-    """Загрузка обученной модели из S3"""
-    logger.info(f"Загрузка модели из s3://{bucket_name}/{model_key}")
-
-    # Создание клиента S3
-    if aws_access_key_id and aws_secret_access_key:
-        s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
-    else:
-        # Используем IAM роли или credentials из окружения
-        s3 = boto3.client("s3")
-
-    try:
-        # Загрузка модели
-        model_response = s3.get_object(Bucket=bucket_name, Key=model_key)
-        model_content = model_response["Body"].read()
-
-        # Сохранение модели во временный файл
-        temp_model_path = "temp_model.pkl"
-        with open(temp_model_path, "wb") as f:
-            f.write(model_content)
-
-        # Загрузка модели
-        model = joblib.load(temp_model_path)
-
-        # Удаление временного файла
-        os.remove(temp_model_path)
-
-        return model
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке модели из S3: {str(e)}")
-        raise
-
-
-def load_model(model_path):
-    """Загрузка обученной модели с диска"""
-    logger.info(f"Загрузка модели из {model_path}")
-    if not os.path.exists(model_path):
-        raise FileNotFoundError(f"Файл модели не найден: {model_path}")
-    return joblib.load(model_path)
-
-
-def evaluate_model(model, X_test, y_test):
-    """Оценка производительности модели"""
-    logger.info("Оценка модели")
-    y_pred = model.predict(X_test)
-
-    metrics = {
-        "accuracy": float(accuracy_score(y_test, y_pred)),
-        "precision": float(precision_score(y_test, y_pred, average="weighted")),
-        "recall": float(recall_score(y_test, y_pred, average="weighted")),
-        "f1_score": float(f1_score(y_test, y_pred, average="weighted")),
-    }
-
-    return metrics, y_pred
-
-
-def save_metrics(metrics, metrics_path):
-    """Сохранение метрик в JSON файл"""
-    logger.info(f"Сохранение метрик в {metrics_path}")
-    with open(metrics_path, "w") as f:
-        json.dump(metrics, f, indent=2)
-
-
-def plot_confusion_matrix(y_true, y_pred, class_names=None, save_path="reports/figures/confusion_matrix.png"):
-    """Построение и сохранение матрицы ошибок"""
-    logger.info(f"Построение матрицы ошибок в {save_path}")
-
-    # Создание директории, если она не существует
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-
-    # Создание матрицы ошибок
-    cm = confusion_matrix(y_true, y_pred)
-
-    # Построение графика
-    plt.figure(figsize=(8, 6))
-    sns.heatmap(
-        cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names or "auto", yticklabels=class_names or "auto"
-    )
-    plt.title("Матрица ошибок")
-    plt.xlabel("Предсказанный класс")
-    plt.ylabel("Истинный класс")
-
-    # Сохранение графика
-    plt.tight_layout()
-    plt.savefig(save_path)
-    plt.close()
-
-
-def upload_to_s3(local_file_path, bucket_name, s3_key, aws_access_key_id=None, aws_secret_access_key=None):
-    """Загрузка файла в S3"""
-    logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}")
-
-    # Создание клиента S3
-    if aws_access_key_id and aws_secret_access_key:
-        s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
-    else:
-        # Используем IAM роли или credentials из окружения
-        s3 = boto3.client("s3")
-
-    # Загрузка файла
-    try:
-        s3.upload_file(local_file_path, bucket_name, s3_key)
-        logger.info(f"Файл успешно загружен в S3: s3://{bucket_name}/{s3_key}")
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке файла в S3: {str(e)}")
-        raise
-
-
-def main():
-    """Главная функция для оценки модели"""
-    # Загрузка конфигурации
-    with open("configs/config.yaml", "r") as f:
-        config = yaml.safe_load(f)
-
-    # Параметры S3 из переменных окружения
-    s3_bucket = os.getenv("S3_BUCKET")
-    s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/")
-    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
-    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
-
-    # Параметры MLflow из переменных окружения
-    mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
-    mlflow_experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", "ml-project")
-
-    # Настройка MLflow
-    mlflow.set_tracking_uri(mlflow_tracking_uri)
-    mlflow.set_experiment(mlflow_experiment_name)
-
-    try:
-        # Загрузка тестовых данных из S3 или локально
-        test_data_path = os.path.join(config["data"]["processed_path"], config["data"]["test_file"])
-        test_target_path = test_data_path.replace(".csv", "_target.csv")
-
-        if s3_bucket:
-            X_test, y_test = load_processed_data_from_s3(
-                s3_bucket,
-                s3_processed_prefix + config["data"]["test_file"],
-                s3_processed_prefix + config["data"]["test_file"].replace(".csv", "_target.csv"),
-                aws_access_key_id,
-                aws_secret_access_key,
-            )
-        else:
-            X_test, y_test = load_processed_data(test_data_path, test_target_path)
-
-        # Загрузка обученной модели из S3 или локально
-        model_path = "models/model.pkl"
-        if s3_bucket:
-            model = load_model_from_s3(s3_bucket, "models/model.pkl", aws_access_key_id, aws_secret_access_key)
-        else:
-            model = load_model(model_path)
-
-        # Оценка модели
-        test_metrics, y_pred = evaluate_model(model, X_test, y_test)
-        logger.info(f"Тестовые метрики: {test_metrics}")
-
-        # Сохранение метрик
-        save_metrics(test_metrics, "metrics.json")
-
-        # Построение матрицы ошибок
-        plot_confusion_matrix(y_test, y_pred, save_path="reports/figures/confusion_matrix.png")
-
-        # Загрузка графиков и метрик в S3, если указан bucket
-        if s3_bucket:
-            upload_to_s3(
-                "reports/figures/confusion_matrix.png",
-                s3_bucket,
-                "reports/figures/confusion_matrix.png",
-                aws_access_key_id,
-                aws_secret_access_key,
-            )
-            upload_to_s3("metrics.json", s3_bucket, "metrics.json", aws_access_key_id, aws_secret_access_key)
-
-        # Логирование в MLflow, если URI указан
-        if mlflow_tracking_uri:
-            # Получение активного эксперимента
-            experiment = mlflow.get_experiment_by_name(mlflow_experiment_name)
-            if experiment:
-                # Поиск последнего запуска
-                runs = mlflow.search_runs(
-                    experiment_ids=[experiment.experiment_id], order_by=["start_time DESC"], max_results=1
-                )
-                if not runs.empty:
-                    run_id = runs.iloc[0]["run_id"]
-                    with mlflow.start_run(run_id=run_id):
-                        # Логирование метрик
-                        mlflow.log_metrics(test_metrics)
-
-                        # Логирование артефактов
-                        mlflow.log_artifact("reports/figures/confusion_matrix.png")
-                        mlflow.log_artifact("metrics.json")
-
-        logger.info("Оценка модели успешно завершена")
-
-    except Exception as e:
-        logger.error(f"Ошибка в оценке модели: {str(e)}")
-        raise
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/models/train.py b/src/models/train.py
deleted file mode 100644
index c657523..0000000
--- a/src/models/train.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import json
-import logging
-import os
-from io import StringIO
-
-import boto3
-import joblib
-import mlflow
-import mlflow.sklearn
-import numpy as np
-import pandas as pd
-import yaml
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
-
-# Настройка логирования
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def load_processed_data_from_s3(bucket_name, data_key, target_key, aws_access_key_id=None, aws_secret_access_key=None):
-    """Загрузка обработанных данных и целевых переменных из S3"""
-    logger.info(f"Загрузка обработанных данных из s3://{bucket_name}/{data_key} и s3://{bucket_name}/{target_key}")
-
-    # Создание клиента S3
-    if aws_access_key_id and aws_secret_access_key:
-        s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
-    else:
-        # Используем IAM роли или credentials из окружения
-        s3 = boto3.client("s3")
-
-    try:
-        # Загрузка данных
-        data_response = s3.get_object(Bucket=bucket_name, Key=data_key)
-        data_content = data_response["Body"].read().decode("utf-8")
-        X = pd.read_csv(StringIO(data_content))
-
-        # Загрузка целевых переменных
-        target_response = s3.get_object(Bucket=bucket_name, Key=target_key)
-        target_content = target_response["Body"].read().decode("utf-8")
-        y = pd.read_csv(StringIO(target_content))["target"]
-
-        return X, y
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке данных из S3: {str(e)}")
-        raise
-
-
-def load_processed_data(data_path, target_path):
-    """Загрузка обработанных данных и целевых переменных (локально)"""
-    logger.info(f"Загрузка обработанных данных из {data_path}")
-    X = pd.read_csv(data_path)
-    y = pd.read_csv(target_path)["target"]
-    return X, y
-
-
-def get_model(model_name, model_params):
-    """Инициализация модели на основе конфигурации"""
-    logger.info(f"Инициализация модели {model_name}")
-
-    if model_name.lower() == "random_forest":
-        return RandomForestClassifier(**model_params)
-    elif model_name.lower() == "logistic_regression":
-        return LogisticRegression(**model_params)
-    else:
-        raise ValueError(f"Неподдерживаемый тип модели: {model_name}")
-
-
-def train_model(model, X_train, y_train):
-    """Обучение модели"""
-    logger.info("Обучение модели")
-    model.fit(X_train, y_train)
-    return model
-
-
-def save_model(model, model_path):
-    """Сохранение обученной модели на диск"""
-    logger.info(f"Сохранение модели в {model_path}")
-    os.makedirs(os.path.dirname(model_path), exist_ok=True)
-    joblib.dump(model, model_path)
-
-
-def evaluate_model(model, X_test, y_test):
-    """Оценка производительности модели"""
-    logger.info("Оценка модели")
-    y_pred = model.predict(X_test)
-
-    metrics = {
-        "accuracy": accuracy_score(y_test, y_pred),
-        "precision": precision_score(y_test, y_pred, average="weighted"),
-        "recall": recall_score(y_test, y_pred, average="weighted"),
-        "f1_score": f1_score(y_test, y_pred, average="weighted"),
-    }
-
-    return metrics
-
-
-def save_metrics(metrics, metrics_path):
-    """Сохранение метрик в JSON файл"""
-    logger.info(f"Сохранение метрик в {metrics_path}")
-    with open(metrics_path, "w") as f:
-        json.dump(metrics, f, indent=2)
-
-
-def upload_to_s3(local_file_path, bucket_name, s3_key, aws_access_key_id=None, aws_secret_access_key=None):
-    """Загрузка файла в S3"""
-    logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}")
-
-    # Создание клиента S3
-    if aws_access_key_id and aws_secret_access_key:
-        s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
-    else:
-        # Используем IAM роли или credentials из окружения
-        s3 = boto3.client("s3")
-
-    # Загрузка файла
-    try:
-        s3.upload_file(local_file_path, bucket_name, s3_key)
-        logger.info(f"Файл успешно загружен в S3: s3://{bucket_name}/{s3_key}")
-    except Exception as e:
-        logger.error(f"Ошибка при загрузке файла в S3: {str(e)}")
-        raise
-
-
-def main():
-    """Главная функция для обучения модели"""
-    # Загрузка конфигурации
-    with open("configs/config.yaml", "r") as f:
-        config = yaml.safe_load(f)
-
-    # Для обратной совместимости будем использовать config для всех параметров
-    params = config
-
-    # Параметры S3 из переменных окружения
-    s3_bucket = os.getenv("S3_BUCKET")
-    s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/")
-    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
-    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
-
-    # Параметры MLflow из переменных окружения
-    mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
-    mlflow_experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", "ml-project")
-
-    # Настройка MLflow
-    mlflow.set_tracking_uri(mlflow_tracking_uri)
-    mlflow.set_experiment(mlflow_experiment_name)
-
-    try:
-        # Загрузка обработанных данных из S3 или локально
-        train_data_path = os.path.join(config["data"]["processed_path"], config["data"]["train_file"])
-        train_target_path = train_data_path.replace(".csv", "_target.csv")
-
-        if s3_bucket:
-            X_train, y_train = load_processed_data_from_s3(
-                s3_bucket,
-                s3_processed_prefix + config["data"]["train_file"],
-                s3_processed_prefix + config["data"]["train_file"].replace(".csv", "_target.csv"),
-                aws_access_key_id,
-                aws_secret_access_key,
-            )
-        else:
-            X_train, y_train = load_processed_data(train_data_path, train_target_path)
-
-        # Начало эксперимента MLflow
-        with mlflow.start_run():
-            # Логирование параметров
-            mlflow.log_params(
-                {
-                    "model_name": params["model"]["name"],
-                    "test_size": params["data"]["test_size"],
-                    "random_state": params["data"]["random_state"],
-                }
-            )
-
-            # Логирование гиперпараметров модели
-            for param, value in params["model"]["params"].items():
-                mlflow.log_param(f"model_{param}", value)
-
-            # Инициализация модели
-            model = get_model(params["model"]["name"], params["model"]["params"])
-
-            # Обучение модели
-            trained_model = train_model(model, X_train, y_train)
-
-            # Сохранение модели
-            model_path = "models/model.pkl"
-            save_model(trained_model, model_path)
-
-            # Загрузка модели в S3, если указан bucket
-            if s3_bucket:
-                upload_to_s3(model_path, s3_bucket, "models/model.pkl", aws_access_key_id, aws_secret_access_key)
-
-            # Оценка на обучающем наборе (для демонстрации)
-            train_metrics = evaluate_model(trained_model, X_train, y_train)
-            logger.info(f"Метрики обучения: {train_metrics}")
-
-            # Логирование метрик в MLflow
-            mlflow.log_metrics(train_metrics)
-
-            # Сохранение метрик локально
-            save_metrics(train_metrics, "metrics.json")
-
-            # Логирование артефактов модели в MLflow
-            mlflow.sklearn.log_model(trained_model, "model")
-
-            # Логирование файла метрик как артефакт
-            mlflow.log_artifact("metrics.json")
-
-        logger.info("Обучение модели успешно завершено")
-
-    except Exception as e:
-        logger.error(f"Ошибка в обучении модели: {str(e)}")
-        raise
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..9690669
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,15 @@
+from .useful_functions import extract_coordinates, move_and_remove_files, merge_tables_with_tolerance, levenshtein_distance
+from .s3 import s3_download_file, get_s3_client, s3_list_files, s3_upload_file
+from .zip import extract_zip_advanced
+
+__all__ = [
+    "extract_coordinates",
+    "move_and_remove_files",
+    "merge_tables_with_tolerance",
+    "levenshtein_distance",
+    "s3_download_file",
+    "get_s3_client",
+    "s3_list_files",
+    "s3_upload_file",
+    "extract_zip_advanced",
+]
diff --git a/src/utils/useful_functions.py b/src/utils/useful_functions.py
index 6e27b5c..5e0850e 100644
--- a/src/utils/useful_functions.py
+++ b/src/utils/useful_functions.py
@@ -1,7 +1,10 @@
+import collections
+import itertools
 import os
 import re
 import shutil
 import zipfile
+from itertools import zip_longest
 from math import atan2, cos, radians, sin, sqrt
 from pathlib import Path
 
@@ -38,16 +41,32 @@ def extract_coordinates(coord_string):
     return None, None
 
 
-def merge_tables_with_tolerance(target, 
-                                real_data, 
-                                target_lat_name: str = 'latitude',
-                                target_lot_name: str = 'longitude',
-                                real_data_lat_name: str = 'latitude',
-                                real_data_lot_name: str = 'longitude',
-                                max_distance_meters=100):
+def merge_tables_with_tolerance(
+    target,
+    real_data,
+    target_lat_name="latitude",
+    target_lot_name="longitude",
+    real_data_lat_name="latitude",
+    real_data_lot_name="longitude",
+    max_distance_meters=100,
+):
+    # Проверка существования колонок
+    if target_lat_name not in target.columns:
+        raise ValueError(f"Колонка {target_lat_name} не найдена в target")
+    if target_lot_name not in target.columns:
+        raise ValueError(f"Колонка {target_lot_name} не найдена в target")
+    if real_data_lat_name not in real_data.columns:
+        raise ValueError(f"Колонка {real_data_lat_name} не найдена в real_data")
+    if real_data_lot_name not in real_data.columns:
+        raise ValueError(f"Колонка {real_data_lot_name} не найдена в real_data")
+
     # Переименование колонок
-    df1 = target.rename(columns={target.columns[0]: "filename", target.columns[target_lat_name]: "lat_target", target.columns[target_lot_name]: "lon_target"})
-    df2 = real_data.rename(columns={real_data.columns[0]: "camera_id", real_data.columns[real_data_lat_name]: "lat_real", real_data.columns[real_data_lot_name]: "lon_real"})
+    df1 = target.rename(
+        columns={target.columns[0]: "filename", target_lat_name: "lat_target", target_lot_name: "lon_target"}
+    )
+    df2 = real_data.rename(
+        columns={real_data.columns[0]: "camera_id", real_data_lat_name: "lat_real", real_data_lot_name: "lon_real"}
+    )
 
     # Преобразование координат в радианы для сферического расстояния
     coords1 = np.radians(df1[["lat_target", "lon_target"]].values)
@@ -73,3 +92,21 @@ def merge_tables_with_tolerance(target,
     result = result[result["distance_m"] <= max_distance_meters].sort_values("distance_m")
 
     return result.reset_index(drop=True)
+
+
+def levenshtein_distance(string1, string2):
+    """
+    Return the Levenshtein distance: the minimum number of insertions, deletions and substitutions.
+
+    >>> levenshtein_distance('AATZ', 'AAAZ')
+    1
+    >>> levenshtein_distance('AATZZZ', 'AAAZ')
+    3
+    """
+    previous = list(range(len(string2) + 1))  # distances from "" to every prefix of string2
+    for i, a in enumerate(string1, start=1):
+        current = [i]  # turning the first i items of string1 into "" takes i deletions
+        for j, b in enumerate(string2, start=1):
+            current.append(min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + (a != b)))
+        previous = current
+    return previous[-1]