diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore index 6c10c35..0e991d3 100644 --- a/.gitignore +++ b/.gitignore @@ -55,10 +55,9 @@ MANIFEST mlruns/ # Data -data data/* # Cookiecutter hack_digital_transformation/ uv.lock -.coverage \ No newline at end of file +.coverage diff --git a/job_config.yaml b/job_config.yaml index ae1851d..59111d0 100644 --- a/job_config.yaml +++ b/job_config.yaml @@ -1,20 +1,24 @@ -name: digital_hack_ml_job # Уникальное имя задания -desc: "Джоба для проведения экспериментов по созданию CV модели для определения местоположения по фотографии" # Описание +name: digital_hack_ml_job +desc: "Джоба для проведения экспериментов по созданию CV модели для определения местоположения по фотографии" -# Команда для запуска. 
${VARIABLE} - шаблоны, заменяемые на значения из `inputs` -cmd: python src/engine/main.py #--input ${INPUT_FILE} +# Команда для запуска +cmd: python src/engine/main.py --csv-path csv_file_mob --images-dir images_dir_dzk/ -# передать в DataSphere код и зависимости pip +# Используйте python: auto для автоматического определения версии Python env: python: auto -# Входные данные (файлы или каталоги) -#inputs: -# - input_data.txt: INPUT_FILE +# Входные данные (обновите алиасы) +inputs: + - data/processed_data/merged_data.csv: csv_file_mob + - data/raw_data/data/metadata/INC/united_image: images_dir_dzk + - src/models: models + - src/utils: utils -# Выходные данные (файлы или каталоги, которые вернутся с облака) -#outputs: -# - output_results.zip -# - logs.txt +# Выходные данные +outputs: + - optuna_study.pkl: optuna_study + - ocr_model_params.json: ocr_model_params + - test_results.json: test_results cloud-instance-type: gt4.1 \ No newline at end of file diff --git a/notebooks/1_data_exploration/1_1_download_data.ipynb b/notebooks/1_data_exploration/1_1_download_data.ipynb index 430accf..2f15849 100644 --- a/notebooks/1_data_exploration/1_1_download_data.ipynb +++ b/notebooks/1_data_exploration/1_1_download_data.ipynb @@ -615,6 +615,56 @@ " file_s3_dst='processed_data/merge_data.csv',\n", " bucket_name='s3-dvc',)" ] + }, + { + "cell_type": "markdown", + "id": "64d2d0d0", + "metadata": {}, + "source": [ + "# Перемещаем все фотки в единую папку " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f56653", + "metadata": {}, + "outputs": [], + "source": [ + "os.mkdir(ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35417e5a", + "metadata": {}, + "outputs": [], + "source": [ + "move_and_remove_files(source_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25', \n", + " destination_dir=ROOT_DIR / 
'data/raw_data/data/metadata/INC/united_image',\n", + " remove_after_move=True,)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b123a07e", + "metadata": {}, + "outputs": [], + "source": [ + "move_and_remove_files(source_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/19-001_gin_garbage_echd_19.08.25', \n", + " destination_dir=ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image',\n", + " remove_after_move=True,)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b193a00", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/1_data_exploration/1_2_prepare_data.ipynb b/notebooks/1_data_exploration/1_2_prepare_data.ipynb index 605b691..7956bd0 100644 --- a/notebooks/1_data_exploration/1_2_prepare_data.ipynb +++ b/notebooks/1_data_exploration/1_2_prepare_data.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "97feaebc", "metadata": {}, "outputs": [], @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "cd10fe31", "metadata": {}, "outputs": [], @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "e7ef1558", "metadata": {}, "outputs": [], @@ -100,8 +100,8 @@ "source": [ "# Создание датасета с автоматическим разделением\n", "dataset = PrepareData(\n", - " excel_path=ROOT_DIR / \"data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25.xlsx\",\n", - " images_dir=ROOT_DIR / \"data/raw_data/data/metadata/INC/18-001_gin_building_echd_19.08.25/\",\n", + " csv_path=ROOT_DIR / \"data/processed_data/merged_data.csv\",\n", + " images_dir=ROOT_DIR / \"data/raw_data/data/metadata/INC/united_image/\",\n", " test_size=0.2,\n", " random_state=42\n", ")\n", @@ -128,7 +128,18 @@ "execution_count": null, "id": "5fb26062", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Best ROI: left_bottom | conf=0.93\n", + "joined: MMChdUZAO_112301\n", + "norm : MMC_hd_UZAO_112301\n", + "final : MMC_hd_UZAO_1123_0_1\n" + ] + } + ], "source": [ "import cv2\n", "import numpy as np\n", @@ -329,7 +340,7 @@ "# ===================== пример использования =====================\n", "\n", "if __name__ == \"__main__\":\n", - " IMG = r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/19-001_gin_garbage_echd_19.08.25/ffccf36c-075a-43d9-8570-a01f3afcaf76.jpg'\n", + " IMG = r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/united_image/0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'\n", "\n", " ocr = OverlayOCR(\n", " langs=['en'], # ['en','ru'] если нужна кириллица\n", @@ -353,63 +364,284 @@ { "cell_type": "code", "execution_count": null, - "id": "d56cc01f", + "id": "99b46a62", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "299610c2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2025-09-25 20:53:37,368] A new study created in memory with name: no-name-ad852449-0235-45d1-a60b-e535dcbb5227\n", + "[I 2025-09-25 20:53:47,455] Trial 0 finished with value: 3.0 and parameters: {'gap_mult': 2.8068367308375892, 'canvas_size': 3600, 'mag_ratio': 2.8965996021592537, 'add_margin': 0.48089093342192746, 'text_threshold': 0.30768847754184525, 'low_text': 0.5186617904372725, 'link_threshold': 0.3295188034326136}. Best is trial 0 with value: 3.0.\n", + "[I 2025-09-25 20:53:56,705] Trial 1 finished with value: 3.0 and parameters: {'gap_mult': 0.4616970821705757, 'canvas_size': 7200, 'mag_ratio': 3.9151446432090062, 'add_margin': 0.29848539656963197, 'text_threshold': 0.7381783711907166, 'low_text': 0.6797473138005795, 'link_threshold': 0.31919910593867973}. 
Best is trial 0 with value: 3.0.\n", + "[I 2025-09-25 20:54:00,930] Trial 2 finished with value: 1.0 and parameters: {'gap_mult': 2.95068046948933, 'canvas_size': 1800, 'mag_ratio': 1.6335501479294163, 'add_margin': 0.23019075051561422, 'text_threshold': 0.5364046793357755, 'low_text': 0.14346526109875232, 'link_threshold': 0.4549901859491947}. Best is trial 2 with value: 1.0.\n", + "[I 2025-09-25 20:54:03,561] Trial 3 finished with value: 2.0 and parameters: {'gap_mult': 0.7098592466029646, 'canvas_size': 1800, 'mag_ratio': 4.337665270375808, 'add_margin': 0.2092324992312398, 'text_threshold': 0.5999943848050431, 'low_text': 0.5505604402067924, 'link_threshold': 0.3594102577222953}. Best is trial 2 with value: 1.0.\n", + "[I 2025-09-25 20:54:11,619] Trial 4 finished with value: 8.0 and parameters: {'gap_mult': 2.6082037341148663, 'canvas_size': 10000, 'mag_ratio': 3.858049154855571, 'add_margin': 0.023438922614506454, 'text_threshold': 0.7258377412928896, 'low_text': 0.3718230280903212, 'link_threshold': 0.41824734965498844}. Best is trial 2 with value: 1.0.\n", + "[I 2025-09-25 20:54:14,502] Trial 5 finished with value: 6.0 and parameters: {'gap_mult': 2.6706992390768276, 'canvas_size': 3600, 'mag_ratio': 3.235255263782191, 'add_margin': 0.26475654378958785, 'text_threshold': 0.4536236374494962, 'low_text': 0.5543332082573351, 'link_threshold': 0.28821655261683743}. Best is trial 2 with value: 1.0.\n", + "[I 2025-09-25 20:54:17,267] Trial 6 finished with value: 2.0 and parameters: {'gap_mult': 0.582283392877384, 'canvas_size': 7200, 'mag_ratio': 2.7611406531446723, 'add_margin': 0.38970235970730355, 'text_threshold': 0.7295818110214943, 'low_text': 0.6309958719665154, 'link_threshold': 0.1740186779693806}. 
Best is trial 2 with value: 1.0.\n", + "[I 2025-09-25 20:54:19,088] Trial 7 finished with value: 0.0 and parameters: {'gap_mult': 0.1148899697656891, 'canvas_size': 1800, 'mag_ratio': 2.0909813860745077, 'add_margin': 0.38880953861561235, 'text_threshold': 0.5046870997550761, 'low_text': 0.2758043304152779, 'link_threshold': 0.4121455607297143}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:21,489] Trial 8 finished with value: 6.0 and parameters: {'gap_mult': 0.306489544901438, 'canvas_size': 3600, 'mag_ratio': 2.431735944829139, 'add_margin': 0.12216310769525666, 'text_threshold': 0.4190839052600482, 'low_text': 0.27219395882445296, 'link_threshold': 0.37805487357277634}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:23,559] Trial 9 finished with value: 6.0 and parameters: {'gap_mult': 0.7405645718677436, 'canvas_size': 1800, 'mag_ratio': 2.7070172489585387, 'add_margin': 0.4140575014567742, 'text_threshold': 0.6300710361851376, 'low_text': 0.33516922393069803, 'link_threshold': 0.17953131386104623}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:25,982] Trial 10 finished with value: 0.0 and parameters: {'gap_mult': 1.4093994989267604, 'canvas_size': 10000, 'mag_ratio': 1.711106955826645, 'add_margin': 0.34762496219028105, 'text_threshold': 0.3909935170351837, 'low_text': 0.11283459845938454, 'link_threshold': 0.6099373327834569}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:27,935] Trial 11 finished with value: 0.0 and parameters: {'gap_mult': 1.4792872699039796, 'canvas_size': 10000, 'mag_ratio': 1.5223010354107902, 'add_margin': 0.33563162563951027, 'text_threshold': 0.37565955475152524, 'low_text': 0.1168980126968841, 'link_threshold': 0.6325641683059166}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:30,647] Trial 12 finished with value: 0.0 and parameters: {'gap_mult': 1.363569314546833, 'canvas_size': 10000, 'mag_ratio': 2.1897337958053624, 'add_margin': 0.469594695826354, 'text_threshold': 0.48388464532737874, 'low_text': 0.21658316372857112, 'link_threshold': 0.5597509755711151}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:32,654] Trial 13 finished with value: 0.0 and parameters: {'gap_mult': 2.0469309820631105, 'canvas_size': 1800, 'mag_ratio': 2.006596797142189, 'add_margin': 0.3626671511083453, 'text_threshold': 0.36655356940398864, 'low_text': 0.22865266846485077, 'link_threshold': 0.5216438095511898}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:37,208] Trial 14 finished with value: 6.0 and parameters: {'gap_mult': 1.1069357731698148, 'canvas_size': 10000, 'mag_ratio': 1.8969889704811436, 'add_margin': 0.42390499933071407, 'text_threshold': 0.5157168099224722, 'low_text': 0.10164163381677017, 'link_threshold': 0.6704962998573097}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:40,128] Trial 15 finished with value: 2.0 and parameters: {'gap_mult': 1.8996243187385848, 'canvas_size': 1800, 'mag_ratio': 4.870066126775307, 'add_margin': 0.32483866469189737, 'text_threshold': 0.585824466151432, 'low_text': 0.4191998906943226, 'link_threshold': 0.5162837033151318}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:42,649] Trial 16 finished with value: 6.0 and parameters: {'gap_mult': 0.15078914260503726, 'canvas_size': 10000, 'mag_ratio': 2.336137777441351, 'add_margin': 0.15215492650074974, 'text_threshold': 0.41400283625495554, 'low_text': 0.18781709662258905, 'link_threshold': 0.5934394365460154}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:44,730] Trial 17 finished with value: 8.0 and parameters: {'gap_mult': 2.013170127993129, 'canvas_size': 7200, 'mag_ratio': 1.8268150446726152, 'add_margin': 0.448717840682068, 'text_threshold': 0.31527561348760785, 'low_text': 0.30206312727673806, 'link_threshold': 0.4580136822784667}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:50,832] Trial 18 finished with value: 7.0 and parameters: {'gap_mult': 1.059265015642122, 'canvas_size': 10000, 'mag_ratio': 3.336471276803885, 'add_margin': 0.282447622846455, 'text_threshold': 0.45135572176395305, 'low_text': 0.4253467296918087, 'link_threshold': 0.26399052554383395}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:53,151] Trial 19 finished with value: 6.0 and parameters: {'gap_mult': 2.381889433799428, 'canvas_size': 1800, 'mag_ratio': 2.455352248219552, 'add_margin': 0.3617385824967006, 'text_threshold': 0.5375199556741124, 'low_text': 0.17247623825345915, 'link_threshold': 0.5205447782602558}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:54:58,417] Trial 20 finished with value: 0.0 and parameters: {'gap_mult': 1.7314693373323178, 'canvas_size': 10000, 'mag_ratio': 3.1603467746819867, 'add_margin': 0.17358331235688695, 'text_threshold': 0.6662549037510657, 'low_text': 0.27133518926858646, 'link_threshold': 0.6857614766409486}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:00,525] Trial 21 finished with value: 0.0 and parameters: {'gap_mult': 1.4760274400451803, 'canvas_size': 10000, 'mag_ratio': 1.505120282497312, 'add_margin': 0.33500238184214015, 'text_threshold': 0.3640023294357934, 'low_text': 0.10303266997568411, 'link_threshold': 0.6226069867707704}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:02,569] Trial 22 finished with value: 3.0 and parameters: {'gap_mult': 1.0719868998063475, 'canvas_size': 10000, 'mag_ratio': 1.5514445085936461, 'add_margin': 0.38165340541644804, 'text_threshold': 0.36735195851134994, 'low_text': 0.15146302969794537, 'link_threshold': 0.6318307720851756}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:04,977] Trial 23 finished with value: 0.0 and parameters: {'gap_mult': 1.2827856065009335, 'canvas_size': 10000, 'mag_ratio': 2.0522093394602043, 'add_margin': 0.33364441087307467, 'text_threshold': 0.4009411598080816, 'low_text': 0.22636549446097454, 'link_threshold': 0.6997776020712903}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:07,408] Trial 24 finished with value: 6.0 and parameters: {'gap_mult': 1.688073858124835, 'canvas_size': 10000, 'mag_ratio': 1.8147929044605347, 'add_margin': 0.41187921882338346, 'text_threshold': 0.48018628231507815, 'low_text': 0.14505741985334714, 'link_threshold': 0.5725158142189293}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:12,225] Trial 25 finished with value: 3.0 and parameters: {'gap_mult': 2.2273244859507426, 'canvas_size': 1800, 'mag_ratio': 2.1619788064541146, 'add_margin': 0.29680817784938923, 'text_threshold': 0.34280209793757394, 'low_text': 0.3598483802527688, 'link_threshold': 0.47532698356188297}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:14,578] Trial 26 finished with value: 2.0 and parameters: {'gap_mult': 1.6266005039888947, 'canvas_size': 7200, 'mag_ratio': 1.8054129677884196, 'add_margin': 0.4967591135107159, 'text_threshold': 0.40062455240630057, 'low_text': 0.27142466964041806, 'link_threshold': 0.6268694818093563}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:17,799] Trial 27 finished with value: 0.0 and parameters: {'gap_mult': 0.8716046775322024, 'canvas_size': 3600, 'mag_ratio': 2.591535568961233, 'add_margin': 0.23398228930290385, 'text_threshold': 0.45887982369469826, 'low_text': 0.4700297698310453, 'link_threshold': 0.40555964970795816}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:20,157] Trial 28 finished with value: 3.0 and parameters: {'gap_mult': 0.12634067763172063, 'canvas_size': 10000, 'mag_ratio': 1.687707077278028, 'add_margin': 0.44168182474903117, 'text_threshold': 0.5678991209026768, 'low_text': 0.19168263433456684, 'link_threshold': 0.4849722942730099}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:22,844] Trial 29 finished with value: 0.0 and parameters: {'gap_mult': 0.917887107858529, 'canvas_size': 3600, 'mag_ratio': 2.982888945913079, 'add_margin': 0.3144487437683528, 'text_threshold': 0.32134529399419554, 'low_text': 0.11131753239257092, 'link_threshold': 0.6478445771500324}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:24,929] Trial 30 finished with value: 6.0 and parameters: {'gap_mult': 1.2354572298871604, 'canvas_size': 1800, 'mag_ratio': 2.192116830092118, 'add_margin': 0.3752636196109105, 'text_threshold': 0.49763505040728695, 'low_text': 0.3099171209105759, 'link_threshold': 0.5630321790923893}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:27,347] Trial 31 finished with value: 0.0 and parameters: {'gap_mult': 1.4152985716317703, 'canvas_size': 10000, 'mag_ratio': 2.271704485180925, 'add_margin': 0.46402968781748494, 'text_threshold': 0.43744132250758155, 'low_text': 0.22150449012133064, 'link_threshold': 0.5866215734802223}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:29,694] Trial 32 finished with value: 3.0 and parameters: {'gap_mult': 1.3470046288290496, 'canvas_size': 10000, 'mag_ratio': 2.019964977098176, 'add_margin': 0.4809199347109301, 'text_threshold': 0.48927782801688746, 'low_text': 0.2061179495820208, 'link_threshold': 0.5576719211378318}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:31,811] Trial 33 finished with value: 3.0 and parameters: {'gap_mult': 0.42628313834266474, 'canvas_size': 10000, 'mag_ratio': 1.5404390532162884, 'add_margin': 0.40748830793442736, 'text_threshold': 0.38476933858540535, 'low_text': 0.1395917836080438, 'link_threshold': 0.6056158927267469}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:34,106] Trial 34 finished with value: 6.0 and parameters: {'gap_mult': 1.5326447367463771, 'canvas_size': 10000, 'mag_ratio': 1.72870248378863, 'add_margin': 0.34528521463794626, 'text_threshold': 0.5150794337111221, 'low_text': 0.24899664426731927, 'link_threshold': 0.6577784494576011}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:46,792] Trial 35 finished with value: 3.0 and parameters: {'gap_mult': 1.8110834294493454, 'canvas_size': 7200, 'mag_ratio': 3.6702375016738, 'add_margin': 0.4364732116125293, 'text_threshold': 0.537895085676295, 'low_text': 0.16136650600797486, 'link_threshold': 0.5402750834526241}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:50,265] Trial 36 finished with value: 6.0 and parameters: {'gap_mult': 1.2219988072354946, 'canvas_size': 10000, 'mag_ratio': 2.482952487867, 'add_margin': 0.26693338583603277, 'text_threshold': 0.4302885444872687, 'low_text': 0.13132228009588398, 'link_threshold': 0.32944012665242317}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:55:52,689] Trial 37 finished with value: 0.0 and parameters: {'gap_mult': 2.38129567347274, 'canvas_size': 1800, 'mag_ratio': 2.8968484899989493, 'add_margin': 0.4686449723543468, 'text_threshold': 0.302028768728002, 'low_text': 0.19115299871272867, 'link_threshold': 0.37876224726940994}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:09,392] Trial 38 finished with value: 6.0 and parameters: {'gap_mult': 2.8763678218532216, 'canvas_size': 10000, 'mag_ratio': 4.438316910484946, 'add_margin': 0.29829477938301685, 'text_threshold': 0.4700875822689006, 'low_text': 0.25353183963236203, 'link_threshold': 0.4213216239998583}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:12,401] Trial 39 finished with value: 2.0 and parameters: {'gap_mult': 0.6026962750957594, 'canvas_size': 3600, 'mag_ratio': 2.0123631514680036, 'add_margin': 0.3923298820411264, 'text_threshold': 0.5656381172012847, 'low_text': 0.6985243561080292, 'link_threshold': 0.44513454191041124}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:15,008] Trial 40 finished with value: 6.0 and parameters: {'gap_mult': 1.552605334791691, 'canvas_size': 1800, 'mag_ratio': 2.1836186583312256, 'add_margin': 0.24118715165087418, 'text_threshold': 0.34225003730198067, 'low_text': 0.30669141039365844, 'link_threshold': 0.29277314219252115}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:19,676] Trial 41 finished with value: 0.0 and parameters: {'gap_mult': 2.032777215476117, 'canvas_size': 1800, 'mag_ratio': 1.9646027862079287, 'add_margin': 0.36009363571123254, 'text_threshold': 0.36392934369845953, 'low_text': 0.22829120356983643, 'link_threshold': 0.5138704932472115}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:21,678] Trial 42 finished with value: 1.0 and parameters: {'gap_mult': 2.176387089985841, 'canvas_size': 1800, 'mag_ratio': 1.7019410486508, 'add_margin': 0.3540410973587066, 'text_threshold': 0.38904691450406964, 'low_text': 0.173755568533145, 'link_threshold': 0.5359395965294611}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:24,049] Trial 43 finished with value: 0.0 and parameters: {'gap_mult': 1.9814160602074264, 'canvas_size': 1800, 'mag_ratio': 2.6618241803684515, 'add_margin': 0.0866096719455687, 'text_threshold': 0.3297646130497309, 'low_text': 0.12249114396162103, 'link_threshold': 0.4813419467114378}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:26,529] Trial 44 finished with value: 6.0 and parameters: {'gap_mult': 2.413259320694561, 'canvas_size': 1800, 'mag_ratio': 1.9177687920847541, 'add_margin': 0.31469920463221435, 'text_threshold': 0.4377968866124756, 'low_text': 0.3696136534062048, 'link_threshold': 0.607201945666187}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:29,283] Trial 45 finished with value: 0.0 and parameters: {'gap_mult': 2.64463349486065, 'canvas_size': 7200, 'mag_ratio': 2.3460275763329173, 'add_margin': 0.39700866313058947, 'text_threshold': 0.3774153417241256, 'low_text': 0.24211527857587653, 'link_threshold': 0.3648129541396672}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:31,494] Trial 46 finished with value: 3.0 and parameters: {'gap_mult': 1.836878721870324, 'canvas_size': 1800, 'mag_ratio': 2.126890360324983, 'add_margin': 0.20726513840245164, 'text_threshold': 0.4137371364867357, 'low_text': 0.3342805619353377, 'link_threshold': 0.3978205666085285}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:33,911] Trial 47 finished with value: 2.0 and parameters: {'gap_mult': 0.9236009105572118, 'canvas_size': 10000, 'mag_ratio': 1.7057803405365435, 'add_margin': 0.36889361538736837, 'text_threshold': 0.35131987552000854, 'low_text': 0.6331189497946297, 'link_threshold': 0.5031830624015114}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:36,036] Trial 48 finished with value: 0.0 and parameters: {'gap_mult': 0.7075348045987095, 'canvas_size': 1800, 'mag_ratio': 1.8754699231214058, 'add_margin': 0.4201108898173849, 'text_threshold': 0.6401314045523575, 'low_text': 0.20134221483850864, 'link_threshold': 0.5473098961171385}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:43,775] Trial 49 finished with value: 6.0 and parameters: {'gap_mult': 1.4055293711987713, 'canvas_size': 10000, 'mag_ratio': 3.4829554961756273, 'add_margin': 0.2727130551864318, 'text_threshold': 0.5107363738699169, 'low_text': 0.2931375550010872, 'link_threshold': 0.5858997098239191}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:46,294] Trial 50 finished with value: 16.0 and parameters: {'gap_mult': 0.28271106449673233, 'canvas_size': 3600, 'mag_ratio': 1.6064784759584878, 'add_margin': 0.49696348828068293, 'text_threshold': 0.5542307215159156, 'low_text': 0.1614448436114824, 'link_threshold': 0.4435283721263239}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:56:54,785] Trial 51 finished with value: 6.0 and parameters: {'gap_mult': 1.6865773792850522, 'canvas_size': 10000, 'mag_ratio': 3.0544204754949122, 'add_margin': 0.1930166327695941, 'text_threshold': 0.6943934752754893, 'low_text': 0.2687102769555533, 'link_threshold': 0.6833448732377796}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:05,363] Trial 52 finished with value: 6.0 and parameters: {'gap_mult': 1.7335815942306727, 'canvas_size': 10000, 'mag_ratio': 4.0263794400950585, 'add_margin': 0.16928223850489543, 'text_threshold': 0.7483832969099119, 'low_text': 0.3418904310684578, 'link_threshold': 0.6450346895704159}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:24,354] Trial 53 finished with value: 9.0 and parameters: {'gap_mult': 2.0730357287645598, 'canvas_size': 10000, 'mag_ratio': 4.9661237759392405, 'add_margin': 0.06628189141235036, 'text_threshold': 0.6993694651671535, 'low_text': 0.39641751411068, 'link_threshold': 0.6913533032142294}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:29,650] Trial 54 finished with value: 6.0 and parameters: {'gap_mult': 1.8378208995218732, 'canvas_size': 10000, 'mag_ratio': 2.8058557292492172, 'add_margin': 0.14877145505589487, 'text_threshold': 0.6067678630149258, 'low_text': 0.28406293177241143, 'link_threshold': 0.6084234792788263}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:36,053] Trial 55 finished with value: 6.0 and parameters: {'gap_mult': 1.6077956791294892, 'canvas_size': 10000, 'mag_ratio': 3.184685723255018, 'add_margin': 0.33362136342078796, 'text_threshold': 0.6586094330040534, 'low_text': 0.47617920380466766, 'link_threshold': 0.6612731611178574}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:39,722] Trial 56 finished with value: 6.0 and parameters: {'gap_mult': 1.140093656057085, 'canvas_size': 10000, 'mag_ratio': 2.5390512016510867, 'add_margin': 0.43015222762309546, 'text_threshold': 0.45677922363281287, 'low_text': 0.22243443589468803, 'link_threshold': 0.6776769331383322}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:42,584] Trial 57 finished with value: 0.0 and parameters: {'gap_mult': 2.2173003288074886, 'canvas_size': 7200, 'mag_ratio': 2.30125078762444, 'add_margin': 0.31399758400455013, 'text_threshold': 0.407354067246686, 'low_text': 0.1741364232105632, 'link_threshold': 0.5703705499828124}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:57:45,367] Trial 58 finished with value: 6.0 and parameters: {'gap_mult': 1.4348361111451478, 'canvas_size': 1800, 'mag_ratio': 4.119267428393518, 'add_margin': 0.3921988549011919, 'text_threshold': 0.6108450546952728, 'low_text': 0.10297918293515232, 'link_threshold': 0.6423078633822412}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:05,731] Trial 59 finished with value: 19.0 and parameters: {'gap_mult': 1.3146190886260487, 'canvas_size': 10000, 'mag_ratio': 4.517561455513468, 'add_margin': 0.46035628899869246, 'text_threshold': 0.4989918282450292, 'low_text': 0.13611074435464954, 'link_threshold': 0.2370550150876778}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:08,652] Trial 60 finished with value: 0.0 and parameters: {'gap_mult': 2.4903957108815717, 'canvas_size': 10000, 'mag_ratio': 1.7991544479501766, 'add_margin': 0.28447853167616255, 'text_threshold': 0.43034949021994956, 'low_text': 0.32262981212166886, 'link_threshold': 0.6261847736969334}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:11,166] Trial 61 finished with value: 0.0 and parameters: {'gap_mult': 1.5420067502330739, 'canvas_size': 10000, 'mag_ratio': 1.6027903475171934, 'add_margin': 0.3483023224854118, 'text_threshold': 0.36393439736200833, 'low_text': 0.10422663391107893, 'link_threshold': 0.6208477423703296}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:13,672] Trial 62 finished with value: 3.0 and parameters: {'gap_mult': 1.9141280029919703, 'canvas_size': 10000, 'mag_ratio': 1.5213430371729078, 'add_margin': 0.3347560723493002, 'text_threshold': 0.3330972928489088, 'low_text': 0.12091536219081386, 'link_threshold': 0.6690028802757306}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:15,964] Trial 63 finished with value: 0.0 and parameters: {'gap_mult': 1.4423427725511269, 'canvas_size': 10000, 'mag_ratio': 1.5108889381954187, 'add_margin': 0.3806898775360643, 'text_threshold': 0.38740713862944387, 'low_text': 0.15382441727496243, 'link_threshold': 0.5866630175683626}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:18,364] Trial 64 finished with value: 6.0 and parameters: {'gap_mult': 1.734875836674064, 'canvas_size': 10000, 'mag_ratio': 1.7948695199875504, 'add_margin': 0.011307090053004093, 'text_threshold': 0.3507411762015381, 'low_text': 0.20246618057597243, 'link_threshold': 0.601242330820969}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:20,854] Trial 65 finished with value: 2.0 and parameters: {'gap_mult': 1.1757405169069743, 'canvas_size': 10000, 'mag_ratio': 2.086165783438007, 'add_margin': 0.2456882382996693, 'text_threshold': 0.4741634337377318, 'low_text': 0.2684493145468628, 'link_threshold': 0.6333830867101429}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:23,446] Trial 66 finished with value: 0.0 and parameters: {'gap_mult': 1.0583138290250695, 'canvas_size': 3600, 'mag_ratio': 1.9833204433007152, 'add_margin': 0.4078906425246266, 'text_threshold': 0.37507193446364606, 'low_text': 0.17773740261098023, 'link_threshold': 0.6550365183841861}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:25,700] Trial 67 finished with value: 0.0 and parameters: {'gap_mult': 2.1113828186904646, 'canvas_size': 1800, 'mag_ratio': 1.627737129670728, 'add_margin': 0.31184312655078883, 'text_threshold': 0.5238607673015194, 'low_text': 0.24788455119303154, 'link_threshold': 0.6997993963347023}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:28,419] Trial 68 finished with value: 0.0 and parameters: {'gap_mult': 0.9773717590997882, 'canvas_size': 10000, 'mag_ratio': 2.4010170219452776, 'add_margin': 0.21829639411230875, 'text_threshold': 0.30988261906742653, 'low_text': 0.1398668395141619, 'link_threshold': 0.5298214659650317}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:31,028] Trial 69 finished with value: 0.0 and parameters: {'gap_mult': 1.619967386716393, 'canvas_size': 7200, 'mag_ratio': 1.8845116539294495, 'add_margin': 0.37112199407447244, 'text_threshold': 0.42276720376960203, 'low_text': 0.21533980924102802, 'link_threshold': 0.5734968318657384}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:33,749] Trial 70 finished with value: 3.0 and parameters: {'gap_mult': 2.751326216417243, 'canvas_size': 10000, 'mag_ratio': 2.2278059789258613, 'add_margin': 0.4494222828148234, 'text_threshold': 0.4452982706384931, 'low_text': 0.11598893420937323, 'link_threshold': 0.49921300401957514}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:38,611] Trial 71 finished with value: 0.0 and parameters: {'gap_mult': 1.2993737247556747, 'canvas_size': 10000, 'mag_ratio': 2.043840714353784, 'add_margin': 0.3350083247295932, 'text_threshold': 0.35748588299055994, 'low_text': 0.2265551804542083, 'link_threshold': 0.6748472701551019}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:40,711] Trial 72 finished with value: 0.0 and parameters: {'gap_mult': 1.487951326924774, 'canvas_size': 10000, 'mag_ratio': 1.7554868354126691, 'add_margin': 0.28706797466207484, 'text_threshold': 0.4008671166219604, 'low_text': 0.23519385817676244, 'link_threshold': 0.6174831773618225}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:43,253] Trial 73 finished with value: 0.0 and parameters: {'gap_mult': 1.287024339527743, 'canvas_size': 10000, 'mag_ratio': 1.6619019790647513, 'add_margin': 0.35253473628188264, 'text_threshold': 0.3995679960639146, 'low_text': 0.18665460608037232, 'link_threshold': 0.6997429613770489}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:49,743] Trial 74 finished with value: 0.0 and parameters: {'gap_mult': 0.7917739839237641, 'canvas_size': 1800, 'mag_ratio': 3.691191180464256, 'add_margin': 0.256227708458774, 'text_threshold': 0.588628250867445, 'low_text': 0.15526564029661177, 'link_threshold': 0.6388558339492125}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:52,332] Trial 75 finished with value: 6.0 and parameters: {'gap_mult': 1.3836209109248103, 'canvas_size': 10000, 'mag_ratio': 1.9303637543024748, 'add_margin': 0.3266009581459893, 'text_threshold': 0.32642898614871146, 'low_text': 0.26056379346676267, 'link_threshold': 0.5581534261963809}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:54,961] Trial 76 finished with value: 2.0 and parameters: {'gap_mult': 1.2096224319499855, 'canvas_size': 1800, 'mag_ratio': 2.1238326363224926, 'add_margin': 0.3017380262152115, 'text_threshold': 0.37394001822801004, 'low_text': 0.28678593358242277, 'link_threshold': 0.6599286178042858}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:58:57,430] Trial 77 finished with value: 6.0 and parameters: {'gap_mult': 1.7802378509775458, 'canvas_size': 10000, 'mag_ratio': 1.8816946343259815, 'add_margin': 0.36388919602950753, 'text_threshold': 0.34090268515882893, 'low_text': 0.5479980945347005, 'link_threshold': 0.3892473209789098}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:00,036] Trial 78 finished with value: 6.0 and parameters: {'gap_mult': 1.6665504567091305, 'canvas_size': 3600, 'mag_ratio': 2.054080103730055, 'add_margin': 0.3410899908917761, 'text_threshold': 0.5498926207140523, 'low_text': 0.2100280220064789, 'link_threshold': 0.35186590062085316}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:02,646] Trial 79 finished with value: 6.0 and parameters: {'gap_mult': 2.2858125900796873, 'canvas_size': 10000, 'mag_ratio': 2.598654169175222, 'add_margin': 0.1096485806039654, 'text_threshold': 0.39632565711047857, 'low_text': 0.13232194942047554, 'link_threshold': 0.46478059695144425}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:04,819] Trial 80 finished with value: 6.0 and parameters: {'gap_mult': 1.9478604127774308, 'canvas_size': 1800, 'mag_ratio': 1.7354733093971937, 'add_margin': 0.3808447412146811, 'text_threshold': 0.48479170584830067, 'low_text': 0.3556634621163809, 'link_threshold': 0.5922711205439385}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:07,509] Trial 81 finished with value: 0.0 and parameters: {'gap_mult': 0.23671399010645872, 'canvas_size': 3600, 'mag_ratio': 2.2341864972807914, 'add_margin': 0.1768337152281278, 'text_threshold': 0.5006948515280124, 'low_text': 0.43160854679965444, 'link_threshold': 0.44315830511907794}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:12,406] Trial 82 finished with value: 0.0 and parameters: {'gap_mult': 0.48961947690267943, 'canvas_size': 3600, 'mag_ratio': 2.4105107256619434, 'add_margin': 0.22217208323937157, 'text_threshold': 0.4656720325400188, 'low_text': 0.4844647385891675, 'link_threshold': 0.4065883472444202}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:15,689] Trial 83 finished with value: 3.0 and parameters: {'gap_mult': 0.7997340915545504, 'canvas_size': 3600, 'mag_ratio': 3.329512876745448, 'add_margin': 0.47893630067320886, 'text_threshold': 0.42471000078598803, 'low_text': 0.4466149722453414, 'link_threshold': 0.3563209303791829}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:19,726] Trial 84 finished with value: 3.0 and parameters: {'gap_mult': 1.486979265948101, 'canvas_size': 3600, 'mag_ratio': 3.0996875437037343, 'add_margin': 0.1406713123567832, 'text_threshold': 0.41231281178387974, 'low_text': 0.3988143194327536, 'link_threshold': 0.6880342966933964}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:22,964] Trial 85 finished with value: 6.0 and parameters: {'gap_mult': 1.0932715433629328, 'canvas_size': 10000, 'mag_ratio': 2.7965070007470225, 'add_margin': 0.3037215646374164, 'text_threshold': 0.46391666668681963, 'low_text': 0.5906737765487706, 'link_threshold': 0.43021720939102975}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:25,413] Trial 86 finished with value: 6.0 and parameters: {'gap_mult': 1.5809283981134732, 'canvas_size': 1800, 'mag_ratio': 2.903098445629802, 'add_margin': 0.323204629547763, 'text_threshold': 0.4475691348897395, 'low_text': 0.4917204876920026, 'link_threshold': 0.409723948089749}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:28,208] Trial 87 finished with value: 7.0 and parameters: {'gap_mult': 1.0241737060926785, 'canvas_size': 7200, 'mag_ratio': 2.170019590734165, 'add_margin': 0.39982023963456487, 'text_threshold': 0.38733220577207395, 'low_text': 0.5104734664626996, 'link_threshold': 0.4278163892556177}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:30,772] Trial 88 finished with value: 6.0 and parameters: {'gap_mult': 1.268613828376473, 'canvas_size': 10000, 'mag_ratio': 2.501792093226784, 'add_margin': 0.19746104473498233, 'text_threshold': 0.533720206705651, 'low_text': 0.3001198513836001, 'link_threshold': 0.547632988263816}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:32,882] Trial 89 finished with value: 3.0 and parameters: {'gap_mult': 1.3478705357481844, 'canvas_size': 3600, 'mag_ratio': 1.5703952619051842, 'add_margin': 0.2383571152444115, 'text_threshold': 0.4911376231475963, 'low_text': 0.3204541870778804, 'link_threshold': 0.6127908600439828}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:35,365] Trial 90 finished with value: 0.0 and parameters: {'gap_mult': 1.490753514396979, 'canvas_size': 10000, 'mag_ratio': 2.6871785603774363, 'add_margin': 0.2705280840013191, 'text_threshold': 0.35332971931310364, 'low_text': 0.19192155927759122, 'link_threshold': 0.650409989571328}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:37,984] Trial 91 finished with value: 18.0 and parameters: {'gap_mult': 0.6059944269888327, 'canvas_size': 3600, 'mag_ratio': 3.057057959857495, 'add_margin': 0.35778874256558385, 'text_threshold': 0.3212941201056074, 'low_text': 0.1174286089721853, 'link_threshold': 0.6696455061067778}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:40,618] Trial 92 finished with value: 6.0 and parameters: {'gap_mult': 0.3775662373815719, 'canvas_size': 3600, 'mag_ratio': 2.925896099570541, 'add_margin': 0.34378187458078385, 'text_threshold': 0.30183448603810026, 'low_text': 0.16525888651370374, 'link_threshold': 0.6300161950690972}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:43,262] Trial 93 finished with value: 0.0 and parameters: {'gap_mult': 0.8521897751266599, 'canvas_size': 3600, 'mag_ratio': 3.4865557187596905, 'add_margin': 0.2894311604971045, 'text_threshold': 0.3717456107531875, 'low_text': 0.11135697155385867, 'link_threshold': 0.601458513802612}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:47,807] Trial 94 finished with value: 0.0 and parameters: {'gap_mult': 0.6891387968747075, 'canvas_size': 3600, 'mag_ratio': 2.3574090634090874, 'add_margin': 0.3717055471560436, 'text_threshold': 0.5113421563825482, 'low_text': 0.1454440500741882, 'link_threshold': 0.5800254192403381}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:50,163] Trial 95 finished with value: 0.0 and parameters: {'gap_mult': 0.9404696206232311, 'canvas_size': 10000, 'mag_ratio': 2.5879800085298745, 'add_margin': 0.3178008713854836, 'text_threshold': 0.3188847847129784, 'low_text': 0.10302789858767883, 'link_threshold': 0.6469583229932883}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:52,175] Trial 96 finished with value: 2.0 and parameters: {'gap_mult': 1.1576708263622413, 'canvas_size': 1800, 'mag_ratio': 3.16407741524424, 'add_margin': 0.32888131722555225, 'text_threshold': 0.3388924348605037, 'low_text': 0.24394707724107206, 'link_threshold': 0.6660346756545124}. 
Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:54,166] Trial 97 finished with value: 6.0 and parameters: {'gap_mult': 1.3644247440046287, 'canvas_size': 10000, 'mag_ratio': 1.8352085571892742, 'add_margin': 0.42619636062008526, 'text_threshold': 0.3800261620480223, 'low_text': 0.1296581027202471, 'link_threshold': 0.6787481334559138}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:56,295] Trial 98 finished with value: 0.0 and parameters: {'gap_mult': 0.5284987720929559, 'canvas_size': 10000, 'mag_ratio': 1.967725610612323, 'add_margin': 0.3076465681849014, 'text_threshold': 0.3605602907368438, 'low_text': 0.14851815752665998, 'link_threshold': 0.6373355528450428}. Best is trial 7 with value: 0.0.\n", + "[I 2025-09-25 20:59:58,253] Trial 99 finished with value: 0.0 and parameters: {'gap_mult': 1.8840728444738277, 'canvas_size': 1800, 'mag_ratio': 3.44988791521599, 'add_margin': 0.17183901821211403, 'text_threshold': 0.4381693314913302, 'low_text': 0.18140874209964664, 'link_threshold': 0.4916185895177267}. 
Best is trial 7 with value: 0.0.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Лучшие параметры:\n", + "{'gap_mult': 0.1148899697656891, 'canvas_size': 1800, 'mag_ratio': 2.0909813860745077, 'add_margin': 0.38880953861561235, 'text_threshold': 0.5046870997550761, 'low_text': 0.2758043304152779, 'link_threshold': 0.4121455607297143}\n", + "Лучшее расстояние: 0.0\n" + ] + } + ], "source": [ - "import requests\n", - "\n", - "# Базовый URL API портала открытых данных Москвы\n", - "base_url = \"https://api.data.mos.ru/v1/1498/rows\"\n", + "import optuna \n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import itertools\n", + "import collections\n", + "\n", + "# Ваша реализация расстояния Левенштейна\n", + "def levenshtein_distance(string1, string2):\n", + " \"\"\"\n", + " >>> levenshtein_distance('AATZ', 'AAAZ')\n", + " 1\n", + " >>> levenshtein_distance('AATZZZ', 'AAAZ')\n", + " 3\n", + " \"\"\"\n", + " distance = 0\n", + " if len(string1) < len(string2):\n", + " string1, string2 = string2, string1\n", + " \n", + " # Заменяем itertools.izip_longest на zip_longest для Python 3\n", + " from itertools import zip_longest\n", + " for i, v in zip_longest(string1, string2, fillvalue='-'):\n", + " if i != v:\n", + " distance += 1\n", + " return distance\n", + "\n", + "# Определите ROOT_DIR (добавьте ваш путь)\n", + "ROOT_DIR = Path('/home/lizardapn/Hack_digital/hack_digital_transformation')\n", + "\n", + "data = pd.read_csv(filepath_or_buffer=ROOT_DIR / 'data/processed_data/merged_data.csv')\n", + "\n", + "def objective(trial):\n", + " params = {\n", + " 'gap_mult': trial.suggest_float('gap_mult', 0.1, 3.0),\n", + " 'canvas_size': trial.suggest_categorical('canvas_size', [1800, 3600, 7200, 10000]),\n", + " 'mag_ratio': trial.suggest_float('mag_ratio', 1.5, 5.0),\n", + " 'add_margin': trial.suggest_float('add_margin', 0.01, 0.5),\n", + " 'text_threshold': trial.suggest_float('text_threshold', 0.3, 0.75), # Исправлено: 
text_treshold -> text_threshold\n", + " 'low_text': trial.suggest_float('low_text', 0.1, 0.7),\n", + " 'link_threshold': trial.suggest_float('link_threshold', 0.15, 0.7),\n", + " 'langs': ['en'], \n", + " 'gpu': True,\n", + " }\n", + " \n", + " ocr = OverlayOCR(**params)\n", "\n", - "# Если требуется API-KEY, его нужно добавить в заголовки\n", - "headers = {\n", - " \"api-key\": os.getenv('API_KEY_DATA_MOS') \n", - "}\n", + " IMG_PATH = ROOT_DIR / 'data/raw_data/data/metadata/INC/united_image/0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'\n", "\n", - "try:\n", - " print('Начали')\n", - " # Выполняем GET-запрос к API для получения списка датасетов\n", - " response = requests.get(base_url, headers=headers)\n", - " print(response)\n", - " response.raise_for_status() # Проверяем на ошибки HTTP\n", - " print(response)\n", - " # Парсим JSON-ответ\n", - " datasets = response.json()\n", + " # Получаем реальные ID для этого изображения\n", + " real_ids = data[data['camera_id'] == '0a0ee2fb-b7ad-4430-97d7-281e2c293041.jpg'].filename.values\n", " \n", - " # Выводим информацию о первых нескольких датасетах\n", - " for i, dataset in enumerate(datasets[:5]): # Ограничиваем вывод первыми 5\n", - " print(f\"Датасет {i+1}: ID - {dataset.get('Id', 'N/A')}, Название - {dataset.get('Caption', 'N/A')}\")\n", - "\n", - "except requests.exceptions.RequestException as e:\n", - " print(f\"Ошибка при выполнении запроса: {e}\")\n", - "except ValueError as e:\n", - " print(f\"Ошибка при парсинге JSON: {e}\")" + " # Распознаем текст с текущими параметрами\n", + " final, norm, joined, conf, roi_name = ocr.run_on_image(str(IMG_PATH))\n", + " \n", + " # Вычисляем расстояния до всех реальных ID\n", + " distances = []\n", + " for real_id in real_ids:\n", + " distance = levenshtein_distance(str(real_id), final)\n", + " distances.append(distance)\n", + " \n", + " # Находим минимальное расстояние (наиболее близкий ID)\n", + " if distances:\n", + " min_distance = min(distances)\n", + " else:\n", + " 
min_distance = float('inf') # Если нет реальных ID для сравнения\n", + " \n", + " return min_distance\n", + "\n", + "# Создаем и запускаем исследование\n", + "study = optuna.create_study(direction='minimize')\n", + "study.optimize(objective, n_trials=100) # Можно указать количество trials\n", + "\n", + "# Выводим лучшие параметры\n", + "print(\"Лучшие параметры:\")\n", + "print(study.best_params)\n", + "print(f\"Лучшее расстояние: {study.best_value}\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d85e9c25", + "execution_count": 41, + "id": "6f887d35", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "best_params = study.best_params\n", + "\n", + "best_params.update(\n", + " langs=['en'], \n", + " gpu=True,\n", + " )" + ] }, { "cell_type": "code", - "execution_count": null, - "id": "7934bd0c", + "execution_count": 42, + "id": "a549736d", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{'gap_mult': 0.1148899697656891,\n", + " 'canvas_size': 1800,\n", + " 'mag_ratio': 2.0909813860745077,\n", + " 'add_margin': 0.38880953861561235,\n", + " 'text_threshold': 0.5046870997550761,\n", + " 'low_text': 0.2758043304152779,\n", + " 'link_threshold': 0.4121455607297143,\n", + " 'langs': ['en'],\n", + " 'gpu': True}" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_params" + ] }, { "cell_type": "code", - "execution_count": null, - "id": "33b6f8f8", + "execution_count": 46, + "id": "e0b6bb75", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best ROI: left_bottom | conf=0.57\n", + "joined: DUN_Ka_Vao_2_50151\n", + "norm : DUN_Ka_Vao_2_50151\n", + "final : DUN_Ka_Vao_2_50151\n" + ] + } + ], + "source": [ + "IMG = 
r'/home/lizardapn/Hack_digital/hack_digital_transformation/data/raw_data/data/metadata/INC/united_image/ffeeecc1-407c-4976-a625-280e8c987f66.jpg'\n", + "\n", + "ocr = OverlayOCR(**best_params)\n", + "\n", + "final, norm, joined, conf, roi_name = ocr.run_on_image(IMG)\n", + "print(f\"Best ROI: {roi_name} | conf={conf:.2f}\")\n", + "print(\"joined:\", joined)\n", + "print(\"norm :\", norm)\n", + "print(\"final :\", final)\n" + ] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 0e60daa..8784e57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,110 +1,47 @@ -arrow==1.2.3 -attrs==23.2.0 -Automat==22.10.0 -Babel==2.10.3 -bcrypt==3.2.2 -binaryornot==0.4.4 -blinker==1.7.0 -build==1.0.3 -CacheControl==0.14.0 -certifi==2023.11.17 -chardet==5.2.0 -cleo==2.1.0 -click==8.1.6 -cloud-init==25.1.4 +Mako==1.3.10 +MarkupSafe==3.0.2 +PyYAML==6.0.2 +Pygments==2.19.2 +SQLAlchemy==2.0.43 +alembic==1.16.5 +charset-normalizer==3.4.3 +cmaes==0.12.0 colorama==0.4.6 -command-not-found==0.3 -configobj==5.0.8 -constantly==23.10.4 -cookiecutter==2.6.0 -crashtest==0.4.1 -cryptography==41.0.7 -dbus-python==1.3.2 -distlib==0.3.8 -distro==1.9.0 -distro-info==1.7+build1 -dulwich==0.21.6 -fastimport==0.9.14 -fastjsonschema==2.19.0 -filelock==3.13.1 -h11==0.14.0 -httplib2==0.20.4 -hyperlink==21.0.0 -idna==3.6 -importlib-metadata==4.12.0 -incremental==22.10.0 -installer==0.7.0 -jaraco.classes==3.2.1 -jeepney==0.8.0 -Jinja2==3.1.2 -jsonpatch==1.32 -jsonpointer==2.0 -jsonschema==4.10.3 -keyring==24.3.1 -launchpadlib==1.11.0 -lazr.restfulclient==0.14.6 -lazr.uri==1.0.6 -lockfile==0.12.2 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -mdurl==0.1.2 -more-itertools==10.2.0 -msgpack==1.0.3 -netifaces==0.11.0 -oauthlib==3.2.2 -packaging==24.0 -pexpect==4.9.0 -pkginfo==1.9.6 -platformdirs==4.2.0 -poetry==1.8.2 -poetry-core==1.9.0 -poetry-plugin-export==1.6.0 -ptyprocess==0.7.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycurl==7.45.3 -Pygments==2.17.2 -PyGObject==3.48.2 
-PyHamcrest==2.1.0 -PyJWT==2.7.0 -pylev==1.4.0 -pyOpenSSL==23.2.0 -pyparsing==3.1.1 -pyproject_hooks==1.0.0 -pyrsistent==0.20.0 -pyserial==3.5 -python-apt==2.7.7+ubuntu5 -python-dateutil==2.8.2 -python-magic==0.4.27 -python-slugify==8.0.4 -pytz==2024.1 -PyYAML==6.0.1 -requests==2.31.0 -requests-toolbelt==1.0.0 -rich==13.7.1 -ruamel.yaml==0.17.21 -ruamel.yaml.clib==0.2.8 -s3cmd==2.4.0 -SecretStorage==3.3.3 -service-identity==24.1.0 -setuptools==68.1.2 -shellingham==1.5.4 -six==1.16.0 -systemd-python==235 -toml==0.10.2 -tomlkit==0.12.4 -trove-classifiers==2024.1.31 -Twisted==24.3.0 -typing_extensions==4.10.0 -ubuntu-pro-client==8001 -unattended-upgrades==0.1 -Unidecode==1.3.8 -urllib3==2.0.7 -uvicorn==0.27.1 -uvloop==0.19.0 -virtualenv==20.25.0+ds -wadllib==1.3.6 -wheel==0.42.0 -wsproto==1.2.0 -zipp==1.0.0 -zope.interface==6.1 +colorlog==6.9.0 +defusedxml==0.7.1 +easyocr==1.7.2 +googleapis-common-protos==1.70.0 +greenlet==3.2.4 +grpcio==1.75.0 +imageio==2.37.0 +importlib-metadata==6.11.0 +joblib==1.5.2 +mpmath==1.3.0 +networkx==3.5 +numpy==2.2.6 +opencv-python==4.12.0.88 +optuna==4.5.0 +packaging==23.2 +pandas==2.3.2 +pillow==11.3.0 +platformdirs==4.4.0 +protobuf==6.32.1 +psutil==7.1.0 +pyarrow==21.0.0 +pydot==4.0.1 +python-bidi==0.6.6 +python-dateutil==2.9.0.post0 +pytz==2025.2 +scikit-image==0.25.2 +scikit-learn==1.7.2 +scipy==1.16.2 +six==1.17.0 +sympy==1.14.0 +threadpoolctl==3.6.0 +tifffile==2025.9.20 +torch==2.8.0 +torchvision==0.23.0 +tqdm==4.67.1 +triton==3.4.0 +typing-extensions==4.15.0 +zipp==3.23.0 \ No newline at end of file diff --git a/setup.py b/setup.py index aa8aee6..22b1805 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ "scikit-learn>=1.0.0", "matplotlib>=3.5.0", "seaborn>=0.11.0", - "dvc>=2.0.0", ], extras_require={ "dev": [ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..076a757 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,3 @@ +from . 
import utils + +__all__ = ['utils'] \ No newline at end of file diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py deleted file mode 100644 index 13fb721..0000000 --- a/src/data/make_dataset.py +++ /dev/null @@ -1,225 +0,0 @@ -import logging -import os -import zipfile -from io import BytesIO, StringIO - -import boto3 -import numpy as np -import pandas as pd -import yaml -from sklearn.model_selection import train_test_split - -# Настройка логирования -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def load_data_from_s3( - bucket_name, - file_key, - endpoint_url=None, - aws_access_key_id=None, - aws_secret_access_key=None, - region_name="ru-central1", -): - """Загрузка данных из S3-совместимого хранилища""" - logger.info(f"Загрузка данных из s3://{bucket_name}/{file_key}") - - try: - s3 = boto3.client( - "s3", - endpoint_url=endpoint_url or "https://storage.yandexcloud.net", - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=region_name, - ) - - # Загрузка данных - s3.download_file(bucket_name, file_s3_src, file_local) - - except Exception as e: - logger.error(f"Ошибка при загрузке данных из S3: {str(e)}") - raise - - -def load_data(file_path): - """Загрузка данных из CSV файла (локально)""" - logger.info(f"Загрузка данных из {file_path}") - if not os.path.exists(file_path): - raise FileNotFoundError(f"Файл данных не найден: {file_path}") - return pd.read_csv(file_path) - - -def preprocess_data(df, numeric_features, categorical_features): - """Предобработка данных путем обработки пропущенных значений и кодирования категориальных признаков""" - logger.info("Предобработка данных") - - # Обработка пропущенных значений - for col in numeric_features: - if col in df.columns: - df[col].fillna(df[col].median(), inplace=True) - - for col in categorical_features: - if col in df.columns: - df[col].fillna("Неизвестно", inplace=True) - - # Кодирование категориальных переменных - 
df_processed = pd.get_dummies(df, columns=categorical_features, drop_first=True) - - return df_processed - - -def split_data(df, target_column, test_size=0.2, random_state=42): - """Разделение данных на обучающую и тестовую выборки""" - logger.info("Разделение данных на обучающую и тестовую выборки") - - if target_column not in df.columns: - raise ValueError(f"Целевой столбец '{target_column}' не найден в датафрейме") - - X = df.drop(columns=[target_column]) - y = df[target_column] - - return train_test_split(X, y, test_size=test_size, random_state=random_state) - - -def save_data(X_train, X_test, y_train, y_test, train_path, test_path): - """Сохранение обучающей и тестовой выборок в CSV файлы""" - logger.info(f"Сохранение обучающих данных в {train_path}") - logger.info(f"Сохранение тестовых данных в {test_path}") - - # Создание директорий, если они не существуют - os.makedirs(os.path.dirname(train_path), exist_ok=True) - os.makedirs(os.path.dirname(test_path), exist_ok=True) - - # Сохранение данных - X_train.to_csv(train_path, index=False) - X_test.to_csv(test_path, index=False) - - # Сохранение целевых переменных - pd.DataFrame({"target": y_train}).to_csv(train_path.replace(".csv", "_target.csv"), index=False) - pd.DataFrame({"target": y_test}).to_csv(test_path.replace(".csv", "_target.csv"), index=False) - - -def upload_to_s3( - local_file_path, - bucket_name, - s3_key, - endpoint_url=None, - aws_access_key_id=None, - aws_secret_access_key=None, - region_name="ru-central1", -): - """Загрузка файла в S3-совместимое хранилище""" - logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}") - - try: - # Создание клиента S3 - s3 = boto3.client( - "s3", - endpoint_url=endpoint_url or f"https://storage.yandexcloud.net", - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=region_name, - ) - - # Загрузка файла - s3.upload_file(local_file_path, bucket_name, s3_key) - logger.info(f"Файл успешно загружен в S3: 
s3://{bucket_name}/{s3_key}") - - except Exception as e: - logger.error(f"Ошибка при загрузке файла в S3: {str(e)}") - raise - - -def main(): - """Главная функция для подготовки набора данных""" - # Загрузка конфигурации - with open("configs/config.yaml", "r") as f: - config = yaml.safe_load(f) - - # Параметры из конфигурации - params = config - - # Параметры S3 из переменных окружения - s3_bucket = os.getenv("S3_BUCKET", "s3-dvc") - s3_raw_data_key = os.getenv("S3_RAW_DATA_KEY", "Датасет.zip") - s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/") - aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID") - aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY") - endpoint_url = os.getenv("AWS_ENDPOINT_URL", "https://storage.yandexcloud.net") - region_name = os.getenv("AWS_REGION", "ru-central1") - - try: - # Загрузка сырых данных из S3 или локально - if s3_bucket and aws_access_key_id and aws_secret_access_key: - df = load_data_from_s3( - s3_bucket, s3_raw_data_key, endpoint_url, aws_access_key_id, aws_secret_access_key, region_name - ) - else: - raw_data_path = os.path.join(config["data"]["raw_path"], "data.csv") - df = load_data(raw_data_path) - - # Предобработка данных - df_processed = preprocess_data( - df, params["features"]["numeric_features"], params["features"]["categorical_features"] - ) - - # Разделение данных - X_train, X_test, y_train, y_test = split_data( - df_processed, config["data"]["target_column"], params["data"]["test_size"], params["data"]["random_state"] - ) - - # Сохранение обработанных данных локально - train_path = os.path.join(config["data"]["processed_path"], config["data"]["train_file"]) - test_path = os.path.join(config["data"]["processed_path"], config["data"]["test_file"]) - - save_data(X_train, X_test, y_train, y_test, train_path, test_path) - - # Загрузка обработанных данных в S3, если указаны credentials - if s3_bucket and aws_access_key_id and aws_secret_access_key: - upload_to_s3( - train_path, - s3_bucket, - 
s3_processed_prefix + config["data"]["train_file"], - endpoint_url, - aws_access_key_id, - aws_secret_access_key, - region_name, - ) - upload_to_s3( - test_path, - s3_bucket, - s3_processed_prefix + config["data"]["test_file"], - endpoint_url, - aws_access_key_id, - aws_secret_access_key, - region_name, - ) - upload_to_s3( - train_path.replace(".csv", "_target.csv"), - s3_bucket, - s3_processed_prefix + config["data"]["train_file"].replace(".csv", "_target.csv"), - endpoint_url, - aws_access_key_id, - aws_secret_access_key, - region_name, - ) - upload_to_s3( - test_path.replace(".csv", "_target.csv"), - s3_bucket, - s3_processed_prefix + config["data"]["test_file"].replace(".csv", "_target.csv"), - endpoint_url, - aws_access_key_id, - aws_secret_access_key, - region_name, - ) - - logger.info("Подготовка данных успешно завершена") - - except Exception as e: - logger.error(f"Ошибка в подготовке данных: {str(e)}") - raise - - -if __name__ == "__main__": - main() diff --git a/src/data/prepare_data.py b/src/data/prepare_data.py new file mode 100644 index 0000000..01ff08c --- /dev/null +++ b/src/data/prepare_data.py @@ -0,0 +1,183 @@ +from pathlib import Path + +import pandas as pd +import torch +import torchvision.transforms as transforms +from PIL import Image +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, Dataset + + +class PrepareData(Dataset): + """ + Класс для подготовки датасета из изображений и их географических координат. + Автоматически разделяет данные на тренировочную и тестовую выборки с разными трансформациями. + + Attributes + ---------- + images_dir : str + Путь к директории с изображениями. + transform : callable or None + Трансформации для применения к изображениям. + df : pandas.DataFrame + Данные из CSV-файла. + train_indices : list + Индексы тренировочной выборки. + test_indices : list + Индексы тестовой выборки. 
+ """ + + def __init__(self, csv_path, images_dir, test_size=0.2, random_state=42): + """ + Инициализация датасета. + + Parameters + ---------- + csv_path : str or Path + Путь к CSV-файлу с данными. + images_dir : str or Path + Путь к директории с изображениями. + test_size : float, optional + Доля тестовой выборки, by default 0.2 + random_state : int, optional + Random state для воспроизводимости, by default 42 + """ + self.images_dir = Path(images_dir) + self.transform = transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.ToTensor(), + ] + ) + + # Загрузка данных из CSV + self.df = pd.read_csv(csv_path) + print(f"Загружено {len(self.df)} записей из CSV-файла") + + # Фильтрация данных по наличию изображений + self._filter_by_images() + print(f"После фильтрации по изображениям осталось {len(self.df)} записей") + + # Разделение данных + if len(self.df) > 0: + self._split_data(test_size, random_state) + else: + raise ValueError("После фильтрации не осталось ни одной записи") + + def _filter_by_images(self): + """ + Фильтрация данных по наличию изображений. + """ + # Проверяем, что файлы изображений существуют + image_files = set( + f.name for f in self.images_dir.iterdir() if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"] + ) + self.df = self.df[self.df["camera_id"].isin(image_files)] + self.df = self.df.reset_index(drop=True) + + def _split_data(self, test_size, random_state): + """ + Разделение данных на тренировочную и тестовую выборки. + + Parameters + ---------- + test_size : float + Доля тестовой выборки. + random_state : int + Random state для воспроизводимости. 
+ """ + if len(self.df) == 0: + self.train_indices = [] + self.test_indices = [] + return + + # Разделение индексов + indices = list(range(len(self.df))) + self.train_indices, self.test_indices = train_test_split( + indices, test_size=test_size, random_state=random_state + ) + print(f"Разделение данных: {len(self.train_indices)} тренировочных, {len(self.test_indices)} тестовых") + + def get_train_dataset(self): + """ + Получение тренировочного датасета. + + Returns + ------- + PrepareData + Тренировочный датасет. + """ + return self._create_subset(self.train_indices) + + def get_test_dataset(self): + """ + Получение тестового датасета. + + Returns + ------- + PrepareData + Тестовый датасет. + """ + return self._create_subset(self.test_indices) + + def _create_subset(self, indices): + """ + Создание подмножества датасета по индексам. + + Parameters + ---------- + indices : list + Список индексов. + + Returns + ------- + PrepareData + Подмножество датасета. + """ + # Создаем копию объекта + subset = PrepareData.__new__(PrepareData) + subset.images_dir = self.images_dir + subset.transform = self.transform + subset.df = self.df.iloc[indices].reset_index(drop=True) + # Для подмножества не нужно разделять данные + subset.train_indices = list(range(len(subset.df))) + subset.test_indices = [] + return subset + + def __len__(self): + """ + Получение длины датасета. + + Returns + ------- + int + Длина датасета. + """ + return len(self.df) + + def __getitem__(self, idx): + """ + Получение элемента датасета по индексу. + + Parameters + ---------- + idx : int + Индекс элемента. + + Returns + ------- + tuple + Кортеж из изображения и координат (тензор изображения, тензор координат). 
+ """ + row = self.df.iloc[idx] + image_path = self.images_dir / row["camera_id"] + + # Загрузка изображения + image = Image.open(image_path).convert("RGB") + if self.transform: + image = self.transform(image) + + # Получение координат + coordinates = torch.tensor([row["lat_real"], row["lon_real"]], dtype=torch.float32) + + return image, coordinates diff --git a/src/engine/main.py b/src/engine/main.py index 378fa8f..e51e21d 100644 --- a/src/engine/main.py +++ b/src/engine/main.py @@ -1,2 +1,686 @@ +import argparse +import json +import os +import sys +import warnings +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import cv2 +import joblib +import numpy as np +import optuna +import pandas as pd +import torch +import torch.nn as nn +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, Dataset + +# ============================================================================= +# КРИТИЧЕСКИЕ ИСПРАВЛЕНИЯ ДЛЯ DATASPHERE +# ============================================================================= + + +# Динамическое определение путей для работы в DataSphere +current_file = Path(__file__).resolve() +project_root_in_cloud = Path("/job") # Явно указываем корень в облаке +local_project_root = current_file.parent.parent + +# Выбираем корень в зависимости от окружения +# Проверяем, находимся ли мы в среде DataSphere (существует ли папка /job) +if project_root_in_cloud.exists(): + ROOT_DIR = project_root_in_cloud + print("✓ Обнаружена среда DataSphere. Используем путь /job") +else: + ROOT_DIR = local_project_root + print("✓ Обнаружена локальная среда. 
Используем локальный путь") + +# Добавляем возможные пути к модулям в sys.path +possible_paths_to_models = [ + ROOT_DIR / "models", # Папка models в корне + ROOT_DIR / "src" / "models", # Папка models внутри src + ROOT_DIR, # Сам корень проекта + ROOT_DIR / "src", # Папка src + ROOT_DIR / "utils", # Папка utils в корне + ROOT_DIR / "src" / "utils", # Папка utils внутри src +] + +for path in possible_paths_to_models: + path_str = str(path) + if path.exists() and path_str not in sys.path: + sys.path.insert(0, path_str) + print(f"✓ Добавлен путь: {path}") + +# Также добавляем родительскую директорию текущего файла +current_parent = str(current_file.parent) +if current_parent not in sys.path: + sys.path.insert(0, current_parent) + +print("=" * 60) +print("FINAL ENVIRONMENT INFO:") +print(f"Current file: {current_file}") +print(f"ROOT_DIR: {ROOT_DIR}") +print(f"Current working directory: {Path.cwd()}") +print(f"Python will look for modules in:") +for i, path in enumerate(sys.path[:10]): # Показываем первые 10 путей + print(f" {i+1}. 
{path}") +print("=" * 60) + +# Диагностика: что действительно есть в облаке +print("\nCHECKING CLOUD ENVIRONMENT STRUCTURE:") +check_paths = [ROOT_DIR, Path(".")] +for path in check_paths: + if path.exists(): + print(f"\nСодержимое {path}:") + try: + items = list(path.iterdir()) + if not items: + print(" [EMPTY]") + for item in items: + item_type = "DIR" if item.is_dir() else "FILE" + print(f" [{item_type}] {item.name}") + except Exception as e: + print(f" Ошибка доступа: {e}") +print("=" * 60) + +# Теперь пробуем импортировать +try: + from models.OCR_model import OverlayOCR + + print("✓ Модуль models.OCR_model успешно импортирован") +except ImportError as e: + print(f"✗ Ошибка импорта models.OCR_model: {e}") + # Попробуем альтернативный путь + try: + # Если модуль в той же директории, что и main.py + from OCR_model import OverlayOCR + + print("✓ Модуль OCR_model успешно импортирован из текущей директории") + except ImportError as e2: + print(f"✗ Ошибка импорта OCR_model: {e2}") + raise + +try: + from utils.useful_functions import levenshtein_distance + + print("✓ Модуль utils.useful_functions успешно импортирован") +except ImportError as e: + print(f"✗ Ошибка импорта utils.useful_functions: {e}") + # Попробуем альтернативный путь + try: + from useful_functions import levenshtein_distance + + print("✓ Модуль useful_functions успешно импортирован из текущей директории") + except ImportError as e2: + print(f"✗ Ошибка импорта useful_functions: {e2}") + + # Создаем заглушку, если функция не найдена + def levenshtein_distance(s1, s2): + print(f"WARNING: Using dummy levenshtein_distance for '{s1}' and '{s2}'") + return abs(len(s1) - len(s2)) + + print("✓ Создана заглушка для levenshtein_distance") + +warnings.filterwarnings("ignore") + +# Директория для сохранения результатов +save_dir = ROOT_DIR / "models" / "ocr_model" +save_dir.mkdir(parents=True, exist_ok=True) +print(f"Save directory: {save_dir}") + + +def parse_args(): + """Парсинг аргументов командной строки""" + 
parser = argparse.ArgumentParser(description="OCR Model Training") + parser.add_argument( + "--csv-path", type=str, default="data/processed_data/merged_data.csv", help="Путь к CSV файлу с данными" + ) + parser.add_argument( + "--images-dir", + type=str, + default="data/raw_data/data/metadata/INC/united_image/", + help="Директория с изображениями", + ) + parser.add_argument("--optuna-study", type=str, default="optuna_study.pkl", help="Имя файла для сохранения study") + parser.add_argument( + "--ocr-model-params", type=str, default="ocr_model_params.json", help="Имя файла для сохранения параметров" + ) + parser.add_argument("--n-trials", type=int, default=50, help="Количество trials для Optuna") + parser.add_argument("--max-samples", type=int, default=200, help="Максимальное количество образцов для оценки") + return parser.parse_args() + + +class OCRDataset(Dataset): + """Датасет для обучения OCR модели""" + + def __init__(self, image_paths: List[Path], labels: List[str], transform=None): + self.image_paths = image_paths + self.labels = labels + self.transform = transform + + def __len__(self): + return len(self.image_paths) + + def __getitem__(self, idx): + image_path = self.image_paths[idx] + image = cv2.imread(str(image_path)) + if image is None: + image = np.zeros((1000, 1000, 3), dtype=np.uint8) + + image = image.astype(np.float32) / 255.0 + image = torch.from_numpy(image).permute(2, 0, 1) + label = self.labels[idx] + + return image, label, str(image_path) + + +class PrepareData: + """Подготовка данных для обучения""" + + def __init__(self, csv_path: Path, images_dir: Path, test_size: float = 0.2, random_state: int = 42): + self.csv_path = csv_path + self.images_dir = images_dir + self.test_size = test_size + self.random_state = random_state + self.data = None + self._prepare_data() + + def _prepare_data(self): + # Загружаем данные + print(f"Загрузка данных из {self.csv_path}") + self.data = pd.read_csv(self.csv_path) + print(f"Загружено {len(self.data)} 
записей") + + # Создаем словарь для группировки меток по camera_id + image_to_labels = {} + missing_count = 0 + + for _, row in self.data.iterrows(): + img_name = row["camera_id"] + label = row["filename"] + + # Проверяем различные возможные расширения + possible_extensions = ["", ".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"] + img_found = False + + for ext in possible_extensions: + if ext and img_name.endswith(ext): + test_name = img_name + else: + test_name = img_name + ext + + img_path = self.images_dir / test_name + if img_path.exists(): + if str(img_path) not in image_to_labels: + image_to_labels[str(img_path)] = [] + image_to_labels[str(img_path)].append(label) + img_found = True + break + + if not img_found: + missing_count += 1 + img_path = self.images_dir / img_name + if img_path.exists(): + if str(img_path) not in image_to_labels: + image_to_labels[str(img_path)] = [] + image_to_labels[str(img_path)].append(label) + else: + print(f"Изображение не найдено: {img_name}") + + print( + f"Найдено {len(image_to_labels)} уникальных изображений с {sum(len(labels) for labels in image_to_labels.values())} метками" + ) + print(f"Пропущено {missing_count} изображений") + + if len(image_to_labels) == 0: + raise ValueError("Не найдено ни одного валидного изображения!") + + # Создаем списки для разделения + image_paths = list(image_to_labels.keys()) + labels_lists = list(image_to_labels.values()) + + # Для стратификации используем первую метку каждого изображения + first_labels = [labels[0] for labels in labels_lists] + + # Проверяем, можно ли стратифицировать + from collections import Counter + + label_counts = Counter(first_labels) + min_samples = min(label_counts.values()) if label_counts else 0 + + if min_samples < 2 or len(set(first_labels)) == len(first_labels): + stratify = None + print("Стратификация отключена (уникальные метки или недостаточно образцов)") + else: + stratify = first_labels + print(f"Стратификация включена, классов: {len(set(first_labels))}") 
+ + # Разделяем данные + (self.train_paths, self.test_paths, self.train_labels, self.test_labels) = train_test_split( + image_paths, labels_lists, test_size=self.test_size, random_state=self.random_state, stratify=stratify + ) + + print(f"Тренировочный набор: {len(self.train_paths)} изображений") + print(f"Тестовый набор: {len(self.test_paths)} изображений") + + def get_train_dataset(self): + # Для обратной совместимости используем первую метку + train_single_labels = [labels[0] for labels in self.train_labels] + return OCRDataset([Path(p) for p in self.train_paths], train_single_labels) + + def get_test_dataset(self): + test_single_labels = [labels[0] for labels in self.test_labels] + return OCRDataset([Path(p) for p in self.test_paths], test_single_labels) + + def get_train_dataset_with_all_labels(self): + """Возвращает датасет со всеми метками для каждого изображения""" + return MultiLabelDataset([Path(p) for p in self.train_paths], self.train_labels) + + def get_test_dataset_with_all_labels(self): + """Возвращает датасет со всеми метками для каждого изображения""" + return MultiLabelDataset([Path(p) for p in self.test_paths], self.test_labels) + + +class MultiLabelDataset(Dataset): + """Датасет с несколькими метками для каждого изображения""" + + def __init__(self, image_paths: List[Path], labels_lists: List[List[str]], transform=None): + self.image_paths = image_paths + self.labels_lists = labels_lists + self.transform = transform + + def __len__(self): + return len(self.image_paths) + + def __getitem__(self, idx): + image_path = self.image_paths[idx] + image = cv2.imread(str(image_path)) + if image is None: + image = np.zeros((1000, 1000, 3), dtype=np.uint8) + + image = image.astype(np.float32) / 255.0 + image = torch.from_numpy(image).permute(2, 0, 1) + labels = self.labels_lists[idx] + + return image, labels, str(image_path) + + +class OCRModel: + """Обертка для OCR системы с возможностью обучения гиперпараметров""" + + def __init__(self): + 
self.best_params = None + self.best_score = float("inf") + self.study = None + + @staticmethod + def check_data_quality(dataset, num_samples=5): + """Проверка качества данных""" + print("=== ДИАГНОСТИКА ДАННЫХ ===") + + for i in range(min(num_samples, len(dataset))): + if isinstance(dataset, MultiLabelDataset): + _, labels, path = dataset[i] + print(f"{i+1}. {Path(path).name}") + print(f" Метки ({len(labels)}): {labels}") + else: + _, label, path = dataset[i] + print(f"{i+1}. {Path(path).name}") + print(f" Метка: '{label}'") + + original = cv2.imread(str(path)) + print(f" Размер оригинала: {original.shape if original is not None else 'N/A'}") + + if original is not None: + h, w = original.shape[:2] + roi_height = min(150, h // 5) + roi = original[h - roi_height : h, : min(600, w)] + + debug_path = f"debug_sample_{i}.jpg" + cv2.imwrite(debug_path, roi) + print(f" Превью сохранено: {debug_path}") + print("---") + + def find_best_match_distance(self, predicted_text, true_labels): + """Находит минимальное расстояние Левенштейна между предсказанным текстом и всеми возможными метками""" + if not true_labels: + return float("inf"), None + + min_distance = float("inf") + best_match = None + + for true_label in true_labels: + distance = levenshtein_distance(str(true_label), predicted_text) + if distance < min_distance: + min_distance = distance + best_match = true_label + + return min_distance, best_match + + def evaluate_params(self, params: Dict[str, Any], dataset, max_samples: int = 200) -> float: + """Оценка параметров на датасете""" + try: + ocr = OverlayOCR(**params) + total_distance = 0 + count = 0 + + # Используем подвыборку для ускорения + n_samples = min(max_samples, len(dataset)) + indices = np.random.choice(len(dataset), n_samples, replace=False) + + print(f"Оценка параметров на {n_samples} образцах...") + + for i, idx in enumerate(indices): + if isinstance(dataset, MultiLabelDataset): + _, true_labels, image_path = dataset[idx] + else: + _, true_label, 
image_path = dataset[idx] + true_labels = [true_label] # Преобразуем в список для единообразия + + # Всегда используем оригинальный путь к изображению + try: + final, norm, joined, conf, roi_name = ocr.run_on_image(str(image_path)) + + # Находим наилучшее соответствие среди всех меток + distance, best_match = self.find_best_match_distance(final, true_labels) + total_distance += distance + count += 1 + + if i % 20 == 0: # Логируем каждые 20 образцов + print( + f" [{i+1}/{n_samples}] Лучшее соответствие: '{best_match}' -> '{final}', dist: {distance}" + ) + + except Exception as e: + print(f" Ошибка при обработке {image_path}: {e}") + continue + + avg_distance = total_distance / count if count > 0 else float("inf") + print(f" Среднее минимальное расстояние Левенштейна: {avg_distance:.2f}") + return avg_distance + + except Exception as e: + print(f"Ошибка оценки параметров: {e}") + return float("inf") + + def objective(self, trial, train_dataset, max_samples: int): + """Целевая функция для Optuna""" + params = { + "gap_mult": trial.suggest_float("gap_mult", 1.0, 2.0), + "canvas_size": trial.suggest_categorical("canvas_size", [3600, 4800, 6000]), + "mag_ratio": trial.suggest_float("mag_ratio", 2.0, 4.0), + "add_margin": trial.suggest_float("add_margin", 0.05, 0.2), + "text_threshold": trial.suggest_float("text_threshold", 0.45, 0.7), + "low_text": trial.suggest_float("low_text", 0.2, 0.4), + "link_threshold": trial.suggest_float("link_threshold", 0.3, 0.5), + "langs": ["en"], + "gpu": torch.cuda.is_available(), + } + + score = self.evaluate_params(params, train_dataset, max_samples) + + if score < self.best_score: + self.best_score = score + self.best_params = params.copy() + print(f" Новый лучший результат: {score:.2f}") + + return score + + def train(self, train_dataset, n_trials: int = 50, max_samples: int = 200): + """Обучение модели""" + print(f"Запуск оптимизации гиперпараметров ({n_trials} trials)...") + + self.study = optuna.create_study(direction="minimize", 
sampler=optuna.samplers.TPESampler(seed=42)) + + self.study.optimize( + lambda trial: self.objective(trial, train_dataset, max_samples), n_trials=n_trials, show_progress_bar=True + ) + + print("Оптимизация завершена!") + print(f"Лучшие параметры: {self.study.best_params}") + print(f"Лучшее расстояние: {self.study.best_value:.2f}") + + return self.study.best_params + + def save_model(self, save_path: Path, args=None): + """Сохранение модели и параметров""" + if self.best_params is None: + raise ValueError("Модель не обучена. Сначала вызовите train()") + + save_path.mkdir(parents=True, exist_ok=True) + + if args is not None: + params_filename = args.ocr_model_params + study_filename = args.optuna_study + else: + params_filename = "ocr_model_params.json" + study_filename = "optuna_study.pkl" + + model_info = { + "best_params": self.best_params, + "best_score": self.best_score, + "study_trials": len(self.study.trials) if self.study else 0, + "timestamp": pd.Timestamp.now().isoformat(), + } + + with open(save_path / params_filename, "w") as f: + json.dump(model_info, f, indent=2, ensure_ascii=False) + + if self.study: + joblib.dump(self.study, save_path / study_filename) + + print(f"Модель сохранена в: {save_path}") + + def load_model(self, load_path: Path, args=None): + """Загрузка модели""" + if args is not None: + params_filename = args.ocr_model_params + study_filename = args.optuna_study + else: + params_filename = "ocr_model_params.json" + study_filename = "optuna_study.pkl" + + with open(load_path / params_filename, "r") as f: + model_info = json.load(f) + + self.best_params = model_info["best_params"] + self.best_score = model_info["best_score"] + + study_path = load_path / study_filename + if study_path.exists(): + self.study = joblib.load(study_path) + + return OverlayOCR(**self.best_params) + + +def find_file_by_pattern(directory, pattern): + """ + Ищет файл в директории по шаблону имени. + Возвращает Path к первому найденному файлу или None. 
+ """ + path = Path(directory) + if not path.exists(): + return None + for file_path in path.iterdir(): + if file_path.is_file() and pattern in file_path.name: + return file_path + return None + + +def find_dir_by_pattern(directory, pattern): + """ + Ищет директорию по шаблону имени. + Возвращает Path к первой найденной директории или None. + """ + path = Path(directory) + if not path.exists(): + return None + for dir_path in path.iterdir(): + if dir_path.is_dir() and pattern in dir_path.name: + return dir_path + return None + + +def main(): + """Основная функция обучения""" + args = parse_args() + + try: + # Диагностика путей в DataSphere + print("\n" + "=" * 60) + print("DATASPHERE PATH DIAGNOSTICS:") + print(f"Original CSV path: {args.csv_path}") + print(f"Original Images dir: {args.images_dir}") + + # 1. Определяем корневую директорию для поиска + search_root = ROOT_DIR + + # 2. Гибкий поиск CSV-файла + csv_path = Path(args.csv_path) + if not csv_path.exists(): + # Пробуем найти файл, содержащий в имени ключевые слова + possible_csv = find_file_by_pattern(search_root, "csv_file") + if possible_csv: + csv_path = possible_csv + print(f"Найден CSV-файл по шаблону: {csv_path}") + else: + # Если по шаблону не нашли, пробуем просто взять первый файл в корне с расширением .csv + for item in search_root.iterdir(): + if item.is_file() and item.suffix.lower() == ".csv": + csv_path = item + print(f"Найден CSV-файл по расширению: {csv_path}") + break + else: + raise FileNotFoundError( + f"CSV файл не найден: {args.csv_path}. Доступные файлы в {search_root}: {list(search_root.iterdir())}" + ) + + # 3. 
Гибкий поиск директории с изображениями + images_dir = Path(args.images_dir) + if not images_dir.exists(): + # Пробуем найти директорию, содержащую в имени ключевые слова + possible_images_dir = find_dir_by_pattern(search_root, "images_dir") + if possible_images_dir: + images_dir = possible_images_dir + print(f"Найдена директория с изображениями по шаблону: {images_dir}") + else: + raise FileNotFoundError( + f"Директория с изображениями не найдена: {args.images_dir}. Доступные директории в {search_root}: {[d.name for d in search_root.iterdir() if d.is_dir()]}" + ) + + print(f"Final CSV path: {csv_path}") + print(f"Final Images dir: {images_dir}") + print(f"CSV exists: {csv_path.exists()}") + print(f"Images dir exists: {images_dir.exists()}") + print("=" * 60 + "\n") + + # Создание датасета с исправленными путями + print("Подготовка данных...") + dataset = PrepareData(csv_path=csv_path, images_dir=images_dir, test_size=0.2, random_state=42) + + # Получаем датасеты со всеми метками + train_dataset = dataset.get_train_dataset_with_all_labels() + test_dataset = dataset.get_test_dataset_with_all_labels() + + print(f"Размер тренировочного датасета: {len(train_dataset)}") + print(f"Размер тестового датасета: {len(test_dataset)}") + + # Диагностика данных перед обучением + OCRModel.check_data_quality(train_dataset, num_samples=3) + + # Тестовый прогон на одном изображении + if len(train_dataset) > 0: + _, test_labels, test_path = train_dataset[0] + print(f"\nТестовый прогон на первом изображении:") + print(f"Путь: {test_path}") + print(f"Возможные метки ({len(test_labels)}): {test_labels}") + + # Проверяем базовый OCR + ocr = OverlayOCR() + try: + final, norm, joined, conf, roi_name = ocr.run_on_image(str(test_path)) + distance, best_match = OCRModel().find_best_match_distance(final, test_labels) + print(f"Результат OCR: '{final}'") + print(f"Лучшее соответствие: '{best_match}', расстояние: {distance}") + + if distance > 10: + print("ВНИМАНИЕ: Большая ошибка на тестовом 
изображении!") + except Exception as e: + print(f"Ошибка при тестовом OCR: {e}") + + # Создаем и обучаем модель + model = OCRModel() + + # Обучаем на тренировочных данных со всеми метками + best_params = model.train(train_dataset, n_trials=args.n_trials, max_samples=args.max_samples) + + # Сохраняем модель в текущую директорию (для DataSphere) + model.save_model(ROOT_DIR, args) + + # Тестируем на тестовых данных со всеми метками + print("\nТестирование на тестовом наборе...") + test_ocr = OverlayOCR(**best_params) + test_distances = [] + best_matches = [] + test_samples = min(50, len(test_dataset)) + + for i in range(test_samples): + _, true_labels, image_path = test_dataset[i] + + try: + final, norm, joined, conf, roi_name = test_ocr.run_on_image(str(image_path)) + distance, best_match = model.find_best_match_distance(final, true_labels) + test_distances.append(distance) + best_matches.append(best_match) + + if i % 10 == 0: + print(f"Тест [{i+1}/{test_samples}]: '{best_match}' -> '{final}', dist: {distance}") + except Exception as e: + print(f"Ошибка при тестировании {image_path}: {e}") + continue + + if test_distances: + avg_test_distance = np.mean(test_distances) + std_test_distance = np.std(test_distances) + + # Анализ результатов + perfect_matches = sum(1 for d in test_distances if d == 0) + good_matches = sum(1 for d in test_distances if d <= 2) + + print(f"\nРезультаты тестирования:") + print(f"Среднее минимальное расстояние: {avg_test_distance:.2f}") + print(f"Стандартное отклонение: {std_test_distance:.2f}") + print(f"Идеальные совпадения (расстояние=0): {perfect_matches}/{len(test_distances)}") + print(f"Хорошие совпадения (расстояние≤2): {good_matches}/{len(test_distances)}") + print(f"Минимальное расстояние: {np.min(test_distances):.2f}") + print(f"Максимальное расстояние: {np.max(test_distances):.2f}") + + # Сохраняем результаты тестирования + results = { + "test_avg_distance": avg_test_distance, + "test_std_distance": std_test_distance, + 
"perfect_matches": perfect_matches, + "good_matches": good_matches, + "test_samples_evaluated": len(test_distances), + "train_dataset_size": len(train_dataset), + "test_dataset_size": len(test_dataset), + "best_params": best_params, + } + + with open(ROOT_DIR / "test_results.json", "w") as f: + json.dump(results, f, indent=2, ensure_ascii=False) + else: + print("Не удалось получить результаты тестирования!") + + print(f"\nОбучение завершено! Результаты сохранены в: {ROOT_DIR.absolute()}") + + except Exception as e: + print(f"Критическая ошибка в main: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + if __name__ == "__main__": - print("Hello world") + main() diff --git a/src/models/OCR_model.py b/src/models/OCR_model.py new file mode 100644 index 0000000..3a797c5 --- /dev/null +++ b/src/models/OCR_model.py @@ -0,0 +1,195 @@ +import re +from typing import List, Optional, Tuple + +import cv2 +import easyocr +import numpy as np + + +class OverlayOCR: + WHITELIST_RE = re.compile(r"[A-Za-z0-9_]+") + + def __init__( + self, + langs: Optional[List[str]] = None, + gpu: bool = False, + verbose: bool = False, + gap_mult: float = 1.6, + canvas_size: int = 3600, + mag_ratio: float = 3.0, + add_margin: float = 0.10, + text_threshold: float = 0.55, + low_text: float = 0.30, + link_threshold: float = 0.30, + ): + """ + langs: языки easyocr, напр. 
['en'] или ['en','ru'] + gap_mult: чувствительность к горизонтальным разрывам (меньше -> больше '_') + canvas_size/mag_ratio: масштабирование внутри easyocr + """ + self.langs = langs or ["en"] + self.reader = easyocr.Reader(self.langs, gpu=gpu, verbose=verbose) + self.gap_mult = gap_mult + self.canvas_size = canvas_size + self.mag_ratio = mag_ratio + self.add_margin = add_margin + self.text_threshold = text_threshold + self.low_text = low_text + self.link_threshold = link_threshold + + # ---------- утилиты ---------- + @staticmethod + def _clean_token(t: str) -> str: + return "".join(OverlayOCR.WHITELIST_RE.findall(t)) + + @staticmethod + def _alnum_class(ch: str) -> str: + return "D" if ch.isdigit() else ("A" if ch.isalpha() else "_") + + def _join_with_gaps(self, results, sep="_") -> Tuple[str, float, list]: + """ + Склейка токенов слева направо: + - '_' если горизонтальный зазор >> медианного, + - '_' на границах A<->D. + """ + items = [] + for bbox, text, conf in results: + t = self._clean_token(text) + if not t: + continue + x0 = min(p[0] for p in bbox) + x1 = max(p[0] for p in bbox) + items.append((x0, x1, t, float(conf))) + if not items: + return "", 0.0, [] + + items.sort(key=lambda z: z[0]) + gaps = [] + for i in range(1, len(items)): + gaps.append(items[i][0] - items[i - 1][1]) + med_gap = np.median(gaps) if gaps else 0 + + out = [] + confs = [] + prev = None + for i, (x0, x1, t, c) in enumerate(items): + if prev is not None: + gap = x0 - prev[1] + need_sep = med_gap > 0 and gap > self.gap_mult * med_gap + # буква↔️цифра – полезно отделить + if not need_sep: + prev_last = out[-1][-1] if out else "" + if prev_last and t: + need_sep = self._alnum_class(prev_last) != self._alnum_class(t[0]) + if need_sep and (not out or out[-1] != sep): + out.append(sep) + out.append(t) + confs.append(c) + prev = (x0, x1) + + text = "".join(out) + text = re.sub(r"_+", "_", text).strip("_") + avg_conf = float(sum(confs) / len(confs)) if confs else 0.0 + return text, 
avg_conf, items + + @staticmethod + def _normalize_overlays(s: str) -> str: + """Правки под формат MMC_hd_... и расстановка подчёркиваний.""" + s = re.sub(r"^MMC(?:_)?h(?:d)?", "MMC_hd", s, flags=re.IGNORECASE) + s = re.sub(r"^MMC_?hd_?", "MMC_hd_", s, flags=re.IGNORECASE) + s = re.sub(r"([A-Za-z])([0-9])", r"\1_\2", s) + s = re.sub(r"([0-9])([A-Za-z])", r"\1_\2", s) + s = re.sub(r"_+", "_", s).strip("_") + return s + + @staticmethod + def _snap_digits_tail(s: str) -> str: + """ + Если хвост цифр склеен, режем на 4-1-1 (типичный случай). + Пример: ...229221 -> ...2292_2_1 + """ + m = re.search(r"^(.*?)(\d{6,})$", s) + if not m: + return s + head, digits = m.group(1), m.group(2) + if len(digits) >= 6: + s = f"{head}{digits[:4]}_{digits[4:5]}_{digits[5:]}" + return re.sub(r"_+", "_", s).strip("_") + + # ---------- EasyOCR запуск на ROI ---------- + def run_on_roi(self, roi_bgr) -> Tuple[str, str, str, float]: + params = dict( + decoder="greedy", + detail=1, + paragraph=False, + contrast_ths=0.05, + adjust_contrast=0.7, + text_threshold=self.text_threshold, + low_text=self.low_text, + link_threshold=self.link_threshold, + canvas_size=self.canvas_size, + mag_ratio=self.mag_ratio, + add_margin=self.add_margin, + ) + results = self.reader.readtext(roi_bgr, **params) + joined, conf, _ = self._join_with_gaps(results, sep="_") + norm = self._normalize_overlays(joined) + final = self._snap_digits_tail(norm) + final = re.sub(r"^MMC_?hd_?", "MMC_hd_", final, flags=re.IGNORECASE) + final = re.sub(r"_+", "_", final).strip("_") + return joined, norm, final, conf + + # ---------- ROI генераторы ---------- + @staticmethod + def roi_left_bottom(img, w_frac=1 / 3, h_frac=1 / 4): + H, W = img.shape[:2] + return img[H - int(H * h_frac) : H, 0 : int(W * w_frac)] + + @staticmethod + def roi_bottom_band(img, h_frac=1 / 3): + H, _ = img.shape[:2] + y0 = H - int(H * h_frac) + return img[y0:H, :] + + @staticmethod + def roi_auto_band(img): + g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + _, b = 
cv2.threshold(cv2.GaussianBlur(g, (5, 5), 0), 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + row = (b > 0).sum(axis=1).astype(np.float32) + k = max(3, (img.shape[0] // 100) * 2 + 1) + row = cv2.GaussianBlur(row.reshape(-1, 1), (1, k), 0).ravel() + start = img.shape[0] // 2 + idx = start + int(np.argmax(row[start:])) + band_half = max(img.shape[0] // 12, 20) + y0, y1 = max(0, idx - band_half), min(img.shape[0], idx + band_half) + return img[y0:y1, :] + + # ---------- главный метод ---------- + def run_on_image(self, image_path: str) -> Tuple[str, str, str, float, str]: + """ + Возвращает: + final, norm, joined, conf, best_roi_name + """ + img = cv2.imread(image_path) + assert img is not None, f"Не удалось загрузить изображение: {image_path}" + + rois = [ + ("left_bottom", self.roi_left_bottom(img, 1 / 3, 1 / 4)), + ("bottom_band", self.roi_bottom_band(img, 1 / 3)), + ("auto_band", self.roi_auto_band(img)), + ] + + best = None + best_name = "" + best_pack = ("", "", "", 0.0) + + for name, roi in rois: + joined, norm, final, conf = self.run_on_roi(roi) + cand = (conf, len(final), (final, norm, joined, conf), name) + if (best is None) or (cand > best): + best = cand + best_pack = (final, norm, joined, conf) + best_name = name + + final, norm, joined, conf = best_pack + return final, norm, joined, conf, best_name diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..2261379 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1,3 @@ +from .OCR_model import OverlayOCR + +__all__ = ["OverlayOCR"] diff --git a/src/models/evaluate.py b/src/models/evaluate.py deleted file mode 100644 index c0bcfb4..0000000 --- a/src/models/evaluate.py +++ /dev/null @@ -1,263 +0,0 @@ -import json -import logging -import os -from io import StringIO - -import boto3 -import joblib -import matplotlib.pyplot as plt -import mlflow -import mlflow.sklearn -import numpy as np -import pandas as pd -import seaborn as sns -import yaml -from sklearn.metrics 
import ( - accuracy_score, - confusion_matrix, - f1_score, - precision_score, - recall_score, -) - -# Настройка логирования -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def load_processed_data_from_s3(bucket_name, data_key, target_key, aws_access_key_id=None, aws_secret_access_key=None): - """Загрузка обработанных данных и целевых переменных из S3""" - logger.info(f"Загрузка обработанных данных из s3://{bucket_name}/{data_key} и s3://{bucket_name}/{target_key}") - - # Создание клиента S3 - if aws_access_key_id and aws_secret_access_key: - s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - else: - # Используем IAM роли или credentials из окружения - s3 = boto3.client("s3") - - try: - # Загрузка данных - data_response = s3.get_object(Bucket=bucket_name, Key=data_key) - data_content = data_response["Body"].read().decode("utf-8") - X = pd.read_csv(StringIO(data_content)) - - # Загрузка целевых переменных - target_response = s3.get_object(Bucket=bucket_name, Key=target_key) - target_content = target_response["Body"].read().decode("utf-8") - y = pd.read_csv(StringIO(target_content))["target"] - - return X, y - except Exception as e: - logger.error(f"Ошибка при загрузке данных из S3: {str(e)}") - raise - - -def load_processed_data(data_path, target_path): - """Загрузка обработанных данных и целевых переменных (локально)""" - logger.info(f"Загрузка обработанных данных из {data_path}") - X = pd.read_csv(data_path) - y = pd.read_csv(target_path)["target"] - return X, y - - -def load_model_from_s3(bucket_name, model_key, aws_access_key_id=None, aws_secret_access_key=None): - """Загрузка обученной модели из S3""" - logger.info(f"Загрузка модели из s3://{bucket_name}/{model_key}") - - # Создание клиента S3 - if aws_access_key_id and aws_secret_access_key: - s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - else: - # Используем 
IAM роли или credentials из окружения - s3 = boto3.client("s3") - - try: - # Загрузка модели - model_response = s3.get_object(Bucket=bucket_name, Key=model_key) - model_content = model_response["Body"].read() - - # Сохранение модели во временный файл - temp_model_path = "temp_model.pkl" - with open(temp_model_path, "wb") as f: - f.write(model_content) - - # Загрузка модели - model = joblib.load(temp_model_path) - - # Удаление временного файла - os.remove(temp_model_path) - - return model - except Exception as e: - logger.error(f"Ошибка при загрузке модели из S3: {str(e)}") - raise - - -def load_model(model_path): - """Загрузка обученной модели с диска""" - logger.info(f"Загрузка модели из {model_path}") - if not os.path.exists(model_path): - raise FileNotFoundError(f"Файл модели не найден: {model_path}") - return joblib.load(model_path) - - -def evaluate_model(model, X_test, y_test): - """Оценка производительности модели""" - logger.info("Оценка модели") - y_pred = model.predict(X_test) - - metrics = { - "accuracy": float(accuracy_score(y_test, y_pred)), - "precision": float(precision_score(y_test, y_pred, average="weighted")), - "recall": float(recall_score(y_test, y_pred, average="weighted")), - "f1_score": float(f1_score(y_test, y_pred, average="weighted")), - } - - return metrics, y_pred - - -def save_metrics(metrics, metrics_path): - """Сохранение метрик в JSON файл""" - logger.info(f"Сохранение метрик в {metrics_path}") - with open(metrics_path, "w") as f: - json.dump(metrics, f, indent=2) - - -def plot_confusion_matrix(y_true, y_pred, class_names=None, save_path="reports/figures/confusion_matrix.png"): - """Построение и сохранение матрицы ошибок""" - logger.info(f"Построение матрицы ошибок в {save_path}") - - # Создание директории, если она не существует - os.makedirs(os.path.dirname(save_path), exist_ok=True) - - # Создание матрицы ошибок - cm = confusion_matrix(y_true, y_pred) - - # Построение графика - plt.figure(figsize=(8, 6)) - sns.heatmap( - cm, 
annot=True, fmt="d", cmap="Blues", xticklabels=class_names or "auto", yticklabels=class_names or "auto" - ) - plt.title("Матрица ошибок") - plt.xlabel("Предсказанный класс") - plt.ylabel("Истинный класс") - - # Сохранение графика - plt.tight_layout() - plt.savefig(save_path) - plt.close() - - -def upload_to_s3(local_file_path, bucket_name, s3_key, aws_access_key_id=None, aws_secret_access_key=None): - """Загрузка файла в S3""" - logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}") - - # Создание клиента S3 - if aws_access_key_id and aws_secret_access_key: - s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - else: - # Используем IAM роли или credentials из окружения - s3 = boto3.client("s3") - - # Загрузка файла - try: - s3.upload_file(local_file_path, bucket_name, s3_key) - logger.info(f"Файл успешно загружен в S3: s3://{bucket_name}/{s3_key}") - except Exception as e: - logger.error(f"Ошибка при загрузке файла в S3: {str(e)}") - raise - - -def main(): - """Главная функция для оценки модели""" - # Загрузка конфигурации - with open("configs/config.yaml", "r") as f: - config = yaml.safe_load(f) - - # Параметры S3 из переменных окружения - s3_bucket = os.getenv("S3_BUCKET") - s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/") - aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID") - aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY") - - # Параметры MLflow из переменных окружения - mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000") - mlflow_experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", "ml-project") - - # Настройка MLflow - mlflow.set_tracking_uri(mlflow_tracking_uri) - mlflow.set_experiment(mlflow_experiment_name) - - try: - # Загрузка тестовых данных из S3 или локально - test_data_path = os.path.join(config["data"]["processed_path"], config["data"]["test_file"]) - test_target_path = test_data_path.replace(".csv", "_target.csv") 
- - if s3_bucket: - X_test, y_test = load_processed_data_from_s3( - s3_bucket, - s3_processed_prefix + config["data"]["test_file"], - s3_processed_prefix + config["data"]["test_file"].replace(".csv", "_target.csv"), - aws_access_key_id, - aws_secret_access_key, - ) - else: - X_test, y_test = load_processed_data(test_data_path, test_target_path) - - # Загрузка обученной модели из S3 или локально - model_path = "models/model.pkl" - if s3_bucket: - model = load_model_from_s3(s3_bucket, "models/model.pkl", aws_access_key_id, aws_secret_access_key) - else: - model = load_model(model_path) - - # Оценка модели - test_metrics, y_pred = evaluate_model(model, X_test, y_test) - logger.info(f"Тестовые метрики: {test_metrics}") - - # Сохранение метрик - save_metrics(test_metrics, "metrics.json") - - # Построение матрицы ошибок - plot_confusion_matrix(y_test, y_pred, save_path="reports/figures/confusion_matrix.png") - - # Загрузка графиков и метрик в S3, если указан bucket - if s3_bucket: - upload_to_s3( - "reports/figures/confusion_matrix.png", - s3_bucket, - "reports/figures/confusion_matrix.png", - aws_access_key_id, - aws_secret_access_key, - ) - upload_to_s3("metrics.json", s3_bucket, "metrics.json", aws_access_key_id, aws_secret_access_key) - - # Логирование в MLflow, если URI указан - if mlflow_tracking_uri: - # Получение активного эксперимента - experiment = mlflow.get_experiment_by_name(mlflow_experiment_name) - if experiment: - # Поиск последнего запуска - runs = mlflow.search_runs( - experiment_ids=[experiment.experiment_id], order_by=["start_time DESC"], max_results=1 - ) - if not runs.empty: - run_id = runs.iloc[0]["run_id"] - with mlflow.start_run(run_id=run_id): - # Логирование метрик - mlflow.log_metrics(test_metrics) - - # Логирование артефактов - mlflow.log_artifact("reports/figures/confusion_matrix.png") - mlflow.log_artifact("metrics.json") - - logger.info("Оценка модели успешно завершена") - - except Exception as e: - logger.error(f"Ошибка в оценке модели: 
{str(e)}") - raise - - -if __name__ == "__main__": - main() diff --git a/src/models/train.py b/src/models/train.py deleted file mode 100644 index c657523..0000000 --- a/src/models/train.py +++ /dev/null @@ -1,218 +0,0 @@ -import json -import logging -import os -from io import StringIO - -import boto3 -import joblib -import mlflow -import mlflow.sklearn -import numpy as np -import pandas as pd -import yaml -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score - -# Настройка логирования -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def load_processed_data_from_s3(bucket_name, data_key, target_key, aws_access_key_id=None, aws_secret_access_key=None): - """Загрузка обработанных данных и целевых переменных из S3""" - logger.info(f"Загрузка обработанных данных из s3://{bucket_name}/{data_key} и s3://{bucket_name}/{target_key}") - - # Создание клиента S3 - if aws_access_key_id and aws_secret_access_key: - s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - else: - # Используем IAM роли или credentials из окружения - s3 = boto3.client("s3") - - try: - # Загрузка данных - data_response = s3.get_object(Bucket=bucket_name, Key=data_key) - data_content = data_response["Body"].read().decode("utf-8") - X = pd.read_csv(StringIO(data_content)) - - # Загрузка целевых переменных - target_response = s3.get_object(Bucket=bucket_name, Key=target_key) - target_content = target_response["Body"].read().decode("utf-8") - y = pd.read_csv(StringIO(target_content))["target"] - - return X, y - except Exception as e: - logger.error(f"Ошибка при загрузке данных из S3: {str(e)}") - raise - - -def load_processed_data(data_path, target_path): - """Загрузка обработанных данных и целевых переменных (локально)""" - logger.info(f"Загрузка обработанных данных из {data_path}") 
- X = pd.read_csv(data_path) - y = pd.read_csv(target_path)["target"] - return X, y - - -def get_model(model_name, model_params): - """Инициализация модели на основе конфигурации""" - logger.info(f"Инициализация модели {model_name}") - - if model_name.lower() == "random_forest": - return RandomForestClassifier(**model_params) - elif model_name.lower() == "logistic_regression": - return LogisticRegression(**model_params) - else: - raise ValueError(f"Неподдерживаемый тип модели: {model_name}") - - -def train_model(model, X_train, y_train): - """Обучение модели""" - logger.info("Обучение модели") - model.fit(X_train, y_train) - return model - - -def save_model(model, model_path): - """Сохранение обученной модели на диск""" - logger.info(f"Сохранение модели в {model_path}") - os.makedirs(os.path.dirname(model_path), exist_ok=True) - joblib.dump(model, model_path) - - -def evaluate_model(model, X_test, y_test): - """Оценка производительности модели""" - logger.info("Оценка модели") - y_pred = model.predict(X_test) - - metrics = { - "accuracy": accuracy_score(y_test, y_pred), - "precision": precision_score(y_test, y_pred, average="weighted"), - "recall": recall_score(y_test, y_pred, average="weighted"), - "f1_score": f1_score(y_test, y_pred, average="weighted"), - } - - return metrics - - -def save_metrics(metrics, metrics_path): - """Сохранение метрик в JSON файл""" - logger.info(f"Сохранение метрик в {metrics_path}") - with open(metrics_path, "w") as f: - json.dump(metrics, f, indent=2) - - -def upload_to_s3(local_file_path, bucket_name, s3_key, aws_access_key_id=None, aws_secret_access_key=None): - """Загрузка файла в S3""" - logger.info(f"Загрузка {local_file_path} в s3://{bucket_name}/{s3_key}") - - # Создание клиента S3 - if aws_access_key_id and aws_secret_access_key: - s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key) - else: - # Используем IAM роли или credentials из окружения - s3 = boto3.client("s3") - - 
# Загрузка файла - try: - s3.upload_file(local_file_path, bucket_name, s3_key) - logger.info(f"Файл успешно загружен в S3: s3://{bucket_name}/{s3_key}") - except Exception as e: - logger.error(f"Ошибка при загрузке файла в S3: {str(e)}") - raise - - -def main(): - """Главная функция для обучения модели""" - # Загрузка конфигурации - with open("configs/config.yaml", "r") as f: - config = yaml.safe_load(f) - - # Для обратной совместимости будем использовать config для всех параметров - params = config - - # Параметры S3 из переменных окружения - s3_bucket = os.getenv("S3_BUCKET") - s3_processed_prefix = os.getenv("S3_PROCESSED_PREFIX", "data/processed/") - aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID") - aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY") - - # Параметры MLflow из переменных окружения - mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000") - mlflow_experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", "ml-project") - - # Настройка MLflow - mlflow.set_tracking_uri(mlflow_tracking_uri) - mlflow.set_experiment(mlflow_experiment_name) - - try: - # Загрузка обработанных данных из S3 или локально - train_data_path = os.path.join(config["data"]["processed_path"], config["data"]["train_file"]) - train_target_path = train_data_path.replace(".csv", "_target.csv") - - if s3_bucket: - X_train, y_train = load_processed_data_from_s3( - s3_bucket, - s3_processed_prefix + config["data"]["train_file"], - s3_processed_prefix + config["data"]["train_file"].replace(".csv", "_target.csv"), - aws_access_key_id, - aws_secret_access_key, - ) - else: - X_train, y_train = load_processed_data(train_data_path, train_target_path) - - # Начало эксперимента MLflow - with mlflow.start_run(): - # Логирование параметров - mlflow.log_params( - { - "model_name": params["model"]["name"], - "test_size": params["data"]["test_size"], - "random_state": params["data"]["random_state"], - } - ) - - # Логирование гиперпараметров модели - for param, value in 
params["model"]["params"].items(): - mlflow.log_param(f"model_{param}", value) - - # Инициализация модели - model = get_model(params["model"]["name"], params["model"]["params"]) - - # Обучение модели - trained_model = train_model(model, X_train, y_train) - - # Сохранение модели - model_path = "models/model.pkl" - save_model(trained_model, model_path) - - # Загрузка модели в S3, если указан bucket - if s3_bucket: - upload_to_s3(model_path, s3_bucket, "models/model.pkl", aws_access_key_id, aws_secret_access_key) - - # Оценка на обучающем наборе (для демонстрации) - train_metrics = evaluate_model(trained_model, X_train, y_train) - logger.info(f"Метрики обучения: {train_metrics}") - - # Логирование метрик в MLflow - mlflow.log_metrics(train_metrics) - - # Сохранение метрик локально - save_metrics(train_metrics, "metrics.json") - - # Логирование артефактов модели в MLflow - mlflow.sklearn.log_model(trained_model, "model") - - # Логирование файла метрик как артефакт - mlflow.log_artifact("metrics.json") - - logger.info("Обучение модели успешно завершено") - - except Exception as e: - logger.error(f"Ошибка в обучении модели: {str(e)}") - raise - - -if __name__ == "__main__": - main() diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..9690669 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,15 @@ +from .useful_functions import extract_coordinates, move_and_remove_files, merge_tables_with_tolerance, levenshtein_distance +from .s3 import s3_download_file, get_s3_client, s3_list_files, s3_upload_file +from .zip import extract_zip_advanced + +__all__ = [ + "extract_coordinates", + "move_and_remove_files", + "merge_tables_with_tolerance", + "levenshtein_distance", + "s3_download_file", + "get_s3_client", + "s3_list_files", + "s3_upload_file", + "extract_zip_advanced", +] diff --git a/src/utils/useful_functions.py b/src/utils/useful_functions.py index 6e27b5c..5e0850e 100644 --- a/src/utils/useful_functions.py +++ 
b/src/utils/useful_functions.py @@ -1,7 +1,10 @@ +import collections +import itertools import os import re import shutil import zipfile +from itertools import zip_longest from math import atan2, cos, radians, sin, sqrt from pathlib import Path @@ -38,16 +41,32 @@ def extract_coordinates(coord_string): return None, None -def merge_tables_with_tolerance(target, - real_data, - target_lat_name: str = 'latitude', - target_lot_name: str = 'longitude', - real_data_lat_name: str = 'latitude', - real_data_lot_name: str = 'longitude', - max_distance_meters=100): +def merge_tables_with_tolerance( + target, + real_data, + target_lat_name="latitude", + target_lot_name="longitude", + real_data_lat_name="latitude", + real_data_lot_name="longitude", + max_distance_meters=100, +): + # Проверка существования колонок + if target_lat_name not in target.columns: + raise ValueError(f"Колонка {target_lat_name} не найдена в target") + if target_lot_name not in target.columns: + raise ValueError(f"Колонка {target_lot_name} не найдена в target") + if real_data_lat_name not in real_data.columns: + raise ValueError(f"Колонка {real_data_lat_name} не найдена в real_data") + if real_data_lot_name not in real_data.columns: + raise ValueError(f"Колонка {real_data_lot_name} не найдена в real_data") + # Переименование колонок - df1 = target.rename(columns={target.columns[0]: "filename", target.columns[target_lat_name]: "lat_target", target.columns[target_lot_name]: "lon_target"}) - df2 = real_data.rename(columns={real_data.columns[0]: "camera_id", real_data.columns[real_data_lat_name]: "lat_real", real_data.columns[real_data_lot_name]: "lon_real"}) + df1 = target.rename( + columns={target.columns[0]: "filename", target_lat_name: "lat_target", target_lot_name: "lon_target"} + ) + df2 = real_data.rename( + columns={real_data.columns[0]: "camera_id", real_data_lat_name: "lat_real", real_data_lot_name: "lon_real"} + ) # Преобразование координат в радианы для сферического расстояния coords1 = 
np.radians(df1[["lat_target", "lon_target"]].values) @@ -73,3 +92,21 @@ def merge_tables_with_tolerance(target, result = result[result["distance_m"] <= max_distance_meters].sort_values("distance_m") return result.reset_index(drop=True) + + +def levenshtein_distance(string1, string2): + """ + >>> levenshtein_distance('AATZ', 'AAAZ') + 1 + >>> levenshtein_distance('AATZZZ', 'AAAZ') + 3 + """ + distance = 0 + if len(string1) < len(string2): + string1, string2 = string2, string1 + + # Заменяем itertools.izip_longest на zip_longest для Python 3 + for i, v in zip_longest(string1, string2, fillvalue="-"): + if i != v: + distance += 1 + return distance