From 1dfbad8d7d08b8a6a1db17a25d29d9052bc64444 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 15:47:36 +0200 Subject: [PATCH 01/11] refactor: savestate --- .../FR/geo/latitude_l93/__init__.py | 1 + .../geo/latitude_wgs_fr_metropole/__init__.py | 1 + .../FR/geo/longitude_l93/__init__.py | 1 + .../longitude_wgs_fr_metropole/__init__.py | 1 + .../FR/{other => temp}/date_fr/__init__.py | 1 + csv_detective/detect_fields/__init__.py | 3 +- .../geo/json_geojson/__init__.py | 1 + .../geo/latitude_wgs/__init__.py | 1 + .../geo/longitude_wgs/__init__.py | 1 + .../temp/datetime_rfc822/__init__.py | 1 + .../detect_fields/temp/year/__init__.py | 1 + .../FR/{other => temp}/date_fr/__init__.py | 0 csv_detective/detect_labels/__init__.py | 3 +- csv_detective/detection/formats.py | 1 + csv_detective/load_tests.py | 22 +++++- csv_detective/parsing/columns.py | 70 +++++++++++++------ csv_detective/validate.py | 1 + tests/test_fields.py | 3 +- 18 files changed, 83 insertions(+), 30 deletions(-) rename csv_detective/detect_fields/FR/{other => temp}/date_fr/__init__.py (94%) rename csv_detective/detect_labels/FR/{other => temp}/date_fr/__init__.py (100%) diff --git a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py index c01fc58f..64e2c654 100644 --- a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +++ b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py @@ -4,6 +4,7 @@ from csv_detective.detect_fields.other.float import float_casting PROPORTION = 0.9 +PARENT = "float" _latitudel93 = LatitudeL93() diff --git a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py index 333fa182..53153d87 100644 --- a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "latitude_wgs" def _is(val): diff --git a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py index 58cfeed1..dfa90fc7 100644 --- a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +++ b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py @@ -4,6 +4,7 @@ from csv_detective.detect_fields.other.float import float_casting PROPORTION = 0.9 +PARENT = "float" _longitudel93 = LongitudeL93() diff --git a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py index 30036066..7fa7b60d 100644 --- a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "longitude_wgs" def _is(val): diff --git a/csv_detective/detect_fields/FR/other/date_fr/__init__.py b/csv_detective/detect_fields/FR/temp/date_fr/__init__.py similarity index 94% rename from csv_detective/detect_fields/FR/other/date_fr/__init__.py rename to csv_detective/detect_fields/FR/temp/date_fr/__init__.py index 1d234dda..89e8419c 100644 --- a/csv_detective/detect_fields/FR/other/date_fr/__init__.py +++ b/csv_detective/detect_fields/FR/temp/date_fr/__init__.py @@ -1,6 +1,7 @@ import re PROPORTION = 1 +PARENT = "date" regex = ( r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" r"|octobre|novembre|decembre)[ \-]\d{4}$" diff --git a/csv_detective/detect_fields/__init__.py b/csv_detective/detect_fields/__init__.py index c47c0019..fb6d06e5 100644 --- a/csv_detective/detect_fields/__init__.py +++ b/csv_detective/detect_fields/__init__.py @@ -21,7 +21,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -29,7 +28,7 @@ tel_fr, uai, ) -from .FR.temp import jour_de_la_semaine, mois_de_annee +from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from .geo import ( iso_country_code_alpha2, iso_country_code_alpha3, diff --git a/csv_detective/detect_fields/geo/json_geojson/__init__.py b/csv_detective/detect_fields/geo/json_geojson/__init__.py index 2f7a06bd..be0f80bb 100644 --- a/csv_detective/detect_fields/geo/json_geojson/__init__.py +++ b/csv_detective/detect_fields/geo/json_geojson/__init__.py @@ -1,6 +1,7 @@ import json PROPORTION = 0.9 +PARENT = "json" def _is(val): diff --git a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py index 4ae9ef19..ab374ad5 100644 --- a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py +++ b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "float" def _is(val): diff --git a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py index 83a5ea2a..33f3a496 100644 --- a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py +++ b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "float" def _is(val): diff --git a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py index ea2f6078..2a4b3584 100644 --- a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +++ b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py @@ -1,6 +1,7 @@ import re PROPORTION = 1 +PARENT = "datetime_aware" def _is(val): diff --git a/csv_detective/detect_fields/temp/year/__init__.py b/csv_detective/detect_fields/temp/year/__init__.py index 79a68e1f..2975b306 100644 --- a/csv_detective/detect_fields/temp/year/__init__.py +++ b/csv_detective/detect_fields/temp/year/__init__.py @@ -1,4 +1,5 @@ PROPORTION = 1 +PARENT = "int" def _is(val): diff --git a/csv_detective/detect_labels/FR/other/date_fr/__init__.py b/csv_detective/detect_labels/FR/temp/date_fr/__init__.py similarity index 100% rename from csv_detective/detect_labels/FR/other/date_fr/__init__.py rename to csv_detective/detect_labels/FR/temp/date_fr/__init__.py diff --git a/csv_detective/detect_labels/__init__.py b/csv_detective/detect_labels/__init__.py index c78d34cb..f5ffea16 100644 --- a/csv_detective/detect_labels/__init__.py +++ b/csv_detective/detect_labels/__init__.py @@ -20,7 +20,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -28,7 +27,7 @@ tel_fr, uai, ) -from .FR.temp import jour_de_la_semaine, mois_de_annee +from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from .geo import ( iso_country_code_alpha2, iso_country_code_alpha3, diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index 51fb52f1..d5676651 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -73,6 +73,7 @@ def detect_formats( scores_table_fields = test_col( table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose ) + print(scores_table_fields) analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output) # Perform testing on labels diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py index e1938ad0..a3f2382d 100755 --- a/csv_detective/load_tests.py +++ b/csv_detective/load_tests.py @@ -1,10 +1,11 @@ import os +from types import ModuleType from typing import Union from csv_detective import detect_fields, detect_labels # noqa -def get_all_packages(detect_type) -> list: +def get_all_packages(detect_type) -> list[str]: root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type modules = [] for dirpath, _, filenames in os.walk(root_dir): @@ -20,7 +21,7 @@ def get_all_packages(detect_type) -> list: def return_all_tests( user_input_tests: Union[str, list], detect_type: str, -) -> list: +) -> list[ModuleType]: """ returns all tests that have a method _is and are listed in the user_input_tests the function can select a sub_package from csv_detective @@ -51,3 +52,20 @@ def return_all_tests( # to remove groups of tests all_tests = [test for test in all_tests if "_is" in dir(test)] return all_tests + + +def build_test_priorities(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]: + tests_dict = { + test.__name__.split(".")[-1]: { + "func": test._is, + "prop": test.PROPORTION, + "parent": getattr(test, "PARENT", None), + } + for test in tests + } + parents = {v["parent"] for v in tests_dict.values() if v["parent"] is not None} + specific_tests = { + k: v for k, v in tests_dict.items() + if k not in parents + } + return tests_dict, specific_tests diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index b83bec25..09fdd335 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -1,9 +1,11 @@ +from collections import defaultdict import logging from time import time from typing import Callable import pandas as pd +from csv_detective.load_tests import build_test_priorities from csv_detective.utils import display_logs_depending_process_time MAX_ROWS_ANALYSIS = int(1e4) @@ -89,40 +91,64 @@ def test_col( if verbose: start = time() logging.info("Testing columns to get types") - test_funcs = { - test.__name__.split(".")[-1]: { - "func": test._is, - "prop": test.PROPORTION, - } - for test in all_tests - } - return_table = pd.DataFrame(columns=table.columns) - for idx, (key, value) in enumerate(test_funcs.items()): + test_funcs, specific_tests = build_test_priorities(all_tests) + results = defaultdict(dict) + nb_cols = len(table.columns) + for idx, column in enumerate(table.columns): if verbose: - start_type = time() - logging.info(f"\t- Starting with type '{key}'") - # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory - # => the following needs to change, "apply" means all columns are tested for one type at once - return_table.loc[key] = table.apply( - lambda serie: test_col_val( - serie, - value["func"], - value["prop"], + start_col = time() + logging.info(f"\t- Starting with column '{column}'") + tested = set() + # testing for the most specific formats first (we have early stops in test_col_val) + for test_name, test_attr in specific_tests.items(): + results[column][test_name] = test_col_val( + table[column], + test_attr["func"], + test_attr["prop"], skipna=skipna, limited_output=limited_output, verbose=verbose, ) - ) + print(f"{test_name}: {results[column][test_name]}") + tested.add(test_name) + # should we break if one of the specific tests is successful? + # performing less and less specific tests if specific ones fail + for test_name in [test for test in specific_tests if test not in tested]: + current_test = test_name + while test_funcs[current_test]["parent"] is not None: + if test_funcs[current_test]["parent"] in results[column]: + print(f"already here {test_name}: {results[column][test_name]}") + # already tested as a parent of a previous test + break + if results[column][current_test] > 0: + # if a child test is successful, we set the parent's score to the same value + # this is not perfect: the column can be 50% child but 100% parent + # we would have to perform the parent test to know exactly, but this saves much time + results[column][test_funcs[current_test]["parent"]] = results[column][current_test] + print(f"bypassed {test_name}: {results[column][test_name]}") + else: + results[column][test_funcs[current_test]["parent"]] = test_col_val( + table[column], + test_attr["func"], + test_attr["prop"], + skipna=skipna, + limited_output=limited_output, + verbose=verbose, + ) + print(f"processed {test_name}: {results[column][test_name]}") + tested.add(current_test) + current_test = test_funcs[current_test]["parent"] if verbose: display_logs_depending_process_time( - f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})', - time() - start_type, + f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' + f' ({idx + 1}/{nb_cols}), {len(tested)} tests performed', + time() - start_col, ) if verbose: display_logs_depending_process_time( f"Done testing columns in {round(time() - start, 3)}s", time() - start ) - return return_table + return pd.DataFrame(results) def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False): diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 1648b65e..2a58c48a 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -13,6 +13,7 @@ t.__name__.split(".")[-1]: { "func": t._is, "prop": t.PROPORTION, + "parent": getattr(t, "PARENT", None), } for t in return_all_tests("ALL", "detect_fields") } diff --git a/tests/test_fields.py b/tests/test_fields.py index c1397645..6e833912 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -29,7 +29,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -37,7 +36,7 @@ tel_fr, uai, ) -from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee +from csv_detective.detect_fields.FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from csv_detective.detect_fields.geo import ( iso_country_code_alpha2, iso_country_code_alpha3, From 79a1e61f04b9dd840824fe297f2737d57b754eae Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 17:03:12 +0200 Subject: [PATCH 02/11] fix: make it work --- csv_detective/parsing/columns.py | 30 ++++++++++++++---------------- csv_detective/validate.py | 11 ++--------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 09fdd335..c2027ed2 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -109,35 +109,33 @@ def test_col( limited_output=limited_output, verbose=verbose, ) - print(f"{test_name}: {results[column][test_name]}") tested.add(test_name) # should we break if one of the specific tests is successful? # performing less and less specific tests if specific ones fail - for test_name in [test for test in specific_tests if test not in tested]: - current_test = test_name - while test_funcs[current_test]["parent"] is not None: - if test_funcs[current_test]["parent"] in results[column]: - print(f"already here {test_name}: {results[column][test_name]}") - # already tested as a parent of a previous test + for test_name in specific_tests: + current = test_name + parent = test_funcs[current]["parent"] + while parent is not None: + if parent in results[column]: + # already tested as a parent of a previous test, no need to get higher parents break - if results[column][current_test] > 0: + if results[column][current] > 0: # if a child test is successful, we set the parent's score to the same value # this is not perfect: the column can be 50% child but 100% parent # we would have to perform the parent test to know exactly, but this saves much time - results[column][test_funcs[current_test]["parent"]] = results[column][current_test] - print(f"bypassed {test_name}: {results[column][test_name]}") + results[column][parent] = results[column][current] else: - results[column][test_funcs[current_test]["parent"]] = test_col_val( + results[column][parent] = test_col_val( table[column], - test_attr["func"], - test_attr["prop"], + test_funcs[parent]["func"], + test_funcs[parent]["prop"], skipna=skipna, limited_output=limited_output, verbose=verbose, ) - print(f"processed {test_name}: {results[column][test_name]}") - tested.add(current_test) - current_test = test_funcs[current_test]["parent"] + tested.add(parent) + current = parent + parent = test_funcs[current]["parent"] if verbose: display_logs_depending_process_time( f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 2a58c48a..e1ecd6b7 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -3,20 +3,13 @@ import pandas as pd -from csv_detective.load_tests import return_all_tests +from csv_detective.load_tests import build_test_priorities, return_all_tests from csv_detective.parsing.columns import test_col_val from csv_detective.parsing.load import load_file logging.basicConfig(level=logging.INFO) -tests = { - t.__name__.split(".")[-1]: { - "func": t._is, - "prop": t.PROPORTION, - "parent": getattr(t, "PARENT", None), - } - for t in return_all_tests("ALL", "detect_fields") -} +tests, _ = build_test_priorities(return_all_tests("ALL", "detect_fields")) def validate( From 6bcd63da95f3c089d64b17bed8cf12bdbcfbbcbc Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 17:10:26 +0200 Subject: [PATCH 03/11] refactor: better name --- csv_detective/load_tests.py | 2 +- csv_detective/parsing/columns.py | 4 ++-- csv_detective/validate.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py index a3f2382d..9ddd2895 100755 --- a/csv_detective/load_tests.py +++ b/csv_detective/load_tests.py @@ -54,7 +54,7 @@ def return_all_tests( return all_tests -def build_test_priorities(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]: +def build_tests_dicts(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]: tests_dict = { test.__name__.split(".")[-1]: { "func": test._is, diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index c2027ed2..23d64a75 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -5,7 +5,7 @@ import pandas as pd -from csv_detective.load_tests import build_test_priorities +from csv_detective.load_tests import build_tests_dicts from csv_detective.utils import display_logs_depending_process_time MAX_ROWS_ANALYSIS = int(1e4) @@ -91,7 +91,7 @@ def test_col( if verbose: start = time() logging.info("Testing columns to get types") - test_funcs, specific_tests = build_test_priorities(all_tests) + test_funcs, specific_tests = build_tests_dicts(all_tests) results = defaultdict(dict) nb_cols = len(table.columns) for idx, column in enumerate(table.columns): diff --git a/csv_detective/validate.py b/csv_detective/validate.py index e1ecd6b7..8dcdd0eb 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -3,13 +3,13 @@ import pandas as pd -from csv_detective.load_tests import build_test_priorities, return_all_tests +from csv_detective.load_tests import build_tests_dicts, return_all_tests from csv_detective.parsing.columns import test_col_val from csv_detective.parsing.load import load_file logging.basicConfig(level=logging.INFO) -tests, _ = build_test_priorities(return_all_tests("ALL", "detect_fields")) +tests, _ = build_tests_dicts(return_all_tests("ALL", "detect_fields")) def validate( From 66b2d45749a845b0cd6f54fc9d9a4de5a1ea1646 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 17:13:18 +0200 Subject: [PATCH 04/11] docs: update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5038efe6..6ce2b017 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ ## Current (in progress) - Better email detection [#151](https://github.com/datagouv/csv-detective/pull/151) -- Sample can handle full NaN columns [#152](https://github.com/datagouv/csv-detective/pull/152) +- Sample can handle full NaN columns [#154](https://github.com/datagouv/csv-detective/pull/154) +- Add checks priorization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155) ## 0.9.2 (2025-08-26) From b7f5c4f9ad3a51875a0c793e77f7a6db3ec72467 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 17:13:46 +0200 Subject: [PATCH 05/11] chore: lint --- csv_detective/load_tests.py | 5 +---- csv_detective/parsing/columns.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py index 9ddd2895..be1eb400 100755 --- a/csv_detective/load_tests.py +++ b/csv_detective/load_tests.py @@ -64,8 +64,5 @@ def build_tests_dicts(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[st for test in tests } parents = {v["parent"] for v in tests_dict.values() if v["parent"] is not None} - specific_tests = { - k: v for k, v in tests_dict.items() - if k not in parents - } + specific_tests = {k: v for k, v in tests_dict.items() if k not in parents} return tests_dict, specific_tests diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 23d64a75..51d13f93 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -1,5 +1,5 @@ -from collections import defaultdict import logging +from collections import defaultdict from time import time from typing import Callable @@ -139,7 +139,7 @@ def test_col( if verbose: display_logs_depending_process_time( f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' - f' ({idx + 1}/{nb_cols}), {len(tested)} tests performed', + f" ({idx + 1}/{nb_cols}), {len(tested)} tests performed", time() - start_col, ) if verbose: From 0e3ac232c8889bb074fd5bf213b54fa567a1f897 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Thu, 28 Aug 2025 17:32:33 +0200 Subject: [PATCH 06/11] fix: remove log --- csv_detective/detection/formats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index d5676651..51fb52f1 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -73,7 +73,6 @@ def detect_formats( scores_table_fields = test_col( table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose ) - print(scores_table_fields) analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output) # Perform testing on labels From a0cdcb278a460311180ab21683f947c15cc93d06 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 29 Aug 2025 10:11:06 +0200 Subject: [PATCH 07/11] savestate --- CHANGELOG.md | 2 +- csv_detective/parsing/columns.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce2b017..04cca5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ - Better email detection [#151](https://github.com/datagouv/csv-detective/pull/151) - Sample can handle full NaN columns [#154](https://github.com/datagouv/csv-detective/pull/154) -- Add checks priorization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155) +- Add tests priorization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155) ## 0.9.2 (2025-08-26) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 51d13f93..5d7fe803 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -112,10 +112,17 @@ def test_col( tested.add(test_name) # should we break if one of the specific tests is successful? # performing less and less specific tests if specific ones fail - for test_name in specific_tests: + # starting with highest scores to set the parents from there + for test_name in reversed([ + test for test, _ in sorted( + (tup for tup in results[column].items()), + key=lambda tup: tup[1], + ) + ]): current = test_name parent = test_funcs[current]["parent"] while parent is not None: + print(current, parent) if parent in results[column]: # already tested as a parent of a previous test, no need to get higher parents break @@ -123,6 +130,7 @@ def test_col( # if a child test is successful, we set the parent's score to the same value # this is not perfect: the column can be 50% child but 100% parent # we would have to perform the parent test to know exactly, but this saves much time + print(f"setting {parent} from {current}, score : {results[column][current]}") results[column][parent] = results[column][current] else: results[column][parent] = test_col_val( @@ -134,8 +142,7 @@ def test_col( verbose=verbose, ) tested.add(parent) - current = parent - parent = test_funcs[current]["parent"] + current, parent = parent, test_funcs[parent]["parent"] if verbose: display_logs_depending_process_time( f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' From 7d7db5ae40dd6df47c3eee7aacc55ba3de39a295 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 29 Aug 2025 10:24:22 +0200 Subject: [PATCH 08/11] refactor: savestate --- csv_detective/parsing/columns.py | 1 - 1 file changed, 1 deletion(-) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 5d7fe803..f4762da9 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -122,7 +122,6 @@ def test_col( current = test_name parent = test_funcs[current]["parent"] while parent is not None: - print(current, parent) if parent in results[column]: # already tested as a parent of a previous test, no need to get higher parents break From 7b167e48957b2ae52f212320d2830cd2df9b5972 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 29 Aug 2025 10:55:13 +0200 Subject: [PATCH 09/11] fix: remove log --- csv_detective/parsing/columns.py | 1 - 1 file changed, 1 deletion(-) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index f4762da9..b4e4bc60 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -129,7 +129,6 @@ def test_col( # if a child test is successful, we set the parent's score to the same value # this is not perfect: the column can be 50% child but 100% parent # we would have to perform the parent test to know exactly, but this saves much time - print(f"setting {parent} from {current}, score : {results[column][current]}") results[column][parent] = results[column][current] else: results[column][parent] = test_col_val( From 551a3c8f5c17bffaac57e79664fadb4e967fea72 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 29 Aug 2025 10:57:03 +0200 Subject: [PATCH 10/11] chore: lint --- csv_detective/parsing/columns.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index b4e4bc60..724c1213 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -113,12 +113,15 @@ def test_col( # should we break if one of the specific tests is successful? # performing less and less specific tests if specific ones fail # starting with highest scores to set the parents from there - for test_name in reversed([ - test for test, _ in sorted( - (tup for tup in results[column].items()), - key=lambda tup: tup[1], - ) - ]): + for test_name in reversed( + [ + test + for test, _ in sorted( + (tup for tup in results[column].items()), + key=lambda tup: tup[1], + ) + ] + ): current = test_name parent = test_funcs[current]["parent"] while parent is not None: From e05e646393a9188fc8cd45093b14cd860258ccb4 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 29 Aug 2025 11:22:49 +0200 Subject: [PATCH 11/11] refactor: better logs --- csv_detective/parsing/columns.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 724c1213..c86b241c 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -97,7 +97,7 @@ def test_col( for idx, column in enumerate(table.columns): if verbose: start_col = time() - logging.info(f"\t- Starting with column '{column}'") + logging.info(f"\t- Starting with column '{column}' ({idx + 1}/{nb_cols})") tested = set() # testing for the most specific formats first (we have early stops in test_col_val) for test_name, test_attr in specific_tests.items(): @@ -147,7 +147,7 @@ def test_col( if verbose: display_logs_depending_process_time( f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' - f" ({idx + 1}/{nb_cols}), {len(tested)} tests performed", + f", {len(tested)} tests performed", time() - start_col, ) if verbose: