diff --git a/CHANGELOG.md b/CHANGELOG.md index 5038efe6..04cca5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ ## Current (in progress) - Better email detection [#151](https://github.com/datagouv/csv-detective/pull/151) -- Sample can handle full NaN columns [#152](https://github.com/datagouv/csv-detective/pull/152) +- Sample can handle full NaN columns [#154](https://github.com/datagouv/csv-detective/pull/154) +- Add test prioritization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155) ## 0.9.2 (2025-08-26) diff --git a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py index c01fc58f..64e2c654 100644 --- a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +++ b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py @@ -4,6 +4,7 @@ from csv_detective.detect_fields.other.float import float_casting PROPORTION = 0.9 +PARENT = "float" _latitudel93 = LatitudeL93() diff --git a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py index 333fa182..53153d87 100644 --- a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "latitude_wgs" def _is(val): diff --git a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py index 58cfeed1..dfa90fc7 100644 --- a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +++ b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py @@ -4,6 +4,7 @@ from csv_detective.detect_fields.other.float import float_casting PROPORTION = 0.9 +PARENT = "float" _longitudel93 = LongitudeL93() diff --git 
a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py index 30036066..7fa7b60d 100644 --- a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "longitude_wgs" def _is(val): diff --git a/csv_detective/detect_fields/FR/other/date_fr/__init__.py b/csv_detective/detect_fields/FR/temp/date_fr/__init__.py similarity index 94% rename from csv_detective/detect_fields/FR/other/date_fr/__init__.py rename to csv_detective/detect_fields/FR/temp/date_fr/__init__.py index 1d234dda..89e8419c 100644 --- a/csv_detective/detect_fields/FR/other/date_fr/__init__.py +++ b/csv_detective/detect_fields/FR/temp/date_fr/__init__.py @@ -1,6 +1,7 @@ import re PROPORTION = 1 +PARENT = "date" regex = ( r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" r"|octobre|novembre|decembre)[ \-]\d{4}$" diff --git a/csv_detective/detect_fields/__init__.py b/csv_detective/detect_fields/__init__.py index c47c0019..fb6d06e5 100644 --- a/csv_detective/detect_fields/__init__.py +++ b/csv_detective/detect_fields/__init__.py @@ -21,7 +21,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -29,7 +28,7 @@ tel_fr, uai, ) -from .FR.temp import jour_de_la_semaine, mois_de_annee +from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from .geo import ( iso_country_code_alpha2, iso_country_code_alpha3, diff --git a/csv_detective/detect_fields/geo/json_geojson/__init__.py b/csv_detective/detect_fields/geo/json_geojson/__init__.py index 2f7a06bd..be0f80bb 100644 --- a/csv_detective/detect_fields/geo/json_geojson/__init__.py +++ b/csv_detective/detect_fields/geo/json_geojson/__init__.py @@ -1,6 +1,7 @@ import json PROPORTION = 0.9 +PARENT = "json" def 
_is(val): diff --git a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py index 4ae9ef19..ab374ad5 100644 --- a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py +++ b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "float" def _is(val): diff --git a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py index 83a5ea2a..33f3a496 100644 --- a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py +++ b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py @@ -1,6 +1,7 @@ from csv_detective.detect_fields.other.float import _is as is_float PROPORTION = 0.9 +PARENT = "float" def _is(val): diff --git a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py index ea2f6078..2a4b3584 100644 --- a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +++ b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py @@ -1,6 +1,7 @@ import re PROPORTION = 1 +PARENT = "datetime_aware" def _is(val): diff --git a/csv_detective/detect_fields/temp/year/__init__.py b/csv_detective/detect_fields/temp/year/__init__.py index 79a68e1f..2975b306 100644 --- a/csv_detective/detect_fields/temp/year/__init__.py +++ b/csv_detective/detect_fields/temp/year/__init__.py @@ -1,4 +1,5 @@ PROPORTION = 1 +PARENT = "int" def _is(val): diff --git a/csv_detective/detect_labels/FR/other/date_fr/__init__.py b/csv_detective/detect_labels/FR/temp/date_fr/__init__.py similarity index 100% rename from csv_detective/detect_labels/FR/other/date_fr/__init__.py rename to csv_detective/detect_labels/FR/temp/date_fr/__init__.py diff --git a/csv_detective/detect_labels/__init__.py b/csv_detective/detect_labels/__init__.py index c78d34cb..f5ffea16 100644 --- 
a/csv_detective/detect_labels/__init__.py +++ b/csv_detective/detect_labels/__init__.py @@ -20,7 +20,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -28,7 +27,7 @@ tel_fr, uai, ) -from .FR.temp import jour_de_la_semaine, mois_de_annee +from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from .geo import ( iso_country_code_alpha2, iso_country_code_alpha3, diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py index e1938ad0..be1eb400 100755 --- a/csv_detective/load_tests.py +++ b/csv_detective/load_tests.py @@ -1,10 +1,11 @@ import os +from types import ModuleType from typing import Union from csv_detective import detect_fields, detect_labels # noqa -def get_all_packages(detect_type) -> list: +def get_all_packages(detect_type) -> list[str]: root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type modules = [] for dirpath, _, filenames in os.walk(root_dir): @@ -20,7 +21,7 @@ def get_all_packages(detect_type) -> list: def return_all_tests( user_input_tests: Union[str, list], detect_type: str, -) -> list: +) -> list[ModuleType]: """ returns all tests that have a method _is and are listed in the user_input_tests the function can select a sub_package from csv_detective @@ -51,3 +52,17 @@ def return_all_tests( # to remove groups of tests all_tests = [test for test in all_tests if "_is" in dir(test)] return all_tests + + +def build_tests_dicts(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]: + tests_dict = { + test.__name__.split(".")[-1]: { + "func": test._is, + "prop": test.PROPORTION, + "parent": getattr(test, "PARENT", None), + } + for test in tests + } + parents = {v["parent"] for v in tests_dict.values() if v["parent"] is not None} + specific_tests = {k: v for k, v in tests_dict.items() if k not in parents} + return tests_dict, specific_tests diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index b83bec25..c86b241c 100755 --- 
a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -1,9 +1,11 @@ import logging +from collections import defaultdict from time import time from typing import Callable import pandas as pd +from csv_detective.load_tests import build_tests_dicts from csv_detective.utils import display_logs_depending_process_time MAX_ROWS_ANALYSIS = int(1e4) @@ -89,40 +91,70 @@ def test_col( if verbose: start = time() logging.info("Testing columns to get types") - test_funcs = { - test.__name__.split(".")[-1]: { - "func": test._is, - "prop": test.PROPORTION, - } - for test in all_tests - } - return_table = pd.DataFrame(columns=table.columns) - for idx, (key, value) in enumerate(test_funcs.items()): + test_funcs, specific_tests = build_tests_dicts(all_tests) + results = defaultdict(dict) + nb_cols = len(table.columns) + for idx, column in enumerate(table.columns): if verbose: - start_type = time() - logging.info(f"\t- Starting with type '{key}'") - # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory - # => the following needs to change, "apply" means all columns are tested for one type at once - return_table.loc[key] = table.apply( - lambda serie: test_col_val( - serie, - value["func"], - value["prop"], + start_col = time() + logging.info(f"\t- Starting with column '{column}' ({idx + 1}/{nb_cols})") + tested = set() + # testing for the most specific formats first (we have early stops in test_col_val) + for test_name, test_attr in specific_tests.items(): + results[column][test_name] = test_col_val( + table[column], + test_attr["func"], + test_attr["prop"], skipna=skipna, limited_output=limited_output, verbose=verbose, ) - ) + tested.add(test_name) + # should we break if one of the specific tests is successful? 
+ # performing less and less specific tests if specific ones fail + # starting with highest scores to set the parents from there + for test_name in reversed( + [ + test + for test, _ in sorted( + (tup for tup in results[column].items()), + key=lambda tup: tup[1], + ) + ] + ): + current = test_name + parent = test_funcs[current]["parent"] + while parent is not None: + if parent in results[column]: + # already tested as a parent of a previous test, no need to get higher parents + break + if results[column][current] > 0: + # if a child test is successful, we set the parent's score to the same value + # this is not perfect: the column can be 50% child but 100% parent + # we would have to perform the parent test to know exactly, but this saves much time + results[column][parent] = results[column][current] + else: + results[column][parent] = test_col_val( + table[column], + test_funcs[parent]["func"], + test_funcs[parent]["prop"], + skipna=skipna, + limited_output=limited_output, + verbose=verbose, + ) + tested.add(parent) + current, parent = parent, test_funcs[parent]["parent"] if verbose: display_logs_depending_process_time( - f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})', - time() - start_type, + f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s' + f", {len(tested)} tests performed", + time() - start_col, ) if verbose: display_logs_depending_process_time( f"Done testing columns in {round(time() - start, 3)}s", time() - start ) - return return_table + return pd.DataFrame(results) def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False): diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 1648b65e..8dcdd0eb 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -3,19 +3,13 @@ import pandas as pd -from csv_detective.load_tests import return_all_tests +from csv_detective.load_tests import build_tests_dicts, return_all_tests 
from csv_detective.parsing.columns import test_col_val from csv_detective.parsing.load import load_file logging.basicConfig(level=logging.INFO) -tests = { - t.__name__.split(".")[-1]: { - "func": t._is, - "prop": t.PROPORTION, - } - for t in return_all_tests("ALL", "detect_fields") -} +tests, _ = build_tests_dicts(return_all_tests("ALL", "detect_fields")) def validate( diff --git a/tests/test_fields.py b/tests/test_fields.py index c1397645..6e833912 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -29,7 +29,6 @@ code_rna, code_waldec, csp_insee, - date_fr, insee_ape700, sexe, siren, @@ -37,7 +36,7 @@ tel_fr, uai, ) -from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee +from csv_detective.detect_fields.FR.temp import date_fr, jour_de_la_semaine, mois_de_annee from csv_detective.detect_fields.geo import ( iso_country_code_alpha2, iso_country_code_alpha3,