datagouv · Pierlou · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,8 @@
 ## Current (in progress)
 
 - Better email detection [#151](https://github.com/datagouv/csv-detective/pull/151)
-- Sample can handle full NaN columns [#152](https://github.com/datagouv/csv-detective/pull/152)
+- Sample can handle full NaN columns [#154](https://github.com/datagouv/csv-detective/pull/154)
+- Add tests prioritization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155)
 
 ## 0.9.2 (2025-08-26)
 

diff --git a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py
@@ -4,6 +4,7 @@
 from csv_detective.detect_fields.other.float import float_casting
 
 PROPORTION = 0.9
+PARENT = "float"
 
 _latitudel93 = LatitudeL93()
 

diff --git a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py
@@ -1,6 +1,7 @@
 from csv_detective.detect_fields.other.float import _is as is_float
 
 PROPORTION = 0.9
+PARENT = "latitude_wgs"
 
 
 def _is(val):

diff --git a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py
@@ -4,6 +4,7 @@
 from csv_detective.detect_fields.other.float import float_casting
 
 PROPORTION = 0.9
+PARENT = "float"
 
 _longitudel93 = LongitudeL93()
 

diff --git a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py
@@ -1,6 +1,7 @@
 from csv_detective.detect_fields.other.float import _is as is_float
 
 PROPORTION = 0.9
+PARENT = "longitude_wgs"
 
 
 def _is(val):

diff --git a/...etect_fields/FR/other/date_fr/__init__.py → ...detect_fields/FR/temp/date_fr/__init__.py b/...etect_fields/FR/other/date_fr/__init__.py → ...detect_fields/FR/temp/date_fr/__init__.py
@@ -1,6 +1,7 @@
 import re
 
 PROPORTION = 1
+PARENT = "date"
 regex = (
     r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
     r"|octobre|novembre|decembre)[ \-]\d{4}$"

diff --git a/csv_detective/detect_fields/__init__.py b/csv_detective/detect_fields/__init__.py
@@ -21,15 +21,14 @@
     code_rna,
     code_waldec,
     csp_insee,
-    date_fr,
     insee_ape700,
     sexe,
     siren,
     siret,
     tel_fr,
     uai,
 )
-from .FR.temp import jour_de_la_semaine, mois_de_annee
+from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
 from .geo import (
     iso_country_code_alpha2,
     iso_country_code_alpha3,

diff --git a/csv_detective/detect_fields/geo/json_geojson/__init__.py b/csv_detective/detect_fields/geo/json_geojson/__init__.py
@@ -1,6 +1,7 @@
 import json
 
 PROPORTION = 0.9
+PARENT = "json"
 
 
 def _is(val):

diff --git a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py
@@ -1,6 +1,7 @@
 from csv_detective.detect_fields.other.float import _is as is_float
 
 PROPORTION = 0.9
+PARENT = "float"
 
 
 def _is(val):

diff --git a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py
@@ -1,6 +1,7 @@
 from csv_detective.detect_fields.other.float import _is as is_float
 
 PROPORTION = 0.9
+PARENT = "float"
 
 
 def _is(val):

diff --git a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py
@@ -1,6 +1,7 @@
 import re
 
 PROPORTION = 1
+PARENT = "datetime_aware"
 
 
 def _is(val):

diff --git a/csv_detective/detect_fields/temp/year/__init__.py b/csv_detective/detect_fields/temp/year/__init__.py
@@ -1,4 +1,5 @@
 PROPORTION = 1
+PARENT = "int"
 
 
 def _is(val):

diff --git a/...etect_labels/FR/other/date_fr/__init__.py → ...detect_labels/FR/temp/date_fr/__init__.py b/...etect_labels/FR/other/date_fr/__init__.py → ...detect_labels/FR/temp/date_fr/__init__.py
diff --git a/csv_detective/detect_labels/__init__.py b/csv_detective/detect_labels/__init__.py
@@ -20,15 +20,14 @@
     code_rna,
     code_waldec,
     csp_insee,
-    date_fr,
     insee_ape700,
     sexe,
     siren,
     siret,
     tel_fr,
     uai,
 )
-from .FR.temp import jour_de_la_semaine, mois_de_annee
+from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
 from .geo import (
     iso_country_code_alpha2,
     iso_country_code_alpha3,

diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py
@@ -1,10 +1,11 @@
 import os
+from types import ModuleType
 from typing import Union
 
 from csv_detective import detect_fields, detect_labels  # noqa
 
 
-def get_all_packages(detect_type) -> list:
+def get_all_packages(detect_type) -> list[str]:
     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
     modules = []
     for dirpath, _, filenames in os.walk(root_dir):
@@ -20,7 +21,7 @@ def get_all_packages(detect_type) -> list:
 def return_all_tests(
     user_input_tests: Union[str, list],
     detect_type: str,
-) -> list:
+) -> list[ModuleType]:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -51,3 +52,17 @@ def return_all_tests(
     # to remove groups of tests
     all_tests = [test for test in all_tests if "_is" in dir(test)]
     return all_tests
+
+
+def build_tests_dicts(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]:
+    tests_dict = {  # maps each test name to its {"func", "prop", "parent"} attributes
+        test.__name__.split(".")[-1]: {  # key is the last dotted component of the module name
+            "func": test._is,
+            "prop": test.PROPORTION,
+            "parent": getattr(test, "PARENT", None),  # None when the module defines no PARENT
+        }
+        for test in tests
+    }
+    parents = {v["parent"] for v in tests_dict.values() if v["parent"] is not None}
+    specific_tests = {k: v for k, v in tests_dict.items() if k not in parents}
+    return tests_dict, specific_tests  # (all tests, tests that no other test names as parent)
diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py
@@ -1,9 +1,11 @@
 import logging
+from collections import defaultdict
 from time import time
 from typing import Callable
 
 import pandas as pd
 
+from csv_detective.load_tests import build_tests_dicts
 from csv_detective.utils import display_logs_depending_process_time
 
 MAX_ROWS_ANALYSIS = int(1e4)
@@ -89,40 +91,70 @@ def test_col(
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
-    test_funcs = {
-        test.__name__.split(".")[-1]: {
-            "func": test._is,
-            "prop": test.PROPORTION,
-        }
-        for test in all_tests
-    }
-    return_table = pd.DataFrame(columns=table.columns)
-    for idx, (key, value) in enumerate(test_funcs.items()):
+    test_funcs, specific_tests = build_tests_dicts(all_tests)
+    results = defaultdict(dict)
+    nb_cols = len(table.columns)
+    for idx, column in enumerate(table.columns):
         if verbose:
-            start_type = time()
-            logging.info(f"\t- Starting with type '{key}'")
-        # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
-        # => the following needs to change, "apply" means all columns are tested for one type at once
-        return_table.loc[key] = table.apply(
-            lambda serie: test_col_val(
-                serie,
-                value["func"],
-                value["prop"],
+            start_col = time()
+            logging.info(f"\t- Starting with column '{column}' ({idx + 1}/{nb_cols})")
+        tested = set()
+        # testing for the most specific formats first (we have early stops in test_col_val)
+        for test_name, test_attr in specific_tests.items():
+            results[column][test_name] = test_col_val(
+                table[column],
+                test_attr["func"],
+                test_attr["prop"],
                 skipna=skipna,
                 limited_output=limited_output,
                 verbose=verbose,
             )
-        )
+            tested.add(test_name)
+            # should we break if one of the specific tests is successful?
+        # performing less and less specific tests if specific ones fail
+        # starting with highest scores to set the parents from there
+        for test_name in reversed(
+            [
+                test
+                for test, _ in sorted(
+                    (tup for tup in results[column].items()),
+                    key=lambda tup: tup[1],
+                )
+            ]
+        ):
+            current = test_name
+            parent = test_funcs[current]["parent"]
+            while parent is not None:
+                if parent in results[column]:
+                    # already tested as a parent of a previous test, no need to get higher parents
+                    break
+                if results[column][current] > 0:
+                    # if a child test is successful, we set the parent's score to the same value
+                    # this is not perfect: the column can be 50% child but 100% parent
+                    # we would have to perform the parent test to know exactly, but this saves much time
+                    results[column][parent] = results[column][current]
+                else:
+                    results[column][parent] = test_col_val(
+                        table[column],
+                        test_funcs[parent]["func"],
+                        test_funcs[parent]["prop"],
+                        skipna=skipna,
+                        limited_output=limited_output,
+                        verbose=verbose,
+                    )
+                    tested.add(parent)
+                current, parent = parent, test_funcs[parent]["parent"]
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
-                time() - start_type,
+                f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s'
+                f", {len(tested)} tests performed",
+                time() - start_col,
             )
     if verbose:
         display_logs_depending_process_time(
             f"Done testing columns in {round(time() - start, 3)}s", time() - start
         )
-    return return_table
+    return pd.DataFrame(results)
 
 
 def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):

diff --git a/csv_detective/validate.py b/csv_detective/validate.py
@@ -3,19 +3,13 @@
 
 import pandas as pd
 
-from csv_detective.load_tests import return_all_tests
+from csv_detective.load_tests import build_tests_dicts, return_all_tests
 from csv_detective.parsing.columns import test_col_val
 from csv_detective.parsing.load import load_file
 
 logging.basicConfig(level=logging.INFO)
 
-tests = {
-    t.__name__.split(".")[-1]: {
-        "func": t._is,
-        "prop": t.PROPORTION,
-    }
-    for t in return_all_tests("ALL", "detect_fields")
-}
+tests, _ = build_tests_dicts(return_all_tests("ALL", "detect_fields"))
 
 
 def validate(

diff --git a/tests/test_fields.py b/tests/test_fields.py
@@ -29,15 +29,14 @@
     code_rna,
     code_waldec,
     csp_insee,
-    date_fr,
     insee_ape700,
     sexe,
     siren,
     siret,
     tel_fr,
     uai,
 )
-from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
+from csv_detective.detect_fields.FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
 from csv_detective.detect_fields.geo import (
     iso_country_code_alpha2,
     iso_country_code_alpha3,