Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
## Current (in progress)

- Better email detection [#151](https://github.com/datagouv/csv-detective/pull/151)
- Sample can handle full NaN columns [#152](https://github.com/datagouv/csv-detective/pull/152)
- Sample can handle full NaN columns [#154](https://github.com/datagouv/csv-detective/pull/154)
- Add tests prioritization to prevent testing all formats [#155](https://github.com/datagouv/csv-detective/pull/155)

## 0.9.2 (2025-08-26)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from csv_detective.detect_fields.other.float import float_casting

PROPORTION = 0.9
PARENT = "float"

_latitudel93 = LatitudeL93()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from csv_detective.detect_fields.other.float import _is as is_float

PROPORTION = 0.9
PARENT = "latitude_wgs"


def _is(val):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from csv_detective.detect_fields.other.float import float_casting

PROPORTION = 0.9
PARENT = "float"

_longitudel93 = LongitudeL93()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from csv_detective.detect_fields.other.float import _is as is_float

PROPORTION = 0.9
PARENT = "longitude_wgs"


def _is(val):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

PROPORTION = 1
PARENT = "date"
regex = (
r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre"
r"|octobre|novembre|decembre)[ \-]\d{4}$"
Expand Down
3 changes: 1 addition & 2 deletions csv_detective/detect_fields/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,14 @@
code_rna,
code_waldec,
csp_insee,
date_fr,
insee_ape700,
sexe,
siren,
siret,
tel_fr,
uai,
)
from .FR.temp import jour_de_la_semaine, mois_de_annee
from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
from .geo import (
iso_country_code_alpha2,
iso_country_code_alpha3,
Expand Down
1 change: 1 addition & 0 deletions csv_detective/detect_fields/geo/json_geojson/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json

PROPORTION = 0.9
PARENT = "json"


def _is(val):
Expand Down
1 change: 1 addition & 0 deletions csv_detective/detect_fields/geo/latitude_wgs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from csv_detective.detect_fields.other.float import _is as is_float

PROPORTION = 0.9
PARENT = "float"


def _is(val):
Expand Down
1 change: 1 addition & 0 deletions csv_detective/detect_fields/geo/longitude_wgs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from csv_detective.detect_fields.other.float import _is as is_float

PROPORTION = 0.9
PARENT = "float"


def _is(val):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

PROPORTION = 1
PARENT = "datetime_aware"


def _is(val):
Expand Down
1 change: 1 addition & 0 deletions csv_detective/detect_fields/temp/year/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
PROPORTION = 1
PARENT = "int"


def _is(val):
Expand Down
3 changes: 1 addition & 2 deletions csv_detective/detect_labels/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,14 @@
code_rna,
code_waldec,
csp_insee,
date_fr,
insee_ape700,
sexe,
siren,
siret,
tel_fr,
uai,
)
from .FR.temp import jour_de_la_semaine, mois_de_annee
from .FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
from .geo import (
iso_country_code_alpha2,
iso_country_code_alpha3,
Expand Down
19 changes: 17 additions & 2 deletions csv_detective/load_tests.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
from types import ModuleType
from typing import Union

from csv_detective import detect_fields, detect_labels # noqa


def get_all_packages(detect_type) -> list:
def get_all_packages(detect_type) -> list[str]:
root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
modules = []
for dirpath, _, filenames in os.walk(root_dir):
Expand All @@ -20,7 +21,7 @@ def get_all_packages(detect_type) -> list:
def return_all_tests(
user_input_tests: Union[str, list],
detect_type: str,
) -> list:
) -> list[ModuleType]:
"""
returns all tests that have a method _is and are listed in the user_input_tests
the function can select a sub_package from csv_detective
Expand Down Expand Up @@ -51,3 +52,17 @@ def return_all_tests(
# to remove groups of tests
all_tests = [test for test in all_tests if "_is" in dir(test)]
return all_tests


def build_tests_dicts(tests: list[ModuleType]) -> tuple[dict[str, dict], dict[str, dict]]:
    """
    Builds the lookup tables used to run the format tests.

    Args:
        tests: test modules, each exposing an `_is` function and a `PROPORTION`
            constant, and optionally a `PARENT` constant naming a less specific test

    Returns:
        a tuple of two dicts, both keyed by test name (last component of the module path):
        - every test, as {"func": ..., "prop": ..., "parent": ...}
        - only the most specific tests, i.e. those that are not the parent of another test
    """
    names = [test.__name__.split(".")[-1] for test in tests]
    known = set(names)
    tests_dict = {}
    for name, test in zip(names, tests):
        parent = getattr(test, "PARENT", None)
        tests_dict[name] = {
            "func": test._is,
            "prop": test.PROPORTION,
            # a parent that is not part of the given tests (e.g. when only a sub-package
            # has been selected) cannot be looked up when walking up the hierarchy,
            # so the test is then considered as having no parent
            "parent": parent if parent in known else None,
        }
    parents = {attr["parent"] for attr in tests_dict.values() if attr["parent"] is not None}
    specific_tests = {name: attr for name, attr in tests_dict.items() if name not in parents}
    return tests_dict, specific_tests
76 changes: 54 additions & 22 deletions csv_detective/parsing/columns.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import logging
from collections import defaultdict
from time import time
from typing import Callable

import pandas as pd

from csv_detective.load_tests import build_tests_dicts
from csv_detective.utils import display_logs_depending_process_time

MAX_ROWS_ANALYSIS = int(1e4)
Expand Down Expand Up @@ -89,40 +91,70 @@ def test_col(
if verbose:
start = time()
logging.info("Testing columns to get types")
test_funcs = {
test.__name__.split(".")[-1]: {
"func": test._is,
"prop": test.PROPORTION,
}
for test in all_tests
}
return_table = pd.DataFrame(columns=table.columns)
for idx, (key, value) in enumerate(test_funcs.items()):
test_funcs, specific_tests = build_tests_dicts(all_tests)
results = defaultdict(dict)
nb_cols = len(table.columns)
for idx, column in enumerate(table.columns):
if verbose:
start_type = time()
logging.info(f"\t- Starting with type '{key}'")
# improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
# => the following needs to change, "apply" means all columns are tested for one type at once
return_table.loc[key] = table.apply(
lambda serie: test_col_val(
serie,
value["func"],
value["prop"],
start_col = time()
logging.info(f"\t- Starting with column '{column}' ({idx + 1}/{nb_cols})")
tested = set()
# testing for the most specific formats first (we have early stops in test_col_val)
for test_name, test_attr in specific_tests.items():
results[column][test_name] = test_col_val(
table[column],
test_attr["func"],
test_attr["prop"],
skipna=skipna,
limited_output=limited_output,
verbose=verbose,
)
)
tested.add(test_name)
# should we break if one of the specific tests is successful?
# performing less and less specific tests if specific ones fail
# starting with highest scores to set the parents from there
for test_name in reversed(
[
test
for test, _ in sorted(
(tup for tup in results[column].items()),
key=lambda tup: tup[1],
)
]
):
current = test_name
parent = test_funcs[current]["parent"]
while parent is not None:
if parent in results[column]:
# already tested as a parent of a previous test, no need to get higher parents
break
if results[column][current] > 0:
# if a child test is successful, we set the parent's score to the same value
# this is not perfect: the column can be 50% child but 100% parent
# we would have to perform the parent test to know exactly, but this saves much time
results[column][parent] = results[column][current]
else:
results[column][parent] = test_col_val(
table[column],
test_funcs[parent]["func"],
test_funcs[parent]["prop"],
skipna=skipna,
limited_output=limited_output,
verbose=verbose,
)
tested.add(parent)
current, parent = parent, test_funcs[parent]["parent"]
if verbose:
display_logs_depending_process_time(
f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
time() - start_type,
f'\t> Done with column "{column}" in {round(time() - start_col, 3)}s'
f", {len(tested)} tests performed",
time() - start_col,
)
if verbose:
display_logs_depending_process_time(
f"Done testing columns in {round(time() - start, 3)}s", time() - start
)
return return_table
return pd.DataFrame(results)


def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):
Expand Down
10 changes: 2 additions & 8 deletions csv_detective/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,13 @@

import pandas as pd

from csv_detective.load_tests import return_all_tests
from csv_detective.load_tests import build_tests_dicts, return_all_tests
from csv_detective.parsing.columns import test_col_val
from csv_detective.parsing.load import load_file

logging.basicConfig(level=logging.INFO)

tests = {
t.__name__.split(".")[-1]: {
"func": t._is,
"prop": t.PROPORTION,
}
for t in return_all_tests("ALL", "detect_fields")
}
tests, _ = build_tests_dicts(return_all_tests("ALL", "detect_fields"))


def validate(
Expand Down
3 changes: 1 addition & 2 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,14 @@
code_rna,
code_waldec,
csp_insee,
date_fr,
insee_ape700,
sexe,
siren,
siret,
tel_fr,
uai,
)
from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
from csv_detective.detect_fields.FR.temp import date_fr, jour_de_la_semaine, mois_de_annee
from csv_detective.detect_fields.geo import (
iso_country_code_alpha2,
iso_country_code_alpha3,
Expand Down