From e88d8045568d5ee10f67e771d7733f7f334004c5 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Tue, 18 Nov 2025 16:48:51 +0100 Subject: [PATCH 01/21] draft:raft: POC of new structure --- csv_detective/format.py | 50 +++++++++++++++++++++++++ csv_detective/formats/__init__.py | 9 +++++ csv_detective/formats/int.py | 16 ++++++++ csv_detective/formats/mois_de_lannee.py | 40 ++++++++++++++++++++ csv_detective/formats/siret.py | 43 +++++++++++++++++++++ 5 files changed, 158 insertions(+) create mode 100755 csv_detective/format.py create mode 100755 csv_detective/formats/__init__.py create mode 100755 csv_detective/formats/int.py create mode 100755 csv_detective/formats/mois_de_lannee.py create mode 100755 csv_detective/formats/siret.py diff --git a/csv_detective/format.py b/csv_detective/format.py new file mode 100755 index 00000000..2b5a2db5 --- /dev/null +++ b/csv_detective/format.py @@ -0,0 +1,50 @@ +from typing import Callable + +from .parsing.text import header_score + + +class Format: + def __init__( + self, + name: str, + func: Callable, + labels: list[str] = [], + proportion: float = 1, + tags: list[str] = [], + ) -> None: + self.name = name + self.func = func + self.labels = labels + self.proportion: float = proportion + self.tags = tags + + def is_valid_value(self, val: str) -> bool: + return self.func(val) + + def is_valid_label(self, val: str) -> float: + return header_score(val, self.labels) + + +class FormatsManager: + def __init__(self) -> None: + import csv_detective.formats as formats + format_labels = [ + f for f in dir(formats) + if "_is" in dir(getattr(formats, f)) + ] + assert len(format_labels) == len(set(format_labels)), "Format labels must be unique" + self.formats = [ + Format( + name=label, + func=(module := getattr(formats, label))._is, + **{ + attr: val + for attr in ["labels", "proportion", "tags"] + if (val := getattr(module, attr, None)) + }, + ) + for label in format_labels + ] + + def get_formats_from_tags(self, tags: list[str]) -> 
list[Format]: + return [f for f in self.formats if all(tag in f.tags for tag in tags)] diff --git a/csv_detective/formats/__init__.py b/csv_detective/formats/__init__.py new file mode 100755 index 00000000..105e3b43 --- /dev/null +++ b/csv_detective/formats/__init__.py @@ -0,0 +1,9 @@ +import os +import importlib + +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith(".py") and not file.startswith("_"): + module_name = file[:-3] + module = importlib.import_module(f"csv_detective.formats.{module_name}") + globals()[module_name] = module + del module diff --git a/csv_detective/formats/int.py b/csv_detective/formats/int.py new file mode 100755 index 00000000..9d9ec7b0 --- /dev/null +++ b/csv_detective/formats/int.py @@ -0,0 +1,16 @@ +labels = ["nb", "nombre", "nbre"] + + +def _is(val): + """Detects integers""" + if ( + not isinstance(val, str) + or any([v in val for v in [".", "_", "+"]]) + or (val.startswith("0") and len(val) > 1) + ): + return False + try: + int(val) + return True + except ValueError: + return False diff --git a/csv_detective/formats/mois_de_lannee.py b/csv_detective/formats/mois_de_lannee.py new file mode 100755 index 00000000..334f8bb2 --- /dev/null +++ b/csv_detective/formats/mois_de_lannee.py @@ -0,0 +1,40 @@ +from unidecode import unidecode + +proportion = 1 +labels = ["fr", "temp"] +mois = { + "janvier", + "fevrier", + "mars", + "avril", + "mai", + "juin", + "juillet", + "aout", + "septembre", + "octobre", + "novembre", + "decembre", + "jan", + "fev", + "mar", + "avr", + "mai", + "jun", + "jui", + "juil", + "aou", + "sep", + "sept", + "oct", + "nov", + "dec", +} + + +def _is(val): + """Renvoie True si les champs peuvent être des mois de l'année""" + if not isinstance(val, str): + return False + val = unidecode(val.lower()) + return val in mois diff --git a/csv_detective/formats/siret.py b/csv_detective/formats/siret.py new file mode 100755 index 00000000..b99cc381 --- /dev/null +++ b/csv_detective/formats/siret.py @@ -0,0 
+1,43 @@ +import re + +proportion = 0.8 + +labels = [ + "siret", + "siret d", + "num siret", + "siretacheteur", + "n° siret", + "coll siret", + "epci", +] + +tags = ["fr"] + + +def _is(val): + """Détection des identifiants SIRET (SIRENE)""" + if not isinstance(val, str): + return False + val = val.replace(" ", "") + if not bool(re.match(r"^[0-9]{14}$", val)): + return False + + # Vérification par clé de luhn du SIREN + cle = 0 + pair = False + for x in val[:9]: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + if cle % 10 != 0: + return cle % 10 == 0 + + # Vérification par clé de luhn du SIRET + cle = 0 + pair = len(val) % 2 == 0 + for x in val: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + return cle % 10 == 0 From 254f73cba3cadc422ba727ff0969df5af6a3fc32 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 28 Nov 2025 18:13:36 +0100 Subject: [PATCH 02/21] refactor: abstract all formats and adapt code --- csv_detective/detect_fields/FR/README.md | 1 - csv_detective/detect_fields/FR/__init__.py | 0 .../detect_fields/FR/geo/__init__.py | 0 .../FR/geo/code_commune_insee/__init__.py | 9 - .../FR/geo/code_fantoir/__init__.py | 9 - .../FR/geo/code_postal/__init__.py | 9 - .../FR/geo/code_region/__init__.py | 10 - .../FR/geo/departement/__init__.py | 16 - .../FR/geo/latitude_l93/__init__.py | 19 - .../geo/latitude_wgs_fr_metropole/__init__.py | 13 - .../FR/geo/longitude_l93/__init__.py | 19 - .../longitude_wgs_fr_metropole/__init__.py | 13 - .../detect_fields/FR/geo/pays/__init__.py | 16 - .../detect_fields/FR/other/__init__.py | 0 .../other/code_csp_insee/code_csp_insee.txt | 498 ------------------ .../FR/other/code_rna/__init__.py | 9 - .../FR/other/code_waldec/__init__.py | 9 - .../FR/other/date_fr/__init__.py | 12 - .../FR/other/insee_ape700/__init__.py | 19 - .../detect_fields/FR/other/siret/__init__.py | 31 -- .../detect_fields/FR/other/tel_fr/__init__.py | 17 - .../detect_fields/FR/other/uai/__init__.py | 15 - 
.../detect_fields/FR/temp/__init__.py | 0 .../FR/temp/jour_de_la_semaine/__init__.py | 25 - .../FR/temp/mois_de_annee/__init__.py | 39 -- csv_detective/detect_fields/README.md | 5 - csv_detective/detect_fields/__init__.py | 112 ---- csv_detective/detect_fields/geo/README.md | 1 - csv_detective/detect_fields/geo/__init__.py | 0 .../geo/iso_country_code_alpha2/__init__.py | 15 - .../geo/iso_country_code_alpha3/__init__.py | 14 - .../geo/iso_country_code_numeric/__init__.py | 15 - .../geo/json_geojson/__init__.py | 18 - .../geo/latitude_wgs/__init__.py | 13 - .../detect_fields/geo/latlon_wgs/__init__.py | 16 - .../geo/longitude_wgs/__init__.py | 13 - .../detect_fields/geo/lonlat_wgs/__init__.py | 16 - csv_detective/detect_fields/other/__init__.py | 0 .../detect_fields/other/email/__init__.py | 10 - .../detect_fields/other/int/__init__.py | 16 - .../detect_fields/other/money/__init__.py | 11 - .../other/mongo_object_id/__init__.py | 8 - .../detect_fields/other/percent/__init__.py | 9 - .../detect_fields/other/twitter/__init__.py | 8 - .../detect_fields/other/url/__init__.py | 14 - .../detect_fields/other/uuid/__init__.py | 10 - csv_detective/detect_fields/temp/README.md | 1 - csv_detective/detect_fields/temp/__init__.py | 0 .../temp/datetime_rfc822/__init__.py | 18 - .../detect_fields/temp/year/__init__.py | 10 - csv_detective/detect_labels/FR/__init__.py | 0 .../detect_labels/FR/geo/__init__.py | 0 .../detect_labels/FR/geo/adresse/__init__.py | 15 - .../FR/geo/code_commune_insee/__init__.py | 17 - .../FR/geo/code_departement/__init__.py | 15 - .../FR/geo/code_fantoir/__init__.py | 12 - .../FR/geo/code_postal/__init__.py | 16 - .../FR/geo/code_region/__init__.py | 14 - .../detect_labels/FR/geo/commune/__init__.py | 12 - .../FR/geo/departement/__init__.py | 22 - .../FR/geo/insee_canton/__init__.py | 13 - .../FR/geo/latitude_l93/__init__.py | 30 -- .../FR/geo/longitude_l93/__init__.py | 21 - .../longitude_wgs_fr_metropole/__init__.py | 21 - 
.../detect_labels/FR/geo/pays/__init__.py | 20 - .../detect_labels/FR/geo/region/__init__.py | 20 - .../detect_labels/FR/other/__init__.py | 0 .../FR/other/code_csp_insee/__init__.py | 8 - .../FR/other/code_rna/__init__.py | 13 - .../FR/other/code_waldec/__init__.py | 8 - .../FR/other/csp_insee/__init__.py | 13 - .../FR/other/date_fr/__init__.py | 9 - .../FR/other/insee_ape700/__init__.py | 15 - .../detect_labels/FR/other/sexe/__init__.py | 8 - .../detect_labels/FR/other/siren/__init__.py | 17 - .../detect_labels/FR/other/siret/__init__.py | 16 - .../detect_labels/FR/other/tel_fr/__init__.py | 20 - .../detect_labels/FR/temp/__init__.py | 0 .../FR/temp/jour_de_la_semaine/__init__.py | 16 - .../FR/temp/mois_de_annee/__init__.py | 8 - csv_detective/detect_labels/__init__.py | 94 ---- csv_detective/detect_labels/geo/__init__.py | 0 .../geo/iso_country_code_alpha2/__init__.py | 16 - .../geo/iso_country_code_alpha3/__init__.py | 16 - .../geo/iso_country_code_numeric/__init__.py | 16 - .../geo/json_geojson/__init__.py | 17 - .../detect_labels/geo/latlon_wgs/__init__.py | 39 -- .../geo/longitude_wgs/__init__.py | 21 - .../detect_labels/geo/lonlat_wgs/__init__.py | 23 - csv_detective/detect_labels/other/__init__.py | 0 .../detect_labels/other/booleen/__init__.py | 8 - .../detect_labels/other/email/__init__.py | 20 - .../detect_labels/other/float/__init__.py | 8 - .../detect_labels/other/int/__init__.py | 8 - .../detect_labels/other/money/__init__.py | 8 - .../other/mongo_object_id/__init__.py | 8 - .../detect_labels/other/twitter/__init__.py | 8 - .../detect_labels/other/url/__init__.py | 23 - .../detect_labels/other/uuid/__init__.py | 8 - csv_detective/detect_labels/temp/__init__.py | 0 .../detect_labels/temp/date/__init__.py | 28 - .../temp/datetime_rfc822/__init__.py | 19 - .../detect_labels/temp/year/__init__.py | 19 - csv_detective/detection/formats.py | 24 +- csv_detective/explore_csv.py | 10 +- csv_detective/format.py | 34 +- .../__init__.py => formats/adresse.py} | 
216 ++++---- .../__init__.py => formats/booleen.py} | 61 ++- csv_detective/formats/code_commune_insee.py | 26 + .../__init__.py => formats/code_csp_insee.py} | 66 ++- .../code_departement.py} | 44 +- csv_detective/formats/code_fantoir.py | 21 + .../__init__.py => formats/code_import.py} | 27 +- csv_detective/formats/code_postal.py | 25 + csv_detective/formats/code_region.py | 22 + csv_detective/formats/code_rna.py | 29 + csv_detective/formats/code_waldec.py | 18 + .../__init__.py => formats/commune.py} | 43 +- .../__init__.py => formats/csp_insee.py} | 50 +- .../csp_insee => formats/data}/csp_insee.txt | 0 .../data}/insee_ape700.txt | 0 .../data}/iso_country_code_alpha2.txt | 0 .../data}/iso_country_code_alpha3.txt | 0 .../data}/iso_country_code_numeric.txt | 0 .../temp/date/__init__.py => formats/date.py} | 165 +++--- csv_detective/formats/date_fr.py | 21 + .../__init__.py => formats/datetime_aware.py} | 26 +- .../__init__.py => formats/datetime_naive.py} | 24 +- csv_detective/formats/datetime_rfc822.py | 34 ++ csv_detective/formats/departement.py | 37 ++ csv_detective/formats/email.py | 29 + .../float/__init__.py => formats/float.py} | 50 +- csv_detective/formats/geojson.py | 36 ++ csv_detective/formats/insee_ape700.py | 31 ++ .../__init__.py => formats/insee_canton.py} | 43 +- csv_detective/formats/int.py | 6 + .../formats/iso_country_code_alpha2.py | 30 ++ .../formats/iso_country_code_alpha3.py | 30 ++ .../formats/iso_country_code_numeric.py | 31 ++ csv_detective/formats/jour_de_la_semaine.py | 43 ++ .../json/__init__.py => formats/json.py} | 36 +- csv_detective/formats/latitude_l93.py | 48 ++ .../__init__.py => formats/latitude_wgs.py} | 72 +-- .../latitude_wgs_fr_metropole.py} | 72 +-- csv_detective/formats/latlon_wgs.py | 53 ++ csv_detective/formats/longitude_l93.py | 39 ++ csv_detective/formats/longitude_wgs.py | 32 ++ .../formats/longitude_wgs_fr_metropole.py | 32 ++ csv_detective/formats/lonlat_wgs.py | 36 ++ csv_detective/formats/mois_de_lannee.py | 10 
+- csv_detective/formats/money.py | 19 + csv_detective/formats/mongo_object_id.py | 15 + csv_detective/formats/pays.py | 35 ++ csv_detective/formats/percent.py | 18 + .../region/__init__.py => formats/region.py} | 120 +++-- .../sexe/__init__.py => formats/sexe.py} | 30 +- .../siren/__init__.py => formats/siren.py} | 57 +- csv_detective/formats/siret.py | 10 +- csv_detective/formats/tel_fr.py | 36 ++ .../other/uai/__init__.py => formats/uai.py} | 61 ++- csv_detective/formats/url.py | 46 ++ csv_detective/formats/username.py | 16 + csv_detective/formats/uuid.py | 18 + csv_detective/formats/year.py | 28 + csv_detective/load_tests.py | 59 --- csv_detective/output/dataframe.py | 6 +- csv_detective/output/profile.py | 5 +- csv_detective/parsing/columns.py | 80 ++- csv_detective/parsing/csv.py | 4 +- csv_detective/validate.py | 8 +- 170 files changed, 1799 insertions(+), 2621 deletions(-) delete mode 100644 csv_detective/detect_fields/FR/README.md delete mode 100644 csv_detective/detect_fields/FR/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/code_postal/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/code_region/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/departement/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py delete mode 100644 csv_detective/detect_fields/FR/geo/pays/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/__init__.py delete mode 100644 
csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt delete mode 100644 csv_detective/detect_fields/FR/other/code_rna/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/code_waldec/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/date_fr/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/insee_ape700/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/siret/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/tel_fr/__init__.py delete mode 100644 csv_detective/detect_fields/FR/other/uai/__init__.py delete mode 100644 csv_detective/detect_fields/FR/temp/__init__.py delete mode 100644 csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py delete mode 100644 csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py delete mode 100644 csv_detective/detect_fields/README.md delete mode 100644 csv_detective/detect_fields/__init__.py delete mode 100644 csv_detective/detect_fields/geo/README.md delete mode 100644 csv_detective/detect_fields/geo/__init__.py delete mode 100644 csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py delete mode 100644 csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py delete mode 100644 csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py delete mode 100644 csv_detective/detect_fields/geo/json_geojson/__init__.py delete mode 100644 csv_detective/detect_fields/geo/latitude_wgs/__init__.py delete mode 100644 csv_detective/detect_fields/geo/latlon_wgs/__init__.py delete mode 100644 csv_detective/detect_fields/geo/longitude_wgs/__init__.py delete mode 100644 csv_detective/detect_fields/geo/lonlat_wgs/__init__.py delete mode 100644 csv_detective/detect_fields/other/__init__.py delete mode 100644 csv_detective/detect_fields/other/email/__init__.py delete mode 100644 csv_detective/detect_fields/other/int/__init__.py delete mode 100644 csv_detective/detect_fields/other/money/__init__.py 
delete mode 100644 csv_detective/detect_fields/other/mongo_object_id/__init__.py delete mode 100644 csv_detective/detect_fields/other/percent/__init__.py delete mode 100644 csv_detective/detect_fields/other/twitter/__init__.py delete mode 100644 csv_detective/detect_fields/other/url/__init__.py delete mode 100644 csv_detective/detect_fields/other/uuid/__init__.py delete mode 100644 csv_detective/detect_fields/temp/README.md delete mode 100644 csv_detective/detect_fields/temp/__init__.py delete mode 100644 csv_detective/detect_fields/temp/datetime_rfc822/__init__.py delete mode 100644 csv_detective/detect_fields/temp/year/__init__.py delete mode 100644 csv_detective/detect_labels/FR/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/adresse/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/code_departement/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/code_postal/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/code_region/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/commune/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/departement/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/insee_canton/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/pays/__init__.py delete mode 100644 csv_detective/detect_labels/FR/geo/region/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/__init__.py delete mode 100644 
csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/code_rna/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/code_waldec/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/csp_insee/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/date_fr/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/insee_ape700/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/sexe/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/siren/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/siret/__init__.py delete mode 100644 csv_detective/detect_labels/FR/other/tel_fr/__init__.py delete mode 100644 csv_detective/detect_labels/FR/temp/__init__.py delete mode 100644 csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py delete mode 100644 csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py delete mode 100644 csv_detective/detect_labels/__init__.py delete mode 100644 csv_detective/detect_labels/geo/__init__.py delete mode 100644 csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py delete mode 100644 csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py delete mode 100644 csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py delete mode 100644 csv_detective/detect_labels/geo/json_geojson/__init__.py delete mode 100644 csv_detective/detect_labels/geo/latlon_wgs/__init__.py delete mode 100644 csv_detective/detect_labels/geo/longitude_wgs/__init__.py delete mode 100644 csv_detective/detect_labels/geo/lonlat_wgs/__init__.py delete mode 100644 csv_detective/detect_labels/other/__init__.py delete mode 100644 csv_detective/detect_labels/other/booleen/__init__.py delete mode 100644 csv_detective/detect_labels/other/email/__init__.py delete mode 100644 csv_detective/detect_labels/other/float/__init__.py delete mode 100644 
csv_detective/detect_labels/other/int/__init__.py delete mode 100644 csv_detective/detect_labels/other/money/__init__.py delete mode 100644 csv_detective/detect_labels/other/mongo_object_id/__init__.py delete mode 100644 csv_detective/detect_labels/other/twitter/__init__.py delete mode 100644 csv_detective/detect_labels/other/url/__init__.py delete mode 100644 csv_detective/detect_labels/other/uuid/__init__.py delete mode 100644 csv_detective/detect_labels/temp/__init__.py delete mode 100644 csv_detective/detect_labels/temp/date/__init__.py delete mode 100644 csv_detective/detect_labels/temp/datetime_rfc822/__init__.py delete mode 100644 csv_detective/detect_labels/temp/year/__init__.py rename csv_detective/{detect_fields/FR/geo/adresse/__init__.py => formats/adresse.py} (79%) mode change 100644 => 100755 rename csv_detective/{detect_fields/other/booleen/__init__.py => formats/booleen.py} (66%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/code_commune_insee.py rename csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py => formats/code_csp_insee.py} (73%) mode change 100644 => 100755 rename csv_detective/{detect_fields/FR/geo/code_departement/__init__.py => formats/code_departement.py} (55%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/code_fantoir.py rename csv_detective/{detect_fields/FR/other/code_import/__init__.py => formats/code_import.py} (51%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/code_postal.py create mode 100755 csv_detective/formats/code_region.py create mode 100755 csv_detective/formats/code_rna.py create mode 100755 csv_detective/formats/code_waldec.py rename csv_detective/{detect_fields/FR/geo/commune/__init__.py => formats/commune.py} (58%) mode change 100644 => 100755 rename csv_detective/{detect_fields/FR/other/csp_insee/__init__.py => formats/csp_insee.py} (54%) mode change 100644 => 100755 rename csv_detective/{detect_fields/FR/other/csp_insee => 
formats/data}/csp_insee.txt (100%) rename csv_detective/{detect_fields/FR/other/insee_ape700 => formats/data}/insee_ape700.txt (100%) mode change 100644 => 100755 rename csv_detective/{detect_fields/geo/iso_country_code_alpha2 => formats/data}/iso_country_code_alpha2.txt (100%) rename csv_detective/{detect_fields/geo/iso_country_code_alpha3 => formats/data}/iso_country_code_alpha3.txt (100%) rename csv_detective/{detect_fields/geo/iso_country_code_numeric => formats/data}/iso_country_code_numeric.txt (100%) rename csv_detective/{detect_fields/temp/date/__init__.py => formats/date.py} (66%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/date_fr.py rename csv_detective/{detect_fields/temp/datetime_aware/__init__.py => formats/datetime_aware.py} (63%) rename csv_detective/{detect_fields/temp/datetime_naive/__init__.py => formats/datetime_naive.py} (63%) create mode 100755 csv_detective/formats/datetime_rfc822.py create mode 100755 csv_detective/formats/departement.py create mode 100755 csv_detective/formats/email.py rename csv_detective/{detect_fields/other/float/__init__.py => formats/float.py} (67%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/geojson.py create mode 100755 csv_detective/formats/insee_ape700.py rename csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py => formats/insee_canton.py} (57%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/iso_country_code_alpha2.py create mode 100755 csv_detective/formats/iso_country_code_alpha3.py create mode 100755 csv_detective/formats/iso_country_code_numeric.py create mode 100755 csv_detective/formats/jour_de_la_semaine.py rename csv_detective/{detect_fields/other/json/__init__.py => formats/json.py} (58%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/latitude_l93.py rename csv_detective/{detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py => formats/latitude_wgs.py} (55%) mode change 100644 => 100755 rename 
csv_detective/{detect_labels/geo/latitude_wgs/__init__.py => formats/latitude_wgs_fr_metropole.py} (54%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/latlon_wgs.py create mode 100755 csv_detective/formats/longitude_l93.py create mode 100755 csv_detective/formats/longitude_wgs.py create mode 100755 csv_detective/formats/longitude_wgs_fr_metropole.py create mode 100755 csv_detective/formats/lonlat_wgs.py create mode 100755 csv_detective/formats/money.py create mode 100755 csv_detective/formats/mongo_object_id.py create mode 100755 csv_detective/formats/pays.py create mode 100755 csv_detective/formats/percent.py rename csv_detective/{detect_fields/FR/geo/region/__init__.py => formats/region.py} (69%) mode change 100644 => 100755 rename csv_detective/{detect_fields/FR/other/sexe/__init__.py => formats/sexe.py} (55%) mode change 100644 => 100755 rename csv_detective/{detect_fields/FR/other/siren/__init__.py => formats/siren.py} (53%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/tel_fr.py rename csv_detective/{detect_labels/FR/other/uai/__init__.py => formats/uai.py} (51%) mode change 100644 => 100755 create mode 100755 csv_detective/formats/url.py create mode 100755 csv_detective/formats/username.py create mode 100755 csv_detective/formats/uuid.py create mode 100755 csv_detective/formats/year.py delete mode 100755 csv_detective/load_tests.py diff --git a/csv_detective/detect_fields/FR/README.md b/csv_detective/detect_fields/FR/README.md deleted file mode 100644 index 588ffa52..00000000 --- a/csv_detective/detect_fields/FR/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for French standards. 
diff --git a/csv_detective/detect_fields/FR/__init__.py b/csv_detective/detect_fields/FR/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/geo/__init__.py b/csv_detective/detect_fields/FR/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py b/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py deleted file mode 100644 index 7975c9a2..00000000 --- a/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeCommuneInsee, Millesime - -PROPORTION = 0.75 - -_code_commune_insee = CodeCommuneInsee(Millesime.LATEST) - - -def _is(val): - return _code_commune_insee.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py b/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py deleted file mode 100644 index 31be7a03..00000000 --- a/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeFantoir - -PROPORTION = 1 - -_code_fantoir = CodeFantoir() - - -def _is(val): - return isinstance(val, str) and _code_fantoir.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_postal/__init__.py b/csv_detective/detect_fields/FR/geo/code_postal/__init__.py deleted file mode 100644 index fdaf9590..00000000 --- a/csv_detective/detect_fields/FR/geo/code_postal/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodePostal - -PROPORTION = 0.9 - -_code_postal = CodePostal() - - -def _is(val): - return _code_postal.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_region/__init__.py b/csv_detective/detect_fields/FR/geo/code_region/__init__.py deleted file mode 100644 index 3a7c20dd..00000000 --- a/csv_detective/detect_fields/FR/geo/code_region/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from frformat import CodeRegion, Millesime - 
-PROPORTION = 1 - -_code_region = CodeRegion(Millesime.LATEST) - - -def _is(val): - """Renvoie True si val peut être un code_région, False sinon""" - return isinstance(val, str) and _code_region.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/departement/__init__.py b/csv_detective/detect_fields/FR/geo/departement/__init__.py deleted file mode 100644 index 9df01681..00000000 --- a/csv_detective/detect_fields/FR/geo/departement/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from frformat import Departement, Millesime, Options - -PROPORTION = 0.9 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_departement = Departement(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des departements""" - return isinstance(val, str) and _departement.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py deleted file mode 100644 index 445ee164..00000000 --- a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from frformat import LatitudeL93 - -from csv_detective.detect_fields.other.float import _is as is_float -from csv_detective.detect_fields.other.float import float_casting - -PROPORTION = 1 - -_latitudel93 = LatitudeL93() - - -def _is(val): - try: - if isinstance(val, str) and is_float(val): - return _latitudel93.is_valid(float_casting(val)) - - return False - - except (ValueError, OverflowError): - return False diff --git a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index 9608e74b..00000000 --- a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - 
-def _is(val): - """Renvoie True si val peut etre une latitude en métropole""" - try: - return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py deleted file mode 100644 index dc1baf22..00000000 --- a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from frformat import LongitudeL93 - -from csv_detective.detect_fields.other.float import _is as is_float -from csv_detective.detect_fields.other.float import float_casting - -PROPORTION = 1 - -_longitudel93 = LongitudeL93() - - -def _is(val): - try: - if isinstance(val, str) and is_float(val): - return _longitudel93.is_valid(float_casting(val)) - - return False - - except (ValueError, OverflowError): - return False diff --git a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index 8684398e..00000000 --- a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude en métropole""" - try: - return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/FR/geo/pays/__init__.py b/csv_detective/detect_fields/FR/geo/pays/__init__.py deleted file mode 100644 index 637f630c..00000000 --- a/csv_detective/detect_fields/FR/geo/pays/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from frformat import Millesime, Options, Pays - -PROPORTION = 0.6 - -_options = Options( - ignore_case=True, - ignore_accents=True, - 
replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_pays = Pays(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des pays""" - return isinstance(val, str) and _pays.is_valid(val) diff --git a/csv_detective/detect_fields/FR/other/__init__.py b/csv_detective/detect_fields/FR/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt b/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt deleted file mode 100644 index 9bd8128e..00000000 --- a/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +++ /dev/null @@ -1,498 +0,0 @@ -111a -111b -111c -111d -111e -111f -121a -121b -121c -121d -121e -121f -122a -122b -122c -131a -131b -131c -131d -131e -131f -211a -211b -211c -211d -211e -211f -211g -211h -211j -212a -212b -212c -212d -213a -214a -214b -214c -214d -214e -214f -215a -215b -215c -215d -216a -216b -216c -217a -217b -217c -217d -217e -218a -219a -221a -221b -222a -222b -223a -223b -223c -223d -223e -223f -223g -223h -224a -224b -224c -224d -225a -226a -226b -226c -227a -227b -227c -227d -231a -232a -233a -233b -233c -233d -311a -311b -311c -311d -311e -311f -312a -312b -312c -312d -312e -312f -312g -313a -331a -332a -332b -333a -333b -333c -333d -333e -333f -334a -335a -341a -341b -342a -342e -343a -344a -344b -344c -344d -351a -352a -352b -353a -353b -353c -354a -354b -354c -354d -354g -371a -372a -372b -372c -372d -372e -372f -373a -373b -373c -373d -374a -374b -374c -374d -375a -375b -376a -376b -376c -376d -376e -376f -376g -377a -380a -381a -382a -382b -382c -382d -383a -383b -383c -384a -384b -384c -385a -385b -385c -386a -386d -386e -387a -387b -387c -387d -387e -387f -388a -388b -388c -388d -388e -389a -389b -389c -421a -421b -422a -422b -422c -422d -422e -423a -423b -424a -425a -431a -431b -431c -431d -431e -431f -431g -432a -432b -432c -432d -433a -433b -433c -433d -434a -434b 
-434c -434d -434e -434f -434g -435a -435b -441a -441b -451a -451b -451c -451d -451e -451f -452a -452b -461a -461d -461e -461f -462a -462b -462c -462d -462e -463a -463b -463c -463d -463e -464a -464b -465a -465b -465c -466a -466b -466c -467a -467b -467c -467d -468a -468b -471a -471b -472a -472b -472c -472d -473a -473b -473c -474a -474b -474c -475a -475b -476a -476b -477a -477b -477c -477d -478a -478b -478c -478d -479a -479b -480a -480b -481a -481b -482a -483a -484a -484b -485a -485b -486a -486d -486e -487a -487b -488a -488b -521a -521b -522a -523a -524a -525a -525b -525c -525d -526a -526b -526c -526d -526e -531a -531b -531c -532a -532b -532c -533a -533b -533c -534a -534b -541a -541d -542a -542b -543a -543d -544a -545a -545b -545c -545d -546a -546b -546c -546d -546e -551a -552a -553a -554a -554b -554c -554d -554e -554f -554g -554h -554j -555a -556a -561a -561d -561e -561f -562a -562b -563a -563b -563c -564a -564b -621a -621b -621c -621d -621e -621f -621g -622a -622b -622g -623a -623b -623c -623f -623g -624a -624d -624e -624f -624g -625a -625b -625c -625d -625e -625h -626a -626b -626c -627a -627b -627c -627d -627e -627f -628a -628b -628c -628d -628e -628f -628g -631a -632a -632b -632c -632d -632e -632f -632g -632h -632j -632k -633a -633b -633c -633d -634a -634b -634c -634d -635a -636a -636b -636c -636d -637a -637b -637c -637d -641a -641b -642a -642b -643a -644a -651a -651b -652a -652b -653a -654a -655a -656a -671a -671b -672a -673a -673b -673c -674a -674b -674c -674d -674e -675a -675b -675c -676a -676b -676c -676d -676e -681a -681b -682a -683a -684a -684b -685a -691a -691b -691c -691d -691e -691f -692a -7100 -7200 -7400 -7500 -7700 -7800 -8100 -8300 -8400 -8500 -8600 - diff --git a/csv_detective/detect_fields/FR/other/code_rna/__init__.py b/csv_detective/detect_fields/FR/other/code_rna/__init__.py deleted file mode 100644 index 4d725f33..00000000 --- a/csv_detective/detect_fields/FR/other/code_rna/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeRNA 
- -PROPORTION = 0.9 - -_code_rna = CodeRNA() - - -def _is(val): - return isinstance(val, str) and _code_rna.is_valid(val) diff --git a/csv_detective/detect_fields/FR/other/code_waldec/__init__.py b/csv_detective/detect_fields/FR/other/code_waldec/__init__.py deleted file mode 100644 index 5595d869..00000000 --- a/csv_detective/detect_fields/FR/other/code_waldec/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import re - -PROPORTION = 0.9 -regex = r"^W\d[\dA-Z]\d{7}$" - - -def _is(val): - """Repere le code Waldec""" - return isinstance(val, str) and bool(re.match(regex, val)) diff --git a/csv_detective/detect_fields/FR/other/date_fr/__init__.py b/csv_detective/detect_fields/FR/other/date_fr/__init__.py deleted file mode 100644 index 1d234dda..00000000 --- a/csv_detective/detect_fields/FR/other/date_fr/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import re - -PROPORTION = 1 -regex = ( - r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" - r"|octobre|novembre|decembre)[ \-]\d{4}$" -) - - -def _is(val): - """Repere les dates textuelles FR""" - return isinstance(val, str) and bool(re.match(regex, val)) diff --git a/csv_detective/detect_fields/FR/other/insee_ape700/__init__.py b/csv_detective/detect_fields/FR/other/insee_ape700/__init__.py deleted file mode 100644 index 2accd788..00000000 --- a/csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from os.path import dirname, join - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 -f = open(join(dirname(__file__), "insee_ape700.txt"), "r") -condes_insee_ape = f.read().split("\n") -# removing empty str due to additionnal line in file -del condes_insee_ape[-1] -condes_insee_ape = set(condes_insee_ape) -f.close() - - -def _is(val): - """Repère les codes APE700 de l'INSEE""" - if not isinstance(val, str): - return False - val = _process_text(val).upper() - return val in condes_insee_ape diff --git 
a/csv_detective/detect_fields/FR/other/siret/__init__.py b/csv_detective/detect_fields/FR/other/siret/__init__.py deleted file mode 100644 index e8ac0e98..00000000 --- a/csv_detective/detect_fields/FR/other/siret/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Détection des identifiants SIRET (SIRENE)""" - if not isinstance(val, str): - return False - val = val.replace(" ", "") - if not bool(re.match(r"^[0-9]{14}$", val)): - return False - - # Vérification par clé de luhn du SIREN - cle = 0 - pair = False - for x in val[:9]: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - if cle % 10 != 0: - return cle % 10 == 0 - - # Vérification par clé de luhn du SIRET - cle = 0 - pair = len(val) % 2 == 0 - for x in val: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - return cle % 10 == 0 diff --git a/csv_detective/detect_fields/FR/other/tel_fr/__init__.py b/csv_detective/detect_fields/FR/other/tel_fr/__init__.py deleted file mode 100644 index a232c9b9..00000000 --- a/csv_detective/detect_fields/FR/other/tel_fr/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - -PROPORTION = 0.7 - - -def _is(val): - """Repère les numeros de telephone francais""" - if not isinstance(val, str): - return False - - if len(val) < 10: - return False - - val = val.replace(".", "").replace("-", "").replace(" ", "") - - match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val)) - return match_1 diff --git a/csv_detective/detect_fields/FR/other/uai/__init__.py b/csv_detective/detect_fields/FR/other/uai/__init__.py deleted file mode 100644 index 26bf3beb..00000000 --- a/csv_detective/detect_fields/FR/other/uai/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Repere les codes UAI de l'éducation nationale""" - - # test sur la longueur - if not isinstance(val, str) or len(val) != 8: - return False - - if not 
bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)): - return False - return True diff --git a/csv_detective/detect_fields/FR/temp/__init__.py b/csv_detective/detect_fields/FR/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py b/csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py deleted file mode 100644 index cc711605..00000000 --- a/csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -PROPORTION = 1 -jours = { - "lundi", - "mardi", - "mercredi", - "jeudi", - "vendredi", - "samedi", - "dimanche", - "lun", - "mar", - "mer", - "jeu", - "ven", - "sam", - "dim", -} - - -def _is(val): - """Renvoie True si les champs peuvent être des jours de la semaine""" - if not isinstance(val, str): - return False - val = val.lower() - return val in jours diff --git a/csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py b/csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py deleted file mode 100644 index 26a7449b..00000000 --- a/csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -from unidecode import unidecode - -PROPORTION = 1 -mois = { - "janvier", - "fevrier", - "mars", - "avril", - "mai", - "juin", - "juillet", - "aout", - "septembre", - "octobre", - "novembre", - "decembre", - "jan", - "fev", - "mar", - "avr", - "mai", - "jun", - "jui", - "juil", - "aou", - "sep", - "sept", - "oct", - "nov", - "dec", -} - - -def _is(val): - """Renvoie True si les champs peuvent être des mois de l'année""" - if not isinstance(val, str): - return False - val = unidecode(val.lower()) - return val in mois diff --git a/csv_detective/detect_fields/README.md b/csv_detective/detect_fields/README.md deleted file mode 100644 index 163031fa..00000000 --- a/csv_detective/detect_fields/README.md +++ /dev/null @@ -1,5 +0,0 @@ -Each country (indicated by ISO-code) 
folder corresponds to fields specifc to a country. Other folders contain tests for international standards (e-mails, dates ...). - -## TODO - -Update file "code_postal.txt" in FR.geo.code_postal and reset PROPORTION to 1 \ No newline at end of file diff --git a/csv_detective/detect_fields/__init__.py b/csv_detective/detect_fields/__init__.py deleted file mode 100644 index c47c0019..00000000 --- a/csv_detective/detect_fields/__init__.py +++ /dev/null @@ -1,112 +0,0 @@ -from .FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from .FR.other import ( - code_csp_insee, - code_import, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from .FR.temp import jour_de_la_semaine, mois_de_annee -from .geo import ( - iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from .other import ( - booleen, - email, - float, - int, - json, - money, - mongo_object_id, - percent, - twitter, - url, - uuid, -) -from .temp import date, datetime_aware, datetime_naive, datetime_rfc822, year - -__all__ = [ - "adresse", - "code_commune_insee", - "code_departement", - "code_fantoir", - "code_postal", - "code_region", - "commune", - "departement", - "insee_canton", - "latitude_l93", - "latitude_wgs_fr_metropole", - "longitude_l93", - "longitude_wgs_fr_metropole", - "pays", - "region", - "code_csp_insee", - "code_import", - "code_rna", - "code_waldec", - "csp_insee", - "date_fr", - "insee_ape700", - "sexe", - "siren", - "siret", - "tel_fr", - "uai", - "jour_de_la_semaine", - "mois_de_annee", - "iso_country_code_alpha2", - "iso_country_code_alpha3", - "iso_country_code_numeric", - "json_geojson", - 
"latitude_wgs", - "latlon_wgs", - "longitude_wgs", - "lonlat_wgs", - "booleen", - "email", - "float", - "int", - "json", - "money", - "mongo_object_id", - "percent", - "twitter", - "url", - "uuid", - "date", - "datetime_aware", - "datetime_naive", - "datetime_rfc822", - "year", -] diff --git a/csv_detective/detect_fields/geo/README.md b/csv_detective/detect_fields/geo/README.md deleted file mode 100644 index 2801b48e..00000000 --- a/csv_detective/detect_fields/geo/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for international spacial variables (international codes, spatial coordinates, etc.). diff --git a/csv_detective/detect_fields/geo/__init__.py b/csv_detective/detect_fields/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py deleted file mode 100644 index 916d6352..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") -liste_pays = set(liste_pays) - - -def _is(val): - """Renvoie True si val peut etre un code iso pays alpha-2, False sinon""" - if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)): - return False - return val in liste_pays diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py deleted file mode 100644 index 9d89c15b..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") - - -def 
_is(val): - """Renvoie True si val peut etre un code iso pays alpha-3, False sinon""" - if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)): - return False - return val in set(liste_pays) diff --git a/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py deleted file mode 100644 index a420ba4f..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") -liste_pays = set(liste_pays) - - -def _is(val): - """Renvoie True si val peut etre un code iso pays numerique, False sinon""" - if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)): - return False - return val in liste_pays diff --git a/csv_detective/detect_fields/geo/json_geojson/__init__.py b/csv_detective/detect_fields/geo/json_geojson/__init__.py deleted file mode 100644 index 2f7a06bd..00000000 --- a/csv_detective/detect_fields/geo/json_geojson/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -import json - -PROPORTION = 0.9 - - -def _is(val): - """Renvoie True si val peut etre un geojson""" - - try: - j = json.loads(val) - if isinstance(j, dict): - if "type" in j and "coordinates" in j: - return True - if "geometry" in j and "coordinates" in j["geometry"]: - return True - except Exception: - pass - return False diff --git a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py deleted file mode 100644 index 90a1ed7f..00000000 --- a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une latitude""" - try: - return 
is_float(val) and float(val) >= -90 and float(val) <= 90 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/geo/latlon_wgs/__init__.py b/csv_detective/detect_fields/geo/latlon_wgs/__init__.py deleted file mode 100644 index 5bcc6fc3..00000000 --- a/csv_detective/detect_fields/geo/latlon_wgs/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from ..latitude_wgs import _is as is_lat -from ..longitude_wgs import _is as is_lon - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une latitude,longitude""" - - if not isinstance(val, str) or val.count(",") != 1: - return False - lat, lon = val.split(",") - # handling [lat,lon] - if lat.startswith("[") and lon.endswith("]"): - lat, lon = lat[1:], lon[:-1] - return is_lat(lat) and is_lon(lon.replace(" ", "")) diff --git a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py deleted file mode 100644 index 584e8906..00000000 --- a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude""" - try: - return is_float(val) and float(val) >= -180 and float(val) <= 180 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py b/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py deleted file mode 100644 index 05580850..00000000 --- a/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from ..latitude_wgs import _is as is_lat -from ..longitude_wgs import _is as is_lon - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude,latitude""" - - if not isinstance(val, str) or val.count(",") != 1: - return False - lon, lat = val.split(",") - # handling [lon,lat] - if 
lon.startswith("[") and lat.endswith("]"): - lon, lat = lon[1:], lat[:-1] - return is_lon(lon) and is_lat(lat.replace(" ", "")) diff --git a/csv_detective/detect_fields/other/__init__.py b/csv_detective/detect_fields/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/other/email/__init__.py b/csv_detective/detect_fields/other/email/__init__.py deleted file mode 100644 index 667dd9f3..00000000 --- a/csv_detective/detect_fields/other/email/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -import re - -PROPORTION = 0.9 - - -def _is(val): - """Detects e-mails""" - return isinstance(val, str) and bool( - re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE) - ) diff --git a/csv_detective/detect_fields/other/int/__init__.py b/csv_detective/detect_fields/other/int/__init__.py deleted file mode 100644 index 37b82c83..00000000 --- a/csv_detective/detect_fields/other/int/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -PROPORTION = 1 - - -def _is(val): - """Detects integers""" - if ( - not isinstance(val, str) - or any([v in val for v in [".", "_", "+"]]) - or (val.startswith("0") and len(val) > 1) - ): - return False - try: - int(val) - return True - except ValueError: - return False diff --git a/csv_detective/detect_fields/other/money/__init__.py b/csv_detective/detect_fields/other/money/__init__.py deleted file mode 100644 index ad9c1ef2..00000000 --- a/csv_detective/detect_fields/other/money/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from ..float import _is as is_float - -currencies = set(["€", "$", "£", "¥"]) - -PROPORTION = 0.8 - - -def _is(val: str): - if not isinstance(val, str) or val[-1] not in currencies: - return False - return is_float(val[:-1]) diff --git a/csv_detective/detect_fields/other/mongo_object_id/__init__.py b/csv_detective/detect_fields/other/mongo_object_id/__init__.py deleted file mode 100644 index 4aca7ec2..00000000 --- 
a/csv_detective/detect_fields/other/mongo_object_id/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Detects Mongo ObjectIds""" - return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val)) diff --git a/csv_detective/detect_fields/other/percent/__init__.py b/csv_detective/detect_fields/other/percent/__init__.py deleted file mode 100644 index 9d2620ad..00000000 --- a/csv_detective/detect_fields/other/percent/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from ..float import _is as is_float - -PROPORTION = 0.8 - - -def _is(val: str): - if not isinstance(val, str) or val[-1] != "%": - return False - return is_float(val[:-1]) diff --git a/csv_detective/detect_fields/other/twitter/__init__.py b/csv_detective/detect_fields/other/twitter/__init__.py deleted file mode 100644 index d63c541f..00000000 --- a/csv_detective/detect_fields/other/twitter/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Detects twitter accounts""" - return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val)) diff --git a/csv_detective/detect_fields/other/url/__init__.py b/csv_detective/detect_fields/other/url/__init__.py deleted file mode 100644 index 72bb178f..00000000 --- a/csv_detective/detect_fields/other/url/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -import re - -PROPORTION = 1 -url_pattern = re.compile( - r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})" - r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$" -) - - -def _is(val): - """Detects urls""" - if not isinstance(val, str): - return False - return bool(url_pattern.match(val)) diff --git a/csv_detective/detect_fields/other/uuid/__init__.py b/csv_detective/detect_fields/other/uuid/__init__.py deleted file mode 100644 index 75f39bdd..00000000 --- a/csv_detective/detect_fields/other/uuid/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Detects UUIDs""" - return 
isinstance(val, str) and bool( - re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val) - ) diff --git a/csv_detective/detect_fields/temp/README.md b/csv_detective/detect_fields/temp/README.md deleted file mode 100644 index e9e3c5b7..00000000 --- a/csv_detective/detect_fields/temp/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for international temporal variables (date, time, time zone, etc.). diff --git a/csv_detective/detect_fields/temp/__init__.py b/csv_detective/detect_fields/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py b/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py deleted file mode 100644 index ea2f6078..00000000 --- a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut être une date au format rfc822, False sinon - Exemple: Tue, 19 Dec 2023 15:30:45 +0000""" - - return isinstance(val, str) and bool( - re.match( - r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} " - r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) " - r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$", - val.lower(), - re.IGNORECASE, - ) - ) diff --git a/csv_detective/detect_fields/temp/year/__init__.py b/csv_detective/detect_fields/temp/year/__init__.py deleted file mode 100644 index 79a68e1f..00000000 --- a/csv_detective/detect_fields/temp/year/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -PROPORTION = 1 - - -def _is(val): - """Returns True if val can be a year""" - try: - val = int(val) - except ValueError: - return False - return (1800 <= val) and (val <= 2100) diff --git a/csv_detective/detect_labels/FR/__init__.py b/csv_detective/detect_labels/FR/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/geo/__init__.py 
b/csv_detective/detect_labels/FR/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/geo/adresse/__init__.py b/csv_detective/detect_labels/FR/geo/adresse/__init__.py deleted file mode 100644 index 281f2499..00000000 --- a/csv_detective/detect_labels/FR/geo/adresse/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "adresse", - "adresse postale", - "adresse geographique", - "adr", - "adresse complete", - "adresse station", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py b/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py deleted file mode 100644 index 9421cd4e..00000000 --- a/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code commune insee", - "code insee", - "codes insee", - "code commune", - "code insee commune", - "insee", - "code com", - "com", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_departement/__init__.py b/csv_detective/detect_labels/FR/geo/code_departement/__init__.py deleted file mode 100644 index 1eaacb9b..00000000 --- a/csv_detective/detect_labels/FR/geo/code_departement/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # "dep": Possible confusion with dep name? 
- words_combinations_list = [ - "code departement", - "code_departement", - "dep", - "departement", - "dept", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py b/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py deleted file mode 100644 index 78230a42..00000000 --- a/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "cadastre1", - "code fantoir", - "fantoir", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_postal/__init__.py b/csv_detective/detect_labels/FR/geo/code_postal/__init__.py deleted file mode 100644 index 7c8cfff6..00000000 --- a/csv_detective/detect_labels/FR/geo/code_postal/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code postal", - "postal code", - "postcode", - "post code", - "cp", - "codes postaux", - "location postcode", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_region/__init__.py b/csv_detective/detect_labels/FR/geo/code_region/__init__.py deleted file mode 100644 index a254e7d7..00000000 --- a/csv_detective/detect_labels/FR/geo/code_region/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # "reg" : possible confusion with region name? 
- words_combinations_list = [ - "code region", - "reg", - "code insee region", - "region", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/commune/__init__.py b/csv_detective/detect_labels/FR/geo/commune/__init__.py deleted file mode 100644 index de106e9b..00000000 --- a/csv_detective/detect_labels/FR/geo/commune/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "commune", - "ville", - "libelle commune", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/departement/__init__.py b/csv_detective/detect_labels/FR/geo/departement/__init__.py deleted file mode 100644 index 7d1cd08c..00000000 --- a/csv_detective/detect_labels/FR/geo/departement/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "departement", - "libelle du departement", - "deplib", - "nom dept", - "dept", - "libdepartement", - "nom departement", - "libelle dep", - "libelle departement", - "lb departements", - "dep libusage", - "lb departement", - "nom dep", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py b/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py deleted file mode 100644 index 451f8e1c..00000000 --- a/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "insee canton", - "canton", - "cant", - "nom canton", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py 
b/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py deleted file mode 100644 index 0da69fc3..00000000 --- a/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not always detect CRS - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "y l93", - "coordonnee y", - "latitude lb93", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py deleted file mode 100644 index ce92b90a..00000000 --- a/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index ce92b90a..00000000 --- a/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", 
- "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/pays/__init__.py b/csv_detective/detect_labels/FR/geo/pays/__init__.py deleted file mode 100644 index fb83bac1..00000000 --- a/csv_detective/detect_labels/FR/geo/pays/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "pays", - "payslieu", - "paysorg", - "country", - "pays lib", - "lieupays", - "pays beneficiaire", - "nom du pays", - "journey start country", - "libelle pays", - "journey end country", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/region/__init__.py b/csv_detective/detect_labels/FR/geo/region/__init__.py deleted file mode 100644 index c65603d7..00000000 --- a/csv_detective/detect_labels/FR/geo/region/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "region", - "libelle region", - "nom region", - "libelle reg", - "nom reg", - "reg libusage", - "nom de la region", - "regionorg", - "regionlieu", - "reg", - "nom officiel region", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/__init__.py b/csv_detective/detect_labels/FR/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py deleted file mode 100644 index f11ff450..00000000 --- a/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["code csp insee", "code csp"] - 
return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/code_rna/__init__.py b/csv_detective/detect_labels/FR/other/code_rna/__init__.py deleted file mode 100644 index cf69f302..00000000 --- a/csv_detective/detect_labels/FR/other/code_rna/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code rna", - "rna", - "n° inscription association", - "identifiant association", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/code_waldec/__init__.py b/csv_detective/detect_labels/FR/other/code_waldec/__init__.py deleted file mode 100644 index 9450733d..00000000 --- a/csv_detective/detect_labels/FR/other/code_waldec/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["code waldec", "waldec"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/csp_insee/__init__.py deleted file mode 100644 index 0cae8075..00000000 --- a/csv_detective/detect_labels/FR/other/csp_insee/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # To improve? 
No specific header found in data - words_combinations_list = [ - "csp insee", - "csp", - "categorie socioprofessionnelle", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/date_fr/__init__.py b/csv_detective/detect_labels/FR/other/date_fr/__init__.py deleted file mode 100644 index 10a10891..00000000 --- a/csv_detective/detect_labels/FR/other/date_fr/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # To improve: no header specific to "fr" found in data - words_combinations_list = ["date"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py b/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py deleted file mode 100644 index 58dfb26f..00000000 --- a/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code ape", - "code activite (ape)", - "code naf", - "code naf organisme designe", - "code naf organisme designant", - "base sirene : code ape de l'etablissement siege", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/sexe/__init__.py b/csv_detective/detect_labels/FR/other/sexe/__init__.py deleted file mode 100644 index f4583170..00000000 --- a/csv_detective/detect_labels/FR/other/sexe/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["sexe", "sex", "civilite", "genre", "id sexe"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/siren/__init__.py b/csv_detective/detect_labels/FR/other/siren/__init__.py 
deleted file mode 100644 index e57aa56a..00000000 --- a/csv_detective/detect_labels/FR/other/siren/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "siren", - "siren organisme designe", - "siren organisme designant", - "n° siren", - "siren organisme", - "siren titulaire", - "numero siren", - "epci", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/siret/__init__.py b/csv_detective/detect_labels/FR/other/siret/__init__.py deleted file mode 100644 index 7741596e..00000000 --- a/csv_detective/detect_labels/FR/other/siret/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "siret", - "siret d", - "num siret", - "siretacheteur", - "n° siret", - "coll siret", - "epci", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/tel_fr/__init__.py b/csv_detective/detect_labels/FR/other/tel_fr/__init__.py deleted file mode 100644 index 2cb895a4..00000000 --- a/csv_detective/detect_labels/FR/other/tel_fr/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "telephone", - "tel", - "tel1", - "tel2", - "phone", - "num tel", - "tel mob", - "telephone sav", - "telephone1", - "coordinates.phone", - "telephone du lieu", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/temp/__init__.py b/csv_detective/detect_labels/FR/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py b/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py 
deleted file mode 100644 index db06549a..00000000 --- a/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "jour semaine", - "type jour", - "jour de la semaine", - "saufjour", - "nomjour", - "jour", - "jour de fermeture", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py b/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py deleted file mode 100644 index dd6aca72..00000000 --- a/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["mois de annee", "mois", "month"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/__init__.py b/csv_detective/detect_labels/__init__.py deleted file mode 100644 index c78d34cb..00000000 --- a/csv_detective/detect_labels/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -from .FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from .FR.other import ( - code_csp_insee, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from .FR.temp import jour_de_la_semaine, mois_de_annee -from .geo import ( - iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid -from .temp import date, 
datetime_rfc822, year - -__all__ = [ - "adresse", - "code_commune_insee", - "code_departement", - "code_fantoir", - "code_postal", - "code_region", - "commune", - "departement", - "insee_canton", - "latitude_l93", - "latitude_wgs_fr_metropole", - "longitude_l93", - "longitude_wgs_fr_metropole", - "pays", - "region", - "code_csp_insee", - "code_rna", - "code_waldec", - "csp_insee", - "date_fr", - "insee_ape700", - "sexe", - "siren", - "siret", - "tel_fr", - "uai", - "iso_country_code_alpha2", - "iso_country_code_alpha3", - "iso_country_code_numeric", - "json_geojson", - "latitude_wgs", - "latlon_wgs", - "longitude_wgs", - "lonlat_wgs", - "jour_de_la_semaine", - "mois_de_annee", - "booleen", - "email", - "float", - "int", - "money", - "mongo_object_id", - "twitter", - "url", - "uuid", - "date", - "datetime_rfc822", - "year", -] diff --git a/csv_detective/detect_labels/geo/__init__.py b/csv_detective/detect_labels/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py deleted file mode 100644 index 55077b29..00000000 --- a/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py deleted file mode 100644 index 55077b29..00000000 --- a/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def 
_is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py deleted file mode 100644 index 55077b29..00000000 --- a/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/json_geojson/__init__.py b/csv_detective/detect_labels/geo/json_geojson/__init__.py deleted file mode 100644 index f1b298f4..00000000 --- a/csv_detective/detect_labels/geo/json_geojson/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "json geojson", - "json", - "geojson", - "geo shape", - "geom", - "geometry", - "geo shape", - "geoshape", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/latlon_wgs/__init__.py b/csv_detective/detect_labels/geo/latlon_wgs/__init__.py deleted file mode 100644 index c78c3535..00000000 --- a/csv_detective/detect_labels/geo/latlon_wgs/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - -COMMON_COORDS_LABELS = [ - "ban", - "coordinates", - "coordonnees", - "coordonnees insee", - "geo", - "geopoint", - "geoloc", - "geolocalisation", - "geom", - "geometry", - "gps", - "localisation", - "point", - "position", - "wgs84", -] 
- -specific = [ - "latlon", - "lat lon", - "x y", - "xy", -] - -# we aim wide to catch exact matches if possible for the highest possible score -words = ( - COMMON_COORDS_LABELS - + specific - + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] -) - - -def _is(header: str) -> float: - return header_score(header, words) diff --git a/csv_detective/detect_labels/geo/longitude_wgs/__init__.py b/csv_detective/detect_labels/geo/longitude_wgs/__init__.py deleted file mode 100644 index a30b67b7..00000000 --- a/csv_detective/detect_labels/geo/longitude_wgs/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py b/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py deleted file mode 100644 index ef529c82..00000000 --- a/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from csv_detective.parsing.text import header_score - -from ..latlon_wgs import COMMON_COORDS_LABELS - -PROPORTION = 0.5 - -specific = [ - "lonlat", - "lon lat", - "y x", - "yx", -] - -# we aim wide to catch exact matches if possible for the highest possible score -words = ( - COMMON_COORDS_LABELS - + specific - + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] -) - - -def _is(header: str) -> float: - return header_score(header, words) diff --git a/csv_detective/detect_labels/other/__init__.py b/csv_detective/detect_labels/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/other/booleen/__init__.py 
b/csv_detective/detect_labels/other/booleen/__init__.py deleted file mode 100644 index 307378ea..00000000 --- a/csv_detective/detect_labels/other/booleen/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["is ", "has ", "est "] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/email/__init__.py b/csv_detective/detect_labels/other/email/__init__.py deleted file mode 100644 index de771fe8..00000000 --- a/csv_detective/detect_labels/other/email/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "email", - "mail", - "courriel", - "contact", - "mel", - "lieucourriel", - "coordinates.emailcontact", - "e mail", - "mo mail", - "adresse mail", - "adresse email", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/float/__init__.py b/csv_detective/detect_labels/other/float/__init__.py deleted file mode 100644 index 354814c2..00000000 --- a/csv_detective/detect_labels/other/float/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["part", "ratio", "taux"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/int/__init__.py b/csv_detective/detect_labels/other/int/__init__.py deleted file mode 100644 index 74b3586c..00000000 --- a/csv_detective/detect_labels/other/int/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["nb", "nombre", "nbre"] - return header_score(header, words_combinations_list) diff --git 
a/csv_detective/detect_labels/other/money/__init__.py b/csv_detective/detect_labels/other/money/__init__.py deleted file mode 100644 index 8944b79d..00000000 --- a/csv_detective/detect_labels/other/money/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["budget", "salaire", "euro", "euros", "prêt", "montant"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/mongo_object_id/__init__.py b/csv_detective/detect_labels/other/mongo_object_id/__init__.py deleted file mode 100644 index b110538f..00000000 --- a/csv_detective/detect_labels/other/mongo_object_id/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["id", "objectid"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/twitter/__init__.py b/csv_detective/detect_labels/other/twitter/__init__.py deleted file mode 100644 index 9b6c5a31..00000000 --- a/csv_detective/detect_labels/other/twitter/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["twitter", "twitter account", "twitter username"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/url/__init__.py b/csv_detective/detect_labels/other/url/__init__.py deleted file mode 100644 index cc51d569..00000000 --- a/csv_detective/detect_labels/other/url/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "url", - "url source", - "site web", - "source url", - "site internet", - "remote url", - "web", - "site", 
- "lien", - "site data", - "lien url", - "lien vers le fichier", - "sitweb", - "interneturl", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/uuid/__init__.py b/csv_detective/detect_labels/other/uuid/__init__.py deleted file mode 100644 index c05eeed9..00000000 --- a/csv_detective/detect_labels/other/uuid/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["id", "uuid", "guid"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/__init__.py b/csv_detective/detect_labels/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/temp/date/__init__.py b/csv_detective/detect_labels/temp/date/__init__.py deleted file mode 100644 index ffe2673b..00000000 --- a/csv_detective/detect_labels/temp/date/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "date", - "jour", - "date de mise a jour", - "sns date", - "date maj", - "rem date", - "periode", - "date de publication", - "dpc", - "extract date", - "date immatriculation", - "date jeu donnees", - "datemaj", - "dateouv", - "date der maj", - "dmaj", - "jour", - "yyyymmdd", - "aaaammjj", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py b/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py deleted file mode 100644 index ea968b46..00000000 --- a/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "datetime", - "timestamp", - "osm_timestamp", - 
"date", - "created at", - "last update", - "date maj", - "createdat", - "date naissance", - "date donnees", - ] # Almost same as IS0, no example in data - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/year/__init__.py b/csv_detective/detect_labels/temp/year/__init__.py deleted file mode 100644 index 24976e6e..00000000 --- a/csv_detective/detect_labels/temp/year/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "year", - "annee", - "annee depot", - "an nais", - "exercice", - "data year", - "annee de publication", - "exercice comptable", - "annee de naissance", - "annee ouverture", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index 1fe6e66d..7c6e9a28 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -7,7 +7,7 @@ detect_categorical_variable, # detect_continuous_variable, ) -from csv_detective.load_tests import return_all_tests +from csv_detective.format import Format, FormatsManager from csv_detective.output.utils import prepare_output_dict from csv_detective.parsing.columns import ( MAX_NUMBER_CATEGORICAL_VALUES, @@ -16,12 +16,13 @@ test_label, ) +fmtm = FormatsManager() def detect_formats( table: pd.DataFrame, analysis: dict, file_path: str, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, skipna: bool = True, verbose: bool = False, @@ -29,15 +30,14 @@ def detect_formats( in_chunks = analysis.get("total_lines") is None # list testing to be performed - all_tests_fields = return_all_tests( - user_input_tests, detect_type="detect_fields" - ) # list all tests for the fields - all_tests_labels = return_all_tests( - user_input_tests, detect_type="detect_labels" - ) # list all tests for the 
labels + formats: dict[str, Format] = ( + fmtm.get_formats_from_tags(tags) + if tags is not None + else fmtm.formats + ) # if no testing then return - if not all_tests_fields and not all_tests_labels: + if len(formats) == 0: return analysis, None # Perform testing on fields @@ -45,7 +45,7 @@ def detect_formats( # table is small enough to be tested in one go scores_table_fields = test_col( table=table, - all_tests=all_tests_fields, + formats=formats, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -62,7 +62,7 @@ def detect_formats( table=table, file_path=file_path, analysis=analysis, - all_tests=all_tests_fields, + formats=formats, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -71,7 +71,7 @@ def detect_formats( # Perform testing on labels scores_table_labels = test_label( - analysis["header"], all_tests_labels, limited_output, verbose=verbose + analysis["header"], formats, limited_output, verbose=verbose ) analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output) diff --git a/csv_detective/explore_csv.py b/csv_detective/explore_csv.py index afd791a9..241d04a0 100644 --- a/csv_detective/explore_csv.py +++ b/csv_detective/explore_csv.py @@ -15,7 +15,7 @@ def routine( file_path: str, num_rows: int = 500, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, save_results: bool | str = True, encoding: str | None = None, @@ -35,7 +35,7 @@ def routine( file_path: local path or URL to file num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file - user_input_tests: tests to run on the file + tags: tags to filter formats limited_output: whether or not to return all possible types or only the most likely one for each column save_results: whether or not to save the results in a json file, or the path where to dump the output output_profile: whether or not to add the 'profile' field to the output @@ -74,7 +74,7 @@ def 
routine( table=table, analysis=analysis, file_path=file_path, - user_input_tests=user_input_tests, + tags=tags, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -107,7 +107,7 @@ def validate_then_detect( file_path: str, previous_analysis: dict, num_rows: int = 500, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, save_results: bool | str = True, skipna: bool = True, @@ -140,7 +140,7 @@ def validate_then_detect( table=table, analysis=analysis, file_path=file_path, - user_input_tests=user_input_tests, + tags=tags, limited_output=limited_output, skipna=skipna, verbose=verbose, diff --git a/csv_detective/format.py b/csv_detective/format.py index 2b5a2db5..c35ce99d 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -8,24 +8,25 @@ def __init__( self, name: str, func: Callable, + _test_values: dict[bool, list[str]], labels: list[str] = [], proportion: float = 1, tags: list[str] = [], ) -> None: - self.name = name - self.func = func - self.labels = labels + self.name: str = name + self.func: Callable = func + self._test_values: dict[bool, list[str]] = _test_values + self.labels: list[str] = labels self.proportion: float = proportion - self.tags = tags - - def is_valid_value(self, val: str) -> bool: - return self.func(val) + self.tags: list[str] = tags def is_valid_label(self, val: str) -> float: return header_score(val, self.labels) class FormatsManager: + formats: dict[str, Format] + def __init__(self) -> None: import csv_detective.formats as formats format_labels = [ @@ -33,10 +34,11 @@ def __init__(self) -> None: if "_is" in dir(getattr(formats, f)) ] assert len(format_labels) == len(set(format_labels)), "Format labels must be unique" - self.formats = [ - Format( + self.formats = { + label: Format( name=label, func=(module := getattr(formats, label))._is, + _test_values=module._test_values, **{ attr: val for attr in ["labels", "proportion", "tags"] @@ -44,7 +46,15 @@ def 
__init__(self) -> None: }, ) for label in format_labels - ] + } - def get_formats_from_tags(self, tags: list[str]) -> list[Format]: - return [f for f in self.formats if all(tag in f.tags for tag in tags)] + def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]: + # allowed to skip with -temp + return { + label: fmt + for label, fmt in self.formats.items() + if all(tag in fmt.tags for tag in tags) + } + + def available_tags(self) -> set[str]: + return set(tag for format in self.formats.values() for tag in format.tags) diff --git a/csv_detective/detect_fields/FR/geo/adresse/__init__.py b/csv_detective/formats/adresse.py old mode 100644 new mode 100755 similarity index 79% rename from csv_detective/detect_fields/FR/geo/adresse/__init__.py rename to csv_detective/formats/adresse.py index 9233ca2f..0cabdb6a --- a/csv_detective/detect_fields/FR/geo/adresse/__init__.py +++ b/csv_detective/formats/adresse.py @@ -1,100 +1,116 @@ -from csv_detective.parsing.text import _process_text - -PROPORTION = 0.55 -# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long -voies = { - "aire ", - "allee ", - "avenue ", - "base ", - "boulevard ", - "cami ", - "carrefour ", - "chemin ", - "cheminement ", - "chaussee ", - "cite ", - "clos ", - "coin ", - "corniche ", - "cote ", - "cour ", - "cours ", - "domaine ", - "descente ", - "ecart ", - "esplanade ", - "faubourg ", - "gare ", - "grande rue", - "hameau ", - "halle ", - "ilot ", - "impasse ", - "lieu dit", - "lotissement ", - "marche ", - "montee ", - "parc ", - "passage ", - "place ", - "plan ", - "plaine ", - "plateau ", - "pont ", - "port ", - "promenade ", - "parvis ", - "quartier ", - "quai ", - "residence ", - "ruelle ", - "rocade ", - "rond point", - "route ", - "rue ", - # 'sente - sentier', - "square ", - "tour ", - # 'terre-plein', - "traverse ", - "villa ", - "village ", - "voie ", - "zone artisanale", - "zone d’amenagement concerte", - "zone d’amenagement 
differe", - "zone industrielle", - "zone ", - # 'r', - "av ", - "pl ", - "bd ", - "cami ", - # 'che', - "chs ", - "dom ", - "ham ", - "ld ", - # 'pro', - # 'rte', - "vlge ", - "za ", - "zac ", - "zad ", - "zi ", - # 'car', - "fg ", - # 'lot', - "imp ", - # 'qu', - "mte", -} - - -def _is(val): - """Repere des adresses""" - if not isinstance(val, str) or len(val) > 150: - return False - val = _process_text(val) - return any(x in val for x in voies) +from csv_detective.parsing.text import _process_text + +proportion = 0.55 +tags = ["fr", "geo"] +labels = [ + "adresse", + "localisation", + "adresse postale", + "adresse geographique", + "adr", + "adresse complete", + "adresse station", +] + +voies = { + "aire ", + "allee ", + "avenue ", + "base ", + "boulevard ", + "cami ", + "carrefour ", + "chemin ", + "cheminement ", + "chaussee ", + "cite ", + "clos ", + "coin ", + "corniche ", + "cote ", + "cour ", + "cours ", + "domaine ", + "descente ", + "ecart ", + "esplanade ", + "faubourg ", + "gare ", + "grande rue", + "hameau ", + "halle ", + "ilot ", + "impasse ", + "lieu dit", + "lotissement ", + "marche ", + "montee ", + "parc ", + "passage ", + "place ", + "plan ", + "plaine ", + "plateau ", + "pont ", + "port ", + "promenade ", + "parvis ", + "quartier ", + "quai ", + "residence ", + "ruelle ", + "rocade ", + "rond point", + "route ", + "rue ", + # 'sente - sentier', + "square ", + "tour ", + # 'terre-plein', + "traverse ", + "villa ", + "village ", + "voie ", + "zone artisanale", + "zone d’amenagement concerte", + "zone d’amenagement differe", + "zone industrielle", + "zone ", + # 'r', + "av ", + "pl ", + "bd ", + "cami ", + # 'che', + "chs ", + "dom ", + "ham ", + "ld ", + # 'pro', + # 'rte', + "vlge ", + "za ", + "zac ", + "zad ", + "zi ", + # 'car', + "fg ", + # 'lot', + "imp ", + # 'qu', + "mte", +} + + +def _is(val): + """Repere des adresses""" + if not isinstance(val, str) or len(val) > 150: + return False + val = _process_text(val) + return any(x in val for x 
in voies) + + +_test_values = { + True: ["rue du martyr"], + False: ["un batiment"], +} diff --git a/csv_detective/detect_fields/other/booleen/__init__.py b/csv_detective/formats/booleen.py old mode 100644 new mode 100755 similarity index 66% rename from csv_detective/detect_fields/other/booleen/__init__.py rename to csv_detective/formats/booleen.py index a1e7426c..077d5658 --- a/csv_detective/detect_fields/other/booleen/__init__.py +++ b/csv_detective/formats/booleen.py @@ -1,27 +1,34 @@ -PROPORTION = 1 -bool_mapping = { - "1": True, - "0": False, - "vrai": True, - "faux": False, - "true": True, - "false": False, - "oui": True, - "non": False, - "yes": True, - "no": False, - "y": True, - "n": False, - "o": True, -} - -liste_bool = set(bool_mapping.keys()) - - -def bool_casting(val: str) -> bool: - return bool_mapping.get(val.lower()) - - -def _is(val: str) -> bool: - """Détecte les booléens""" - return isinstance(val, str) and val.lower() in liste_bool +proportion = 1 +labels = ["is ", "has ", "est "] + +bool_mapping = { + "1": True, + "0": False, + "vrai": True, + "faux": False, + "true": True, + "false": False, + "oui": True, + "non": False, + "yes": True, + "no": False, + "y": True, + "n": False, + "o": True, +} + +liste_bool = set(bool_mapping.keys()) + + +def bool_casting(val: str) -> bool: + return bool_mapping.get(val.lower()) + + +def _is(val): + return isinstance(val, str) and val.lower() in liste_bool + + +_test_values = { + True: ["oui", "0", "1", "yes", "false", "True"], + False: ["nein", "ja", "2", "-0"], +} diff --git a/csv_detective/formats/code_commune_insee.py b/csv_detective/formats/code_commune_insee.py new file mode 100755 index 00000000..44f2b9f0 --- /dev/null +++ b/csv_detective/formats/code_commune_insee.py @@ -0,0 +1,26 @@ +from frformat import CodeCommuneInsee, Millesime + +proportion = 0.75 +tags = ["fr", "geo"] +labels = [ + "code commune insee", + "code insee", + "codes insee", + "code commune", + "code insee commune", + "insee", + 
"code com", + "com", +] + +_code_commune_insee = CodeCommuneInsee(Millesime.LATEST) + + +def _is(val): + return isinstance(val, str) and _code_commune_insee.is_valid(val) + + +_test_values = { + True: ["91471", "01053"], + False: ["914712", "01000"], +} diff --git a/csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py b/csv_detective/formats/code_csp_insee.py old mode 100644 new mode 100755 similarity index 73% rename from csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py rename to csv_detective/formats/code_csp_insee.py index abedb1cf..954d3a17 --- a/csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +++ b/csv_detective/formats/code_csp_insee.py @@ -1,29 +1,37 @@ -import re - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 - - -def _is(val): - """Repère les code csp telles que définies par l'INSEE""" - if not isinstance(val, str): - return False - val = _process_text(val) - if len(val) != 4: - return False - a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val)) - b = val in { - "7100", - "7200", - "7400", - "7500", - "7700", - "7800", - "8100", - "8300", - "8400", - "8500", - "8600", - } - return a or b +import re + +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = ["code csp insee", "code csp" +] + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val) + if len(val) != 4: + return False + a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val)) + b = val in { + "7100", + "7200", + "7400", + "7500", + "7700", + "7800", + "8100", + "8300", + "8400", + "8500", + "8600", + } + return a or b + + +_test_values = { + True: ["121f"], + False: ["121x"], +} diff --git a/csv_detective/detect_fields/FR/geo/code_departement/__init__.py b/csv_detective/formats/code_departement.py old mode 100644 new mode 100755 similarity index 55% rename from csv_detective/detect_fields/FR/geo/code_departement/__init__.py rename to 
csv_detective/formats/code_departement.py index 2edb50b8..c7823f3b --- a/csv_detective/detect_fields/FR/geo/code_departement/__init__.py +++ b/csv_detective/formats/code_departement.py @@ -1,15 +1,29 @@ -from frformat import Millesime, NumeroDepartement, Options - -PROPORTION = 1 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_numero_departement = NumeroDepartement(Millesime.LATEST, _options) - - -def _is(val): - return isinstance(val, str) and _numero_departement.is_valid(val) +from frformat import Millesime, NumeroDepartement, Options + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "code departement", + "code_departement", + "dep", + "departement", + "dept", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_numero_departement = NumeroDepartement(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _numero_departement.is_valid(val) + + +_test_values = { + True: ["75", "2A", "2b", "974", "01"], + False: ["00", "96", "101"], +} diff --git a/csv_detective/formats/code_fantoir.py b/csv_detective/formats/code_fantoir.py new file mode 100755 index 00000000..cfbe30a5 --- /dev/null +++ b/csv_detective/formats/code_fantoir.py @@ -0,0 +1,21 @@ +from frformat import CodeFantoir + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "cadastre1", + "code fantoir", + "fantoir", +] + +_code_fantoir = CodeFantoir() + + +def _is(val): + return isinstance(val, str) and _code_fantoir.is_valid(val) + + +_test_values = { + True: ["7755A", "B150B", "ZA04C", "ZB03D"], + False: ["7755", "ZA99A"], +} diff --git a/csv_detective/detect_fields/FR/other/code_import/__init__.py b/csv_detective/formats/code_import.py old mode 100644 new mode 100755 similarity index 51% rename from csv_detective/detect_fields/FR/other/code_import/__init__.py rename to 
csv_detective/formats/code_import.py index 023442c4..a49c5e29 --- a/csv_detective/detect_fields/FR/other/code_import/__init__.py +++ b/csv_detective/formats/code_import.py @@ -1,9 +1,18 @@ -import re - -PROPORTION = 0.9 -regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" - - -def _is(val): - """Repere le code Import (ancien RNA)""" - return isinstance(val, str) and bool(re.match(regex, val)) +import re + +proportion = 0.9 +tags = ["fr"] +labels = [ +] + +regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" + + +def _is(val): + return isinstance(val, str) and bool(re.match(regex, val)) + + +_test_values = { + True: ["123S1871092288"], + False: ["AA751PEE00188854", "W123456789"], +} diff --git a/csv_detective/formats/code_postal.py b/csv_detective/formats/code_postal.py new file mode 100755 index 00000000..1608ebcb --- /dev/null +++ b/csv_detective/formats/code_postal.py @@ -0,0 +1,25 @@ +from frformat import CodePostal + +proportion = 0.9 +tags = ["fr", "geo"] +labels = [ + "code postal", + "postal code", + "postcode", + "post code", + "cp", + "codes postaux", + "location postcode", +] + +_code_postal = CodePostal() + + +def _is(val): + return isinstance(val, str) and _code_postal.is_valid(val) + + +_test_values = { + True: ["75020", "01000"], + False: ["77777", "018339"], +} diff --git a/csv_detective/formats/code_region.py b/csv_detective/formats/code_region.py new file mode 100755 index 00000000..72552e9f --- /dev/null +++ b/csv_detective/formats/code_region.py @@ -0,0 +1,22 @@ +from frformat import CodeRegion, Millesime + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "code region", + "reg", + "code insee region", + "region", +] + +_code_region = CodeRegion(Millesime.LATEST) + + +def _is(val): + return isinstance(val, str) and _code_region.is_valid(val) + + +_test_values = { + True: ["32"], + False: ["55"], +} diff --git a/csv_detective/formats/code_rna.py 
b/csv_detective/formats/code_rna.py new file mode 100755 index 00000000..32bf5c89 --- /dev/null +++ b/csv_detective/formats/code_rna.py @@ -0,0 +1,29 @@ +from frformat import CodeRNA + +proportion = 0.9 +tags = ["fr"] +labels = [ + "code rna", + "rna", + "n° inscription association", + "identifiant association", +] + +_code_rna = CodeRNA() + + +def _is(val): + return isinstance(val, str) and _code_rna.is_valid(val) + + +_test_values = { + True: ["W751515517"], + False: [ + "W111111111111111111111111111111111111", + "w143788974", + "W12", + "678W23456", + "165789325", + "Wa1#89sf&h", + ], +} diff --git a/csv_detective/formats/code_waldec.py b/csv_detective/formats/code_waldec.py new file mode 100755 index 00000000..b078035b --- /dev/null +++ b/csv_detective/formats/code_waldec.py @@ -0,0 +1,18 @@ +from frformat import CodeRNA + +proportion = 0.9 +tags = ["fr"] +labels = ["code waldec", "waldec" +] + +_code_rna = CodeRNA() + + +def _is(val): + return isinstance(val, str) and _code_rna.is_valid(val) + + +_test_values = { + True: ["W123456789", "W2D1234567"], + False: ["AA751PEE00188854"], +} diff --git a/csv_detective/detect_fields/FR/geo/commune/__init__.py b/csv_detective/formats/commune.py old mode 100644 new mode 100755 similarity index 58% rename from csv_detective/detect_fields/FR/geo/commune/__init__.py rename to csv_detective/formats/commune.py index e27bdf97..05c2f1a1 --- a/csv_detective/detect_fields/FR/geo/commune/__init__.py +++ b/csv_detective/formats/commune.py @@ -1,16 +1,27 @@ -from frformat import Commune, Millesime, Options - -PROPORTION = 0.9 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_commune = Commune(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des communes""" - return isinstance(val, str) and _commune.is_valid(val) +from frformat import Commune, Millesime, Options + +proportion = 0.8 +tags = ["fr", "geo"] +labels = [ + 
"commune", + "ville", + "libelle commune", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_commune = Commune(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _commune.is_valid(val) + + +_test_values = { + True: ["saint denis"], + False: ["new york", "lion"], +} diff --git a/csv_detective/detect_fields/FR/other/csp_insee/__init__.py b/csv_detective/formats/csp_insee.py old mode 100644 new mode 100755 similarity index 54% rename from csv_detective/detect_fields/FR/other/csp_insee/__init__.py rename to csv_detective/formats/csp_insee.py index f2801895..ea5bffd0 --- a/csv_detective/detect_fields/FR/other/csp_insee/__init__.py +++ b/csv_detective/formats/csp_insee.py @@ -1,19 +1,31 @@ -from os.path import dirname, join - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 -f = open(join(dirname(__file__), "csp_insee.txt"), "r") -codes_insee = f.read().split("\n") -# removing empty str due to additionnal line in file -del codes_insee[-1] -codes_insee = set(codes_insee) -f.close() - - -def _is(val): - """Repère les csp telles que définies par l'INSEE""" - if not isinstance(val, str): - return False - val = _process_text(val) - return val in codes_insee +from os.path import dirname, join + +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = [ + "csp insee", + "csp", + "categorie socioprofessionnelle", +] + +f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r") +codes_insee = f.read().split("\n") +# removing empty str due to additionnal line in file +del codes_insee[-1] +codes_insee = set(codes_insee) +f.close() + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val) + return val in codes_insee + + +_test_values = { + True: ["employes de la poste"], + False: ["super-heros"], +} diff --git 
a/csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt b/csv_detective/formats/data/csp_insee.txt similarity index 100% rename from csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt rename to csv_detective/formats/data/csp_insee.txt diff --git a/csv_detective/detect_fields/FR/other/insee_ape700/insee_ape700.txt b/csv_detective/formats/data/insee_ape700.txt old mode 100644 new mode 100755 similarity index 100% rename from csv_detective/detect_fields/FR/other/insee_ape700/insee_ape700.txt rename to csv_detective/formats/data/insee_ape700.txt diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha2/iso_country_code_alpha2.txt b/csv_detective/formats/data/iso_country_code_alpha2.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_alpha2/iso_country_code_alpha2.txt rename to csv_detective/formats/data/iso_country_code_alpha2.txt diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt b/csv_detective/formats/data/iso_country_code_alpha3.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt rename to csv_detective/formats/data/iso_country_code_alpha3.txt diff --git a/csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt b/csv_detective/formats/data/iso_country_code_numeric.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt rename to csv_detective/formats/data/iso_country_code_numeric.txt diff --git a/csv_detective/detect_fields/temp/date/__init__.py b/csv_detective/formats/date.py old mode 100644 new mode 100755 similarity index 66% rename from csv_detective/detect_fields/temp/date/__init__.py rename to csv_detective/formats/date.py index b6a66cfb..5819259e --- a/csv_detective/detect_fields/temp/date/__init__.py +++ b/csv_detective/formats/date.py @@ -1,62 +1,103 @@ -import re -from datetime 
import datetime - -from dateparser import parse as date_parser -from dateutil.parser import ParserError -from dateutil.parser import parse as dateutil_parser - -PROPORTION = 1 -# /!\ this is only for dates, not datetimes which are handled by other utils - - -def date_casting(val: str) -> datetime | None: - """For performance reasons, we try first with dateutil and fallback on dateparser""" - try: - return dateutil_parser(val) - except ParserError: - return date_parser(val) - except Exception: - return None - - -seps = r"[\s/\-\*_\|;.,]" -# matches JJ-MM-AAAA with any of the listed separators -jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace( - "SEP", seps -) -# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR -aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace( - "SEP", seps + "?" -) -# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR -string_month_pattern = ( - r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr" - r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|" - r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP" - r"([0-9]{2}$|(19|20)[0-9]{2}$)" -).replace("SEP", seps + "?") - -threshold = 0.3 - - -def _is(val): - """Renvoie True si val peut être une date, False sinon""" - # early stops, to cut processing time - if not isinstance(val, str) or len(val) > 20 or len(val) < 8: - return False - # if it's a usual date pattern - if any( - # with this syntax, if any of the first value is True, the next ones are not computed - [ - bool(re.match(jjmmaaaa_pattern, val)) - or bool(re.match(aaaammjj_pattern, val)) - or bool(re.match(string_month_pattern, val, re.IGNORECASE)) - ] - ): - return True - if sum([char.isdigit() for char in val]) / len(val) < threshold: - return False - res = date_casting(val) - if not res or res.hour or res.minute or res.second: - return False - return True 
+import re +from datetime import datetime + +from dateparser import parse as date_parser +from dateutil.parser import ParserError +from dateutil.parser import parse as dateutil_parser + +proportion = 1 +tags = ["temp"] +labels = [ + "date", + "jour", + "date de mise a jour", + "sns date", + "date maj", + "rem date", + "periode", + "date de publication", + "dpc", + "extract date", + "date immatriculation", + "date jeu donnees", + "datemaj", + "dateouv", + "date der maj", + "dmaj", + "jour", + "yyyymmdd", + "aaaammjj", +] + +def date_casting(val: str) -> datetime | None: + """For performance reasons, we try first with dateutil and fallback on dateparser""" + try: + return dateutil_parser(val) + except ParserError: + return date_parser(val) + except Exception: + return None + + +threshold = 0.3 +seps = r"[\s/\-\*_\|;.,]" +# matches JJ-MM-AAAA with any of the listed separators +jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace( + "SEP", seps +) +# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR +aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace( + "SEP", seps + "?" 
+) +# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR +string_month_pattern = ( + r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr" + r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|" + r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP" + r"([0-9]{2}$|(19|20)[0-9]{2}$)" +).replace("SEP", seps + "?") + + + +def _is(val): + # early stops, to cut processing time + if not isinstance(val, str) or len(val) > 20 or len(val) < 8: + return False + # if it's a usual date pattern + if any( + # with this syntax, if any of the first value is True, the next ones are not computed + [ + bool(re.match(jjmmaaaa_pattern, val)) + or bool(re.match(aaaammjj_pattern, val)) + or bool(re.match(string_month_pattern, val, re.IGNORECASE)) + ] + ): + return True + if sum([char.isdigit() for char in val]) / len(val) < threshold: + return False + res = date_casting(val) + if not res or res.hour or res.minute or res.second: + return False + return True + + +_test_values = { + True: [ + "1960-08-07", + "12/02/2007", + "15 jan 1985", + "15 décembre 1985", + "02 05 2003", + "20030502", + "1993-12/02", + ], + False: [ + "1993-1993-1993", + "39-10-1993", + "19-15-1993", + "15 tambour 1985", + "12152003", + "20031512", + "02052003", + ], +} diff --git a/csv_detective/formats/date_fr.py b/csv_detective/formats/date_fr.py new file mode 100755 index 00000000..e12ba32c --- /dev/null +++ b/csv_detective/formats/date_fr.py @@ -0,0 +1,21 @@ +import re + +proportion = 1 +tags = ["fr", "temp"] +labels = ["date" +] + +pattern = ( + r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" + r"|octobre|novembre|decembre)[ \-]\d{4}$" +) + + +def _is(val): + return isinstance(val, str) and bool(re.match(pattern, val)) + + +_test_values = { + True: ["13 février 1996"], + False: ["44 march 2025"], +} diff --git a/csv_detective/detect_fields/temp/datetime_aware/__init__.py b/csv_detective/formats/datetime_aware.py 
similarity index 63% rename from csv_detective/detect_fields/temp/datetime_aware/__init__.py rename to csv_detective/formats/datetime_aware.py index 5f7470a6..4414bf03 100755 --- a/csv_detective/detect_fields/temp/datetime_aware/__init__.py +++ b/csv_detective/formats/datetime_aware.py @@ -1,12 +1,13 @@ import re -from typing import Any -from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting +from .date import aaaammjj_pattern, date_casting -PROPORTION = 1 -threshold = 0.7 +proportion = 1 +tags = ["temp"] +labels = [ +] -# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR +threshold = 0.7 pat = ( aaaammjj_pattern.replace("$", "") + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})" @@ -14,8 +15,7 @@ ) -def _is(val: Any | None) -> bool: - """Detects timezone-aware datetimes only""" +def _is(val): # early stops, to cut processing time # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack @@ -32,3 +32,15 @@ def _is(val: Any | None) -> bool: and bool(res.hour or res.minute or res.second or res.microsecond) and bool(res.tzinfo) ) + + +_test_values = { + True: [ + "2021-06-22 10:20:10-04:00", + "2030-06-22 00:00:00.0028+02:00", + "2000-12-21 10:20:10.1Z", + "2024-12-19T10:53:36.428000+00:00", + "1996/06/22 10:20:10 GMT", + ], + False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], +} diff --git a/csv_detective/detect_fields/temp/datetime_naive/__init__.py b/csv_detective/formats/datetime_naive.py similarity index 63% rename from csv_detective/detect_fields/temp/datetime_naive/__init__.py rename to csv_detective/formats/datetime_naive.py index 464cb2ad..491de27b 100755 --- a/csv_detective/detect_fields/temp/datetime_naive/__init__.py +++ b/csv_detective/formats/datetime_naive.py @@ -1,9 +1,12 @@ import re from typing import 
Any -from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting +from .date import aaaammjj_pattern, date_casting -PROPORTION = 1 +proportion = 1 +tags = ["temp"] +labels = [ +] threshold = 0.7 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR @@ -27,3 +30,20 @@ def _is(val: Any | None) -> bool: return False res = date_casting(val) return res is not None and not bool(res.tzinfo) + + +_test_values = { + True: [ + "2021-06-22 10:20:10", + "2030/06-22 00:00:00", + "2030/06/22 00:00:00.0028", + ], + False: [ + "2021-06-22T30:20:10", + "Sun, 06 Nov 1994 08:49:37 GMT", + "2021-06-44 10:20:10+02:00", + "1999-12-01T00:00:00Z", + "2021-06-44", + "15 décembre 1985", + ], +} diff --git a/csv_detective/formats/datetime_rfc822.py b/csv_detective/formats/datetime_rfc822.py new file mode 100755 index 00000000..d044c475 --- /dev/null +++ b/csv_detective/formats/datetime_rfc822.py @@ -0,0 +1,34 @@ +import re + +proportion = 1 +tags = ["temp"] +labels = [ + "datetime", + "timestamp", + "osm_timestamp", + "date", + "created at", + "last update", + "date maj", + "createdat", + "date naissance", + "date donnees", +] + + +def _is(val): + return isinstance(val, str) and bool( + re.match( + r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} " + r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) " + r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$", + val.lower(), + re.IGNORECASE, + ) + ) + + +_test_values = { + True: ["Sun, 06 Nov 1994 08:49:37 GMT"], + False: ["2021-06-22T10:20:10"], +} diff --git a/csv_detective/formats/departement.py b/csv_detective/formats/departement.py new file mode 100755 index 00000000..e2c4930b --- /dev/null +++ b/csv_detective/formats/departement.py @@ -0,0 +1,37 @@ +from frformat import Departement, Millesime, Options + +proportion = 0.9 +tags = ["fr", "geo"] +labels = [ + "departement", + "libelle du departement", + "deplib", + "nom dept", + "dept", + 
"libdepartement", + "nom departement", + "libelle dep", + "libelle departement", + "lb departements", + "dep libusage", + "lb departement", + "nom dep", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_departement = Departement(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _departement.is_valid(val) + + +_test_values = { + True: ["essonne"], + False: ["alabama", "auvergne"], +} diff --git a/csv_detective/formats/email.py b/csv_detective/formats/email.py new file mode 100755 index 00000000..329567f3 --- /dev/null +++ b/csv_detective/formats/email.py @@ -0,0 +1,29 @@ +import re + +proportion = 0.9 +tags = [] +labels = [ + "email", + "mail", + "courriel", + "contact", + "mel", + "lieucourriel", + "coordinates.emailcontact", + "e mail", + "mo mail", + "adresse mail", + "adresse email", +] + + +def _is(val): + return isinstance(val, str) and bool( + re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE) + ) + + +_test_values = { + True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"], + False: ["cdo@@gouv.sfd"], +} diff --git a/csv_detective/detect_fields/other/float/__init__.py b/csv_detective/formats/float.py old mode 100644 new mode 100755 similarity index 67% rename from csv_detective/detect_fields/other/float/__init__.py rename to csv_detective/formats/float.py index f11606e7..b903e2be --- a/csv_detective/detect_fields/other/float/__init__.py +++ b/csv_detective/formats/float.py @@ -1,21 +1,29 @@ -PROPORTION = 1 - - -def float_casting(val: str) -> float: - return float(val.replace(",", ".")) - - -def _is(val): - """Detects floats, assuming that tables will not have scientific - notations (3e6) or "+" in the string. 
"-" is still accepted.""" - try: - if ( - not isinstance(val, str) - or any([k in val for k in ["_", "+", "e", "E"]]) - or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","]) - ): - return False - float_casting(val) - return True - except ValueError: - return False +proportion = 1 +tags = [] +labels = ["part", "ratio", "taux"] + + +def float_casting(val: str) -> float: + return float(val.replace(",", ".")) + + +def _is(val): + """Detects floats, assuming that tables will not have scientific + notations (3e6) or "+" in the string. "-" is still accepted.""" + try: + if ( + not isinstance(val, str) + or any([k in val for k in ["_", "+", "e", "E"]]) + or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","]) + ): + return False + float_casting(val) + return True + except ValueError: + return False + + +_test_values = { + True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], + False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], +} diff --git a/csv_detective/formats/geojson.py b/csv_detective/formats/geojson.py new file mode 100755 index 00000000..574c4168 --- /dev/null +++ b/csv_detective/formats/geojson.py @@ -0,0 +1,36 @@ +import json + +proportion = 1 +tags = ["geo"] +labels = [ + "json geojson", + "json", + "geojson", + "geo shape", + "geom", + "geometry", + "geo shape", + "geoshape", +] + + +def _is(val) -> bool: + try: + j = json.loads(val) + if isinstance(j, dict): + if "type" in j and "coordinates" in j: + return True + if "geometry" in j and "coordinates" in j["geometry"]: + return True + except Exception: + pass + return False + + +_test_values = { + True: [ + '{"coordinates": [45.783753, 3.049342], "type": "63870"}', + '{"geometry": {"coordinates": [45.783753, 3.049342]}}', + ], + False: ['{"pomme": "fruit", "reponse": 42}'], +} diff --git a/csv_detective/formats/insee_ape700.py b/csv_detective/formats/insee_ape700.py new file mode 100755 index 00000000..e2b16201 --- /dev/null +++ 
b/csv_detective/formats/insee_ape700.py @@ -0,0 +1,31 @@ +from os.path import dirname, join + +from csv_detective.parsing.text import _process_text + +proportion = 0.8 +tags = ["fr"] +labels = [ + "code ape", + "code activite (ape)", + "code naf", + "code naf organisme designe", + "code naf organisme designant", + "base sirene : code ape de l'etablissement siege", +] + +f = open(join(dirname(__file__), "data", "insee_ape700.txt"), "r") +condes_insee_ape = f.read().split("\n") +# removing empty str due to additionnal line in file +del condes_insee_ape[-1] +condes_insee_ape = set(condes_insee_ape) +f.close() + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val).upper() + return val in condes_insee_ape + + +_test_values = {True: ["0116Z"], False: ["0116A"]} diff --git a/csv_detective/detect_fields/FR/geo/insee_canton/__init__.py b/csv_detective/formats/insee_canton.py old mode 100644 new mode 100755 similarity index 57% rename from csv_detective/detect_fields/FR/geo/insee_canton/__init__.py rename to csv_detective/formats/insee_canton.py index d18d3ac1..72c470da --- a/csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +++ b/csv_detective/formats/insee_canton.py @@ -1,15 +1,28 @@ -from frformat import Canton, Millesime, Options - -PROPORTION = 0.9 -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_canton = Canton(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des cantons""" - return isinstance(val, str) and _canton.is_valid(val) +from frformat import Canton, Millesime, Options + +proportion = 0.9 +tags = ["fr", "geo"] +labels = [ + "insee canton", + "canton", + "cant", + "nom canton", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_canton = Canton(Millesime.LATEST, _options) + + +def _is(val): + return 
isinstance(val, str) and _canton.is_valid(val) + + +_test_values = { + True: ["nantua"], + False: ["california"], +} diff --git a/csv_detective/formats/int.py b/csv_detective/formats/int.py index 9d9ec7b0..b6ecf328 100755 --- a/csv_detective/formats/int.py +++ b/csv_detective/formats/int.py @@ -14,3 +14,9 @@ def _is(val): return True except ValueError: return False + + +_test_values = { + True: ["1", "0", "1764", "-24"], + False: ["01053", "1.2", "123_456", "+35"], +} diff --git a/csv_detective/formats/iso_country_code_alpha2.py b/csv_detective/formats/iso_country_code_alpha2.py new file mode 100755 index 00000000..2633cff3 --- /dev/null +++ b/csv_detective/formats/iso_country_code_alpha2.py @@ -0,0 +1,30 @@ +import re +from os.path import dirname, join + +proportion = 1 +tags = ["geo"] +labels = [ + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", +] + +with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile: + liste_pays = iofile.read().split("\n") +liste_pays = set(liste_pays) + + +def _is(val): + if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)): + return False + return val in liste_pays + + +_test_values = { + True: ["FR"], + False: ["XX", "A", "FRA"], +} diff --git a/csv_detective/formats/iso_country_code_alpha3.py b/csv_detective/formats/iso_country_code_alpha3.py new file mode 100755 index 00000000..50f745db --- /dev/null +++ b/csv_detective/formats/iso_country_code_alpha3.py @@ -0,0 +1,30 @@ +import re +from os.path import dirname, join + +proportion = 1 +tags = ["geo"] +labels = [ + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", +] + +with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile: + liste_pays = iofile.read().split("\n") + + +def _is(val): + """Renvoie True si val peut etre un code iso pays alpha-3, False sinon""" + if not isinstance(val, str) or 
not bool(re.match(r"[A-Z]{3}$", val)): + return False + return val in set(liste_pays) + + +_test_values = { + True: ["FRA"], + False: ["XXX", "FR", "A"], +} diff --git a/csv_detective/formats/iso_country_code_numeric.py b/csv_detective/formats/iso_country_code_numeric.py new file mode 100755 index 00000000..32c68d3b --- /dev/null +++ b/csv_detective/formats/iso_country_code_numeric.py @@ -0,0 +1,31 @@ +import re +from os.path import dirname, join + +proportion = 1 +tags = ["geo"] +labels = [ + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", +] + +with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile: + liste_pays = iofile.read().split("\n") +liste_pays = set(liste_pays) + + +def _is(val): + """Renvoie True si val peut etre un code iso pays numerique, False sinon""" + if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)): + return False + return val in liste_pays + + +_test_values = { + True: ["250"], + False: ["003"], +} diff --git a/csv_detective/formats/jour_de_la_semaine.py b/csv_detective/formats/jour_de_la_semaine.py new file mode 100755 index 00000000..2f09ac5b --- /dev/null +++ b/csv_detective/formats/jour_de_la_semaine.py @@ -0,0 +1,43 @@ + + +proportion = 0.8 +tags = ["fr", "temp"] +labels = [ + "jour semaine", + "type jour", + "jour de la semaine", + "saufjour", + "nomjour", + "jour", + "jour de fermeture", +] + +jours = { + "lundi", + "mardi", + "mercredi", + "jeudi", + "vendredi", + "samedi", + "dimanche", + "lun", + "mar", + "mer", + "jeu", + "ven", + "sam", + "dim", +} + + +def _is(val): + if not isinstance(val, str): + return False + val = val.lower() + return val in jours + + +_test_values = { + True: ["lundi"], + False: ["jour de la biere"], +} diff --git a/csv_detective/detect_fields/other/json/__init__.py b/csv_detective/formats/json.py old mode 100644 new mode 100755 similarity index 58% rename from 
csv_detective/detect_fields/other/json/__init__.py rename to csv_detective/formats/json.py index e4051b1f..d7f09722 --- a/csv_detective/detect_fields/other/json/__init__.py +++ b/csv_detective/formats/json.py @@ -1,14 +1,22 @@ -import json -from json import JSONDecodeError - -PROPORTION = 1 - - -def _is(val): - """Detects json""" - try: - loaded = json.loads(val) - # we don't want to consider integers for instance - return isinstance(loaded, (list, dict)) - except (JSONDecodeError, TypeError): - return False +import json +from json import JSONDecodeError + +proportion = 1 +tags = [] +labels = [ +] + + +def _is(val): + try: + loaded = json.loads(val) + # we don't want to consider integers for instance + return isinstance(loaded, (list, dict)) + except (JSONDecodeError, TypeError): + return False + + +_test_values = { + True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], + False: ["5", '{"zefib":', '{"a"}'], +} diff --git a/csv_detective/formats/latitude_l93.py b/csv_detective/formats/latitude_l93.py new file mode 100755 index 00000000..475cc112 --- /dev/null +++ b/csv_detective/formats/latitude_l93.py @@ -0,0 +1,48 @@ +from frformat import LatitudeL93 + +from .float import _is as is_float +from .float import float_casting + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "y l93", + "coordonnee y", + "latitude lb93", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", +] + +_latitudel93 = LatitudeL93() + + +def _is(val): + try: + if isinstance(val, str) and is_float(val): + return _latitudel93.is_valid(float_casting(val)) + + return False + + except (ValueError, OverflowError): + return False + + +_test_values = { + True: ["6037008", "7123528.5", "7124528,5"], + False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], +} diff --git 
a/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/formats/latitude_wgs.py old mode 100644 new mode 100755 similarity index 55% rename from csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py rename to csv_detective/formats/latitude_wgs.py index ae4b6afb..ca3e700f --- a/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ b/csv_detective/formats/latitude_wgs.py @@ -1,30 +1,42 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", - ] - return header_score(header, words_combinations_list) +from .float import _is as is_float + +proportion = 1 +tags = ["geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= -90 and float(val) <= 90 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["43.2", "-22"], + False: ["100"], +} diff --git a/csv_detective/detect_labels/geo/latitude_wgs/__init__.py b/csv_detective/formats/latitude_wgs_fr_metropole.py old mode 100644 new mode 100755 similarity index 54% rename from csv_detective/detect_labels/geo/latitude_wgs/__init__.py rename to csv_detective/formats/latitude_wgs_fr_metropole.py index ae4b6afb..1c77d04a --- 
a/csv_detective/detect_labels/geo/latitude_wgs/__init__.py +++ b/csv_detective/formats/latitude_wgs_fr_metropole.py @@ -1,30 +1,42 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", - ] - return header_score(header, words_combinations_list) +from .float import _is as is_float + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["42.5"], + False: ["22.5", "62.5"], +} diff --git a/csv_detective/formats/latlon_wgs.py b/csv_detective/formats/latlon_wgs.py new file mode 100755 index 00000000..1d535c62 --- /dev/null +++ b/csv_detective/formats/latlon_wgs.py @@ -0,0 +1,53 @@ +from .latitude_wgs import _is as is_lat +from .longitude_wgs import _is as is_lon + +proportion = 1 +tags = ["geo"] + +COMMON_COORDS_LABELS = [ + "ban", + "coordinates", + "coordonnees", + "coordonnees insee", + "geo", + "geopoint", + "geoloc", + "geolocalisation", + "geom", + "geometry", + "gps", + "localisation", + "point", + "position", + "wgs84", +] + +specific = [ + "latlon", + "lat lon", + "x y", + "xy", +] + +# we aim wide to catch exact 
matches if possible for the highest possible score +labels = ( + COMMON_COORDS_LABELS + + specific + + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] +) + + +def _is(val): + if not isinstance(val, str) or val.count(",") != 1: + return False + lat, lon = val.split(",") + # handling [lat,lon] + if lat.startswith("[") and lon.endswith("]"): + lat, lon = lat[1:], lon[:-1] + return is_lat(lat) and is_lon(lon.replace(" ", "")) + + +_test_values = { + True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], + False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], +} diff --git a/csv_detective/formats/longitude_l93.py b/csv_detective/formats/longitude_l93.py new file mode 100755 index 00000000..8172a771 --- /dev/null +++ b/csv_detective/formats/longitude_l93.py @@ -0,0 +1,39 @@ +from frformat import LongitudeL93 + +from .float import _is as is_float +from .float import float_casting + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", +] + +_longitudel93 = LongitudeL93() + + +def _is(val): + try: + if isinstance(val, str) and is_float(val): + return _longitudel93.is_valid(float_casting(val)) + + return False + + except (ValueError, OverflowError): + return False + + +_test_values = { + True: ["0", "-154", "1265783,45", "34723.4"], + False: ["1456669.8", "-776225", "346_3214"], +} diff --git a/csv_detective/formats/longitude_wgs.py b/csv_detective/formats/longitude_wgs.py new file mode 100755 index 00000000..c2de9425 --- /dev/null +++ b/csv_detective/formats/longitude_wgs.py @@ -0,0 +1,32 @@ +from .float import _is as is_float + +proportion = 1 +tags = ["geo"] +labels = [ + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= 
-180 and float(val) <= 180 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["120", "-20.2"], + False: ["-200"], +} diff --git a/csv_detective/formats/longitude_wgs_fr_metropole.py b/csv_detective/formats/longitude_wgs_fr_metropole.py new file mode 100755 index 00000000..7145abb3 --- /dev/null +++ b/csv_detective/formats/longitude_wgs_fr_metropole.py @@ -0,0 +1,32 @@ +from .float import _is as is_float + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["-2.5"], + False: ["12.8"], +} diff --git a/csv_detective/formats/lonlat_wgs.py b/csv_detective/formats/lonlat_wgs.py new file mode 100755 index 00000000..54de3709 --- /dev/null +++ b/csv_detective/formats/lonlat_wgs.py @@ -0,0 +1,36 @@ +from .latitude_wgs import _is as is_lat +from .longitude_wgs import _is as is_lon +from .latlon_wgs import COMMON_COORDS_LABELS + +proportion = 1 +tags = ["geo"] + +specific = [ + "lonlat", + "lon lat", + "y x", + "yx", +] + +# we aim wide to catch exact matches if possible for the highest possible score +words = ( + COMMON_COORDS_LABELS + + specific + + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] +) + + +def _is(val): + if not isinstance(val, str) or val.count(",") != 1: + return False + lon, lat = val.split(",") + # handling [lon,lat] + if lon.startswith("[") and lat.endswith("]"): + lon, lat = lon[1:], lat[:-1] + return is_lon(lon) and is_lat(lat.replace(" ", "")) + + +_test_values = { + True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], + False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], +} diff --git 
a/csv_detective/formats/mois_de_lannee.py b/csv_detective/formats/mois_de_lannee.py index 334f8bb2..d0320bdd 100755 --- a/csv_detective/formats/mois_de_lannee.py +++ b/csv_detective/formats/mois_de_lannee.py @@ -1,7 +1,9 @@ from unidecode import unidecode proportion = 1 -labels = ["fr", "temp"] +tags = ["fr", "temp"] +labels = ["mois", "month"] + mois = { "janvier", "fevrier", @@ -38,3 +40,9 @@ def _is(val): return False val = unidecode(val.lower()) return val in mois + + +_test_values = { + True: ["JUIN", "décembre"], + False: ["november"], +} diff --git a/csv_detective/formats/money.py b/csv_detective/formats/money.py new file mode 100755 index 00000000..6539fcd4 --- /dev/null +++ b/csv_detective/formats/money.py @@ -0,0 +1,19 @@ +from .float import _is as is_float + +proportion = 0.8 +labels = ["budget", "salaire", "euro", "euros", "prêt", "montant" +] + +currencies = {"€", "$", "£", "¥"} + + +def _is(val): + if not isinstance(val, str) or val[-1] not in currencies: + return False + return is_float(val[:-1]) + + +_test_values = { + True: ["120€", "-20.2$"], + False: ["200", "100 euros"], +} diff --git a/csv_detective/formats/mongo_object_id.py b/csv_detective/formats/mongo_object_id.py new file mode 100755 index 00000000..bff75806 --- /dev/null +++ b/csv_detective/formats/mongo_object_id.py @@ -0,0 +1,15 @@ +import re + +proportion = 0.8 +labels = ["id", "objectid" +] + + +def _is(val): + return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val)) + + +_test_values = { + True: ["62320e50f981bc2b57bcc044"], + False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], +} diff --git a/csv_detective/formats/pays.py b/csv_detective/formats/pays.py new file mode 100755 index 00000000..edb139f7 --- /dev/null +++ b/csv_detective/formats/pays.py @@ -0,0 +1,35 @@ +from frformat import Millesime, Options, Pays + +proportion = 0.6 +tags = ["fr", "geo"] +labels = [ + "pays", + "payslieu", + "paysorg", + "country", + "pays lib", + "lieupays", + "pays 
beneficiaire", + "nom du pays", + "journey start country", + "libelle pays", + "journey end country", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_pays = Pays(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _pays.is_valid(val) + + +_test_values = { + True: ["france", "italie"], + False: ["amerique", "paris"], +} diff --git a/csv_detective/formats/percent.py b/csv_detective/formats/percent.py new file mode 100755 index 00000000..1f0c0231 --- /dev/null +++ b/csv_detective/formats/percent.py @@ -0,0 +1,18 @@ +from .float import _is as is_float + +proportion = 0.8 +tags = [] +labels = [ +] + + +def _is(val): + if not isinstance(val, str) or val[-1] != "%": + return False + return is_float(val[:-1]) + + +_test_values = { + True: ["120%", "-20.2%"], + False: ["200", "100 pourcents"], +} diff --git a/csv_detective/detect_fields/FR/geo/region/__init__.py b/csv_detective/formats/region.py old mode 100644 new mode 100755 similarity index 69% rename from csv_detective/detect_fields/FR/geo/region/__init__.py rename to csv_detective/formats/region.py index dbcbc8db..0300ffbf --- a/csv_detective/detect_fields/FR/geo/region/__init__.py +++ b/csv_detective/formats/region.py @@ -1,50 +1,70 @@ -from frformat import Millesime, Options, Region - -PROPORTION = 1 - -_extra_valid_values_set = frozenset( - { - "alsace", - "aquitaine", - "ara", - "aura", - "auvergne", - "auvergne et rhone alpes", - "basse normandie", - "bfc", - "bourgogne", - "bourgogne et franche comte", - "centre", - "champagne ardenne", - "franche comte", - "ge", - "haute normandie", - "hdf", - "languedoc roussillon", - "limousin", - "lorraine", - "midi pyrenees", - "nord pas de calais", - "npdc", - "paca", - "picardie", - "poitou charentes", - "reunion", - "rhone alpes", - } -) - - -_options = Options( - ignore_case=True, - ignore_accents=True, - 
replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, - extra_valid_values=_extra_valid_values_set, -) -_region = Region(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des regions""" - return isinstance(val, str) and _region.is_valid(val) +from frformat import Millesime, Options, Region + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "region", + "libelle region", + "nom region", + "libelle reg", + "nom reg", + "reg libusage", + "nom de la region", + "regionorg", + "regionlieu", + "reg", + "nom officiel region", +] + +_extra_valid_values_set = frozenset( + { + "alsace", + "aquitaine", + "ara", + "aura", + "auvergne", + "auvergne et rhone alpes", + "basse normandie", + "bfc", + "bourgogne", + "bourgogne et franche comte", + "centre", + "champagne ardenne", + "franche comte", + "ge", + "haute normandie", + "hdf", + "languedoc roussillon", + "limousin", + "lorraine", + "midi pyrenees", + "nord pas de calais", + "npdc", + "paca", + "picardie", + "poitou charentes", + "reunion", + "rhone alpes", + } +) + + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, + extra_valid_values=_extra_valid_values_set, +) +_region = Region(Millesime.LATEST, _options) + + +def _is(val): + """Match avec le nom des regions""" + return isinstance(val, str) and _region.is_valid(val) + + +_test_values = { + True: ["bretagne", "ile-de-france"], + False: ["baviere", "overgne"], +} diff --git a/csv_detective/detect_fields/FR/other/sexe/__init__.py b/csv_detective/formats/sexe.py old mode 100644 new mode 100755 similarity index 55% rename from csv_detective/detect_fields/FR/other/sexe/__init__.py rename to csv_detective/formats/sexe.py index 185b65cf..cab962b8 --- a/csv_detective/detect_fields/FR/other/sexe/__init__.py +++ b/csv_detective/formats/sexe.py @@ -1,11 +1,19 @@ -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 - - -def _is(val): - 
"""Repère le sexe""" - if not isinstance(val, str): - return False - val = _process_text(val) - return val in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = ["sexe", "sex", "civilite", "genre", "id sexe" +] + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val) + return val in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} + + +_test_values = { + True: ["hfemme", "H"], + False: ["adulte"], +} diff --git a/csv_detective/detect_fields/FR/other/siren/__init__.py b/csv_detective/formats/siren.py old mode 100644 new mode 100755 similarity index 53% rename from csv_detective/detect_fields/FR/other/siren/__init__.py rename to csv_detective/formats/siren.py index 58f4426e..6d73c169 --- a/csv_detective/detect_fields/FR/other/siren/__init__.py +++ b/csv_detective/formats/siren.py @@ -1,20 +1,37 @@ -import re - -PROPORTION = 0.9 - - -def _is(val): - """Repere les codes SIREN""" - if not isinstance(val, str): - return False - val = val.replace(" ", "") - if not bool(re.match(r"^[0-9]{9}$", val)): - return False - # Vérification par clé propre aux codes siren - cle = 0 - pair = False - for x in val: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - return cle % 10 == 0 +import re + +proportion = 0.9 +tags = ["fr"] +labels = [ + "siren", + "siren organisme designe", + "siren organisme designant", + "n° siren", + "siren organisme", + "siren titulaire", + "numero siren", + "epci", +] + + +def _is(val): + """Repere les codes SIREN""" + if not isinstance(val, str): + return False + val = val.replace(" ", "") + if not bool(re.match(r"^[0-9]{9}$", val)): + return False + # Vérification par clé propre aux codes siren + cle = 0 + pair = False + for x in val: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + return cle % 10 == 0 + + +_test_values = { + True: ["552 100 554", "552100554"], + 
False: ["42"], +} diff --git a/csv_detective/formats/siret.py b/csv_detective/formats/siret.py index b99cc381..7cee4a78 100755 --- a/csv_detective/formats/siret.py +++ b/csv_detective/formats/siret.py @@ -1,7 +1,7 @@ import re proportion = 0.8 - +tags = ["fr"] labels = [ "siret", "siret d", @@ -12,8 +12,6 @@ "epci", ] -tags = ["fr"] - def _is(val): """Détection des identifiants SIRET (SIRENE)""" @@ -41,3 +39,9 @@ def _is(val): cle += y // 10 + y % 10 pair = not pair return cle % 10 == 0 + + +_test_values = { + True: ["13002526500013", "130 025 265 00013"], + False: ["13002526500012"], +} diff --git a/csv_detective/formats/tel_fr.py b/csv_detective/formats/tel_fr.py new file mode 100755 index 00000000..a09fb11b --- /dev/null +++ b/csv_detective/formats/tel_fr.py @@ -0,0 +1,36 @@ +import re + +proportion = 0.7 +tags = ["fr"] +labels = [ + "telephone", + "tel", + "tel1", + "tel2", + "phone", + "num tel", + "tel mob", + "telephone sav", + "telephone1", + "coordinates.phone", + "telephone du lieu", +] + + +def _is(val): + if not isinstance(val, str): + return False + + if len(val) < 10: + return False + + val = val.replace(".", "").replace("-", "").replace(" ", "") + + match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val)) + return match_1 + + +_test_values = { + True: ["0134643467"], + False: ["6625388263", "01288398"], +} diff --git a/csv_detective/detect_labels/FR/other/uai/__init__.py b/csv_detective/formats/uai.py old mode 100644 new mode 100755 similarity index 51% rename from csv_detective/detect_labels/FR/other/uai/__init__.py rename to csv_detective/formats/uai.py index 58860339..145f8021 --- a/csv_detective/detect_labels/FR/other/uai/__init__.py +++ b/csv_detective/formats/uai.py @@ -1,25 +1,36 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "uai", - "code etablissement", - "code uai", - "uai - identifiant", - "numero uai", - "rne", - "numero de 
l'etablissement", - "code rne", - "codeetab", - "code uai de l'etablissement", - "ref uai", - "cd rne", - "numerouai", - "numero d etablissement", - "code etablissement", - "numero etablissement", - ] - return header_score(header, words_combinations_list) +import re + +proportion = 0.8 +tags = ["fr"] +labels = [ + "uai", + "code etablissement", + "code uai", + "uai - identifiant", + "numero uai", + "rne", + "numero de l'etablissement", + "code rne", + "codeetab", + "code uai de l'etablissement", + "ref uai", + "cd rne", + "numerouai", + "numero d etablissement", + "code etablissement", + "numero etablissement", +] + + +def _is(val): + if not isinstance(val, str) or len(val) != 8: + return False + if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)): + return False + return True + + +_test_values = { + True: ["0422170F"], + False: ["04292E"], +} diff --git a/csv_detective/formats/url.py b/csv_detective/formats/url.py new file mode 100755 index 00000000..eec411a3 --- /dev/null +++ b/csv_detective/formats/url.py @@ -0,0 +1,46 @@ +import re + +proportion = 1 +tags = [] +labels = [ + "url", + "url source", + "site web", + "source url", + "site internet", + "remote url", + "web", + "site", + "lien", + "site data", + "lien url", + "lien vers le fichier", + "sitweb", + "interneturl", +] + +pattern = re.compile( + r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})" + r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$" +) + + +def _is(val): + if not isinstance(val, str): + return False + return bool(pattern.match(val)) + + +_test_values = { + True: [ + "www.data.gouv.fr", + "http://data.gouv.fr", + "https://www.youtube.com/@data-gouv-fr", + ( + "https://tabular-api.data.gouv.fr/api/resources/" + "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" + "?score__greater=0.9&decompte__exact=13" + ), + ], + False: ["tmp@data.gouv.fr"], +} diff --git a/csv_detective/formats/username.py b/csv_detective/formats/username.py new file mode 100755 index 
00000000..cab1240b --- /dev/null +++ b/csv_detective/formats/username.py @@ -0,0 +1,16 @@ +import re + +proportion = 1 +tags = [] +labels = ["account", "username", "user" +] + + +def _is(val): + return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val)) + + +_test_values = { + True: ["@accueil1"], + False: ["adresse@mail"], +} diff --git a/csv_detective/formats/uuid.py b/csv_detective/formats/uuid.py new file mode 100755 index 00000000..4d4aaf6d --- /dev/null +++ b/csv_detective/formats/uuid.py @@ -0,0 +1,18 @@ +import re + +proportion = 0.8 +tags = [] +labels = [ +] + + +def _is(val) -> bool: + return isinstance(val, str) and bool( + re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val) + ) + + +_test_values = { + True: ["884762be-51f3-44c3-b811-1e14c5d89262"], + False: ["0610928327"], +} diff --git a/csv_detective/formats/year.py b/csv_detective/formats/year.py new file mode 100755 index 00000000..b3a1bf20 --- /dev/null +++ b/csv_detective/formats/year.py @@ -0,0 +1,28 @@ +proportion = 1 +tags = ["temp"] +labels = [ + "year", + "annee", + "annee depot", + "an nais", + "exercice", + "data year", + "annee de publication", + "exercice comptable", + "annee de naissance", + "annee ouverture", +] + + +def _is(val): + try: + val = int(val) + except ValueError: + return False + return (1800 <= val) and (val <= 2100) + + +_test_values = { + True: ["2015"], + False: ["20166", "123"], +} diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py deleted file mode 100755 index 36a5c032..00000000 --- a/csv_detective/load_tests.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -from csv_detective import detect_fields, detect_labels # noqa - - -def get_all_packages(detect_type) -> list: - root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type - modules = [] - for dirpath, _, filenames in os.walk(root_dir): - for filename in filenames: - file = os.path.join(dirpath, filename).replace(root_dir, "") - 
if file.endswith("__init__.py"): - module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1] - if module: - modules.append(detect_type + module) - return modules - - -def return_all_tests( - user_input_tests: str | list, - detect_type: str, -) -> dict[str, dict]: - """ - returns all tests that have a method _is and are listed in the user_input_tests - the function can select a sub_package from csv_detective - user_input_tests may look like this: - - "ALL": all possible tests are made - - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"): - this specifc (group of) test(s) only - - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip - specific (groups of) tests by add "-" at the start (e.g "-temp.date") - """ - assert detect_type in ["detect_fields", "detect_labels"] - all_packages = get_all_packages(detect_type=detect_type) - - if isinstance(user_input_tests, str): - user_input_tests = [user_input_tests] - if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests): - tests_to_do = [detect_type] - else: - tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"] - tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"] - # removing specified (groups of) tests - all_tests = [ - # this is why we need to import detect_fields/labels - eval(x) - for x in all_packages - if any([y == x[: len(y)] for y in tests_to_do]) - and all([y != x[: len(y)] for y in tests_skipped]) - ] - return { - test.__name__.split(".")[-1]: { - "func": test._is, - "prop": test.PROPORTION, - "module": test, - } - for test in all_tests - if "_is" in dir(test) - } diff --git a/csv_detective/output/dataframe.py b/csv_detective/output/dataframe.py index 50b6909b..6981e9e6 100755 --- a/csv_detective/output/dataframe.py +++ b/csv_detective/output/dataframe.py @@ -5,9 +5,9 @@ import pandas as pd -from 
csv_detective.detect_fields.other.booleen import bool_casting -from csv_detective.detect_fields.other.float import float_casting -from csv_detective.detect_fields.temp.date import date_casting +from csv_detective.formats.booleen import bool_casting +from csv_detective.formats.date import date_casting +from csv_detective.formats.float import float_casting from csv_detective.parsing.csv import CHUNK_SIZE from csv_detective.utils import display_logs_depending_process_time diff --git a/csv_detective/output/profile.py b/csv_detective/output/profile.py index 5b45216a..2f3bd452 100755 --- a/csv_detective/output/profile.py +++ b/csv_detective/output/profile.py @@ -1,12 +1,11 @@ import logging from collections import defaultdict from time import time -from typing import Optional import numpy as np import pandas as pd -from csv_detective.detect_fields.other.float import float_casting +from csv_detective.formats.float import float_casting from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time @@ -17,7 +16,7 @@ def create_profile( limited_output: bool = True, cast_json: bool = True, verbose: bool = False, - _col_values: Optional[dict[str, pd.Series]] = None, + _col_values: dict[str, pd.Series] | None = None, ) -> dict: if verbose: start = time() diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 4d4bee32..0a55ed3d 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -5,6 +5,7 @@ import pandas as pd from more_itertools import peekable +from csv_detective.format import Format from csv_detective.parsing.csv import CHUNK_SIZE from csv_detective.utils import display_logs_depending_process_time @@ -14,15 +15,13 @@ def test_col_val( serie: pd.Series, - test_func: Callable, - proportion: float = 0.9, + format: Format, skipna: bool = True, limited_output: bool = False, verbose: bool = False, ) -> float: """Tests values of the serie using test_func. 
- - skipna : if True indicates that NaNs are not counted as False - - proportion : indicates the proportion of values that have to pass the test + - skipna : if True indicates that NaNs are considered True for the serie to be detected as a certain format """ if verbose: @@ -34,28 +33,28 @@ def apply_test_func(serie: pd.Series, test_func: Callable, _range: int): try: if skipna: - serie = serie[serie.notnull()] + serie = serie.loc[serie.notnull()] ser_len = len(serie) if ser_len == 0: # being here means the whole column is NaN, so if skipna it's a pass return 1.0 if skipna else 0.0 if not limited_output: - result = apply_test_func(serie, test_func, ser_len).sum() / ser_len - return result if result >= proportion else 0.0 + result = apply_test_func(serie, format.func, ser_len).sum() / ser_len + return result if result >= format.proportion else 0.0 else: - if proportion == 1: + if format.proportion == 1: # early stops (1 then 5 rows) to not waste time if directly unsuccessful for _range in [ min(1, ser_len), min(5, ser_len), ser_len, ]: - if not all(apply_test_func(serie, test_func, _range)): + if not all(apply_test_func(serie, format.func, _range)): return 0.0 return 1.0 else: - result = apply_test_func(serie, test_func, ser_len).sum() / ser_len - return result if result >= proportion else 0.0 + result = apply_test_func(serie, format.func, ser_len).sum() / ser_len + return result if result >= format.proportion else 0.0 finally: if verbose and time() - start > 3: display_logs_depending_process_time( @@ -64,42 +63,27 @@ def apply_test_func(serie: pd.Series, test_func: Callable, _range: int): ) -def test_col_label( - label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False -): - """Tests label (from header) using test_func. 
- - proportion : indicates the minimum score to pass the test for the serie - to be detected as a certain format - """ - if not limited_output: - return test_func(label) - else: - result = test_func(label) - return result if result >= proportion else 0 - - def test_col( table: pd.DataFrame, - all_tests: dict[str, dict], + formats: dict[str, Format], limited_output: bool, skipna: bool = True, verbose: bool = False, ): if verbose: start = time() - logging.info("Testing columns to get types") + logging.info("Testing columns to get formats") return_table = pd.DataFrame(columns=table.columns) - for idx, (name, attributes) in enumerate(all_tests.items()): + for idx, (label, format) in enumerate(formats.items()): if verbose: start_type = time() - logging.info(f"\t- Starting with type '{name}'") + logging.info(f"\t- Starting with format '{label}'") # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory # => the following needs to change, "apply" means all columns are tested for one type at once - return_table.loc[name] = table.apply( + return_table.loc[label] = table.apply( lambda serie: test_col_val( serie, - attributes["func"], - attributes["prop"], + format, skipna=skipna, limited_output=limited_output, verbose=verbose, @@ -107,7 +91,7 @@ def test_col( ) if verbose: display_logs_depending_process_time( - f'\t> Done with type "{name}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})', + f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})', time() - start_type, ) if verbose: @@ -118,23 +102,23 @@ def test_col( def test_label( - columns: list[str], all_tests: dict[str, dict], limited_output: bool, verbose: bool = False + columns: list[str], formats: dict[str, Format], limited_output: bool, verbose: bool = False ): if verbose: start = time() logging.info("Testing labels to get types") return_table = pd.DataFrame(columns=columns) - for idx, (key, value) in 
enumerate(all_tests.items()): + for idx, (label, format) in enumerate(formats.items()): if verbose: start_type = time() - return_table.loc[key] = [ - test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output) + return_table.loc[label] = [ + format.is_valid_label(col_name) for col_name in columns ] if verbose: display_logs_depending_process_time( - f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})', + f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})', time() - start_type, ) if verbose: @@ -148,23 +132,24 @@ def test_col_chunks( table: pd.DataFrame, file_path: str, analysis: dict, - all_tests: list, + formats: dict[str, Format], limited_output: bool, skipna: bool = True, verbose: bool = False, ) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]: def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]: + # returns a dict with the table's columns as keys and the list of remaining format labels to apply return { - col: [test for test in return_table.index if return_table.loc[test, col] > 0] + col: [fmt_label for fmt_label in return_table.index if return_table.loc[fmt_label, col] > 0] for col in return_table.columns } if verbose: start = time() - logging.info("Testing columns to get types on chunks") + logging.info("Testing columns to get formats on chunks") # analysing the sample to get a first guess - return_table = test_col(table, all_tests, limited_output, skipna=skipna, verbose=verbose) + return_table = test_col(table, formats, limited_output, skipna=skipna, verbose=verbose) remaining_tests_per_col = build_remaining_tests_per_col(return_table) # hashing rows to get nb_duplicates @@ -217,23 +202,22 @@ def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[ if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()): # no more potential tests to do on any column, early stop 
break - for col, tests in remaining_tests_per_col.items(): + for col, fmt_labels in remaining_tests_per_col.items(): # testing each column with the tests that are still competing # after previous batchs analyses - for test in tests: + for label in fmt_labels: batch_col_test = test_col_val( batch[col], - all_tests[test]["func"], - all_tests[test]["prop"], + formats[label], limited_output=limited_output, skipna=skipna, ) - return_table.loc[test, col] = ( + return_table.loc[label, col] = ( # if this batch's column tested 0 then test fails overall 0 if batch_col_test == 0 # otherwise updating the score with weighted average - else ((return_table.loc[test, col] * idx + batch_col_test) / (idx + 1)) + else ((return_table.loc[label, col] * idx + batch_col_test) / (idx + 1)) ) remaining_tests_per_col = build_remaining_tests_per_col(return_table) batch, batch_number = [], batch_number + 1 diff --git a/csv_detective/parsing/csv.py b/csv_detective/parsing/csv.py index c1e06995..501696f2 100755 --- a/csv_detective/parsing/csv.py +++ b/csv_detective/parsing/csv.py @@ -1,6 +1,6 @@ import logging from time import time -from typing import Optional, TextIO +from typing import TextIO import pandas as pd @@ -18,7 +18,7 @@ def parse_csv( skiprows: int, random_state: int = 42, verbose: bool = False, -) -> tuple[pd.DataFrame, Optional[int], Optional[int]]: +) -> tuple[pd.DataFrame, int | None, int | None]: if verbose: start = time() logging.info("Parsing table") diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 6dd1cc86..c779ccc3 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -2,13 +2,13 @@ import pandas as pd -from csv_detective.load_tests import return_all_tests +from csv_detective.format import FormatsManager from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val VALIDATION_CHUNK_SIZE = int(1e5) logging.basicConfig(level=logging.INFO) -tests = return_all_tests("ALL", "detect_fields") +formats = 
FormatsManager().formats def validate( @@ -101,8 +101,8 @@ def validate( continue test_result: float = test_col_val( serie=chunk[col_name], - test_func=tests[args["format"]]["func"], - proportion=tests[args["format"]]["prop"], + test_func=formats[args["format"]].func, + proportion=formats[args["format"]].proportion, skipna=skipna, ) if not bool(test_result): From 9e169d29a5d0dc0add11d736789f2928199790e0 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 28 Nov 2025 18:14:14 +0100 Subject: [PATCH 03/21] chore: lint --- csv_detective/detection/formats.py | 9 +-- csv_detective/format.py | 10 +-- csv_detective/formats/adresse.py | 4 +- csv_detective/formats/booleen.py | 4 +- csv_detective/formats/code_commune_insee.py | 4 +- csv_detective/formats/code_csp_insee.py | 7 +- csv_detective/formats/code_departement.py | 14 ++-- csv_detective/formats/code_fantoir.py | 10 +-- csv_detective/formats/code_import.py | 7 +- csv_detective/formats/code_postal.py | 18 ++--- csv_detective/formats/code_region.py | 12 +-- csv_detective/formats/code_rna.py | 26 +++---- csv_detective/formats/code_waldec.py | 7 +- csv_detective/formats/commune.py | 10 +-- csv_detective/formats/csp_insee.py | 10 +-- csv_detective/formats/date.py | 76 +++++++++---------- csv_detective/formats/date_fr.py | 7 +- csv_detective/formats/datetime_aware.py | 19 +++-- csv_detective/formats/datetime_naive.py | 29 ++++--- csv_detective/formats/datetime_rfc822.py | 24 +++--- csv_detective/formats/departement.py | 30 ++++---- csv_detective/formats/email.py | 26 +++---- csv_detective/formats/float.py | 4 +- csv_detective/formats/geojson.py | 26 +++---- csv_detective/formats/insee_ape700.py | 12 +-- csv_detective/formats/insee_canton.py | 12 +-- .../formats/iso_country_code_alpha2.py | 18 ++--- .../formats/iso_country_code_alpha3.py | 18 ++--- .../formats/iso_country_code_numeric.py | 18 ++--- csv_detective/formats/jour_de_la_semaine.py | 20 +++-- csv_detective/formats/json.py | 7 +- csv_detective/formats/latitude_l93.py | 
44 +++++------ csv_detective/formats/latitude_wgs.py | 46 +++++------ .../formats/latitude_wgs_fr_metropole.py | 46 +++++------ csv_detective/formats/latlon_wgs.py | 4 +- csv_detective/formats/longitude_l93.py | 26 +++---- csv_detective/formats/longitude_wgs.py | 26 +++---- .../formats/longitude_wgs_fr_metropole.py | 26 +++---- csv_detective/formats/lonlat_wgs.py | 4 +- csv_detective/formats/money.py | 7 +- csv_detective/formats/mongo_object_id.py | 7 +- csv_detective/formats/pays.py | 26 +++---- csv_detective/formats/percent.py | 7 +- csv_detective/formats/region.py | 26 +++---- csv_detective/formats/sexe.py | 7 +- csv_detective/formats/siren.py | 20 ++--- csv_detective/formats/tel_fr.py | 26 +++---- csv_detective/formats/uai.py | 36 ++++----- csv_detective/formats/url.py | 50 ++++++------ csv_detective/formats/username.py | 7 +- csv_detective/formats/uuid.py | 7 +- csv_detective/formats/year.py | 24 +++--- csv_detective/parsing/columns.py | 11 +-- 53 files changed, 481 insertions(+), 500 deletions(-) diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index 7c6e9a28..d83ed5aa 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -18,6 +18,7 @@ fmtm = FormatsManager() + def detect_formats( table: pd.DataFrame, analysis: dict, @@ -31,9 +32,7 @@ def detect_formats( # list testing to be performed formats: dict[str, Format] = ( - fmtm.get_formats_from_tags(tags) - if tags is not None - else fmtm.formats + fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats ) # if no testing then return @@ -70,9 +69,7 @@ def detect_formats( analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output) # Perform testing on labels - scores_table_labels = test_label( - analysis["header"], formats, limited_output, verbose=verbose - ) + scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose) analysis["columns_labels"] = 
prepare_output_dict(scores_table_labels, limited_output) # Multiply the results of the fields by 1 + 0.5 * the results of the labels. diff --git a/csv_detective/format.py b/csv_detective/format.py index c35ce99d..72e9a7eb 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -19,7 +19,7 @@ def __init__( self.labels: list[str] = labels self.proportion: float = proportion self.tags: list[str] = tags - + def is_valid_label(self, val: str) -> float: return header_score(val, self.labels) @@ -29,10 +29,8 @@ class FormatsManager: def __init__(self) -> None: import csv_detective.formats as formats - format_labels = [ - f for f in dir(formats) - if "_is" in dir(getattr(formats, f)) - ] + + format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))] assert len(format_labels) == len(set(format_labels)), "Format labels must be unique" self.formats = { label: Format( @@ -47,7 +45,7 @@ def __init__(self) -> None: ) for label in format_labels } - + def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]: # allowed to skip with -temp return { diff --git a/csv_detective/formats/adresse.py b/csv_detective/formats/adresse.py index 0cabdb6a..fae6ee47 100755 --- a/csv_detective/formats/adresse.py +++ b/csv_detective/formats/adresse.py @@ -111,6 +111,6 @@ def _is(val): _test_values = { - True: ["rue du martyr"], - False: ["un batiment"], + True: ["rue du martyr"], + False: ["un batiment"], } diff --git a/csv_detective/formats/booleen.py b/csv_detective/formats/booleen.py index 077d5658..f96343dd 100755 --- a/csv_detective/formats/booleen.py +++ b/csv_detective/formats/booleen.py @@ -29,6 +29,6 @@ def _is(val): _test_values = { - True: ["oui", "0", "1", "yes", "false", "True"], - False: ["nein", "ja", "2", "-0"], + True: ["oui", "0", "1", "yes", "false", "True"], + False: ["nein", "ja", "2", "-0"], } diff --git a/csv_detective/formats/code_commune_insee.py b/csv_detective/formats/code_commune_insee.py index 44f2b9f0..70d59d64 100755 --- 
a/csv_detective/formats/code_commune_insee.py +++ b/csv_detective/formats/code_commune_insee.py @@ -21,6 +21,6 @@ def _is(val): _test_values = { - True: ["91471", "01053"], - False: ["914712", "01000"], + True: ["91471", "01053"], + False: ["914712", "01000"], } diff --git a/csv_detective/formats/code_csp_insee.py b/csv_detective/formats/code_csp_insee.py index 954d3a17..b9c931b0 100755 --- a/csv_detective/formats/code_csp_insee.py +++ b/csv_detective/formats/code_csp_insee.py @@ -4,8 +4,7 @@ proportion = 1 tags = ["fr"] -labels = ["code csp insee", "code csp" -] +labels = ["code csp insee", "code csp"] def _is(val): @@ -32,6 +31,6 @@ def _is(val): _test_values = { - True: ["121f"], - False: ["121x"], + True: ["121f"], + False: ["121x"], } diff --git a/csv_detective/formats/code_departement.py b/csv_detective/formats/code_departement.py index c7823f3b..a9358a2e 100755 --- a/csv_detective/formats/code_departement.py +++ b/csv_detective/formats/code_departement.py @@ -3,11 +3,11 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "code departement", - "code_departement", - "dep", - "departement", - "dept", + "code departement", + "code_departement", + "dep", + "departement", + "dept", ] _options = Options( @@ -24,6 +24,6 @@ def _is(val): _test_values = { - True: ["75", "2A", "2b", "974", "01"], - False: ["00", "96", "101"], + True: ["75", "2A", "2b", "974", "01"], + False: ["00", "96", "101"], } diff --git a/csv_detective/formats/code_fantoir.py b/csv_detective/formats/code_fantoir.py index cfbe30a5..dc262fba 100755 --- a/csv_detective/formats/code_fantoir.py +++ b/csv_detective/formats/code_fantoir.py @@ -3,9 +3,9 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "cadastre1", - "code fantoir", - "fantoir", + "cadastre1", + "code fantoir", + "fantoir", ] _code_fantoir = CodeFantoir() @@ -16,6 +16,6 @@ def _is(val): _test_values = { - True: ["7755A", "B150B", "ZA04C", "ZB03D"], - False: ["7755", "ZA99A"], + True: ["7755A", "B150B", "ZA04C", "ZB03D"], + False: 
["7755", "ZA99A"], } diff --git a/csv_detective/formats/code_import.py b/csv_detective/formats/code_import.py index a49c5e29..c500f1d7 100755 --- a/csv_detective/formats/code_import.py +++ b/csv_detective/formats/code_import.py @@ -2,8 +2,7 @@ proportion = 0.9 tags = ["fr"] -labels = [ -] +labels = [] regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" @@ -13,6 +12,6 @@ def _is(val): _test_values = { - True: ["123S1871092288"], - False: ["AA751PEE00188854", "W123456789"], + True: ["123S1871092288"], + False: ["AA751PEE00188854", "W123456789"], } diff --git a/csv_detective/formats/code_postal.py b/csv_detective/formats/code_postal.py index 1608ebcb..4b0ec994 100755 --- a/csv_detective/formats/code_postal.py +++ b/csv_detective/formats/code_postal.py @@ -3,13 +3,13 @@ proportion = 0.9 tags = ["fr", "geo"] labels = [ - "code postal", - "postal code", - "postcode", - "post code", - "cp", - "codes postaux", - "location postcode", + "code postal", + "postal code", + "postcode", + "post code", + "cp", + "codes postaux", + "location postcode", ] _code_postal = CodePostal() @@ -20,6 +20,6 @@ def _is(val): _test_values = { - True: ["75020", "01000"], - False: ["77777", "018339"], + True: ["75020", "01000"], + False: ["77777", "018339"], } diff --git a/csv_detective/formats/code_region.py b/csv_detective/formats/code_region.py index 72552e9f..92aff448 100755 --- a/csv_detective/formats/code_region.py +++ b/csv_detective/formats/code_region.py @@ -3,10 +3,10 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "code region", - "reg", - "code insee region", - "region", + "code region", + "reg", + "code insee region", + "region", ] _code_region = CodeRegion(Millesime.LATEST) @@ -17,6 +17,6 @@ def _is(val): _test_values = { - True: ["32"], - False: ["55"], + True: ["32"], + False: ["55"], } diff --git a/csv_detective/formats/code_rna.py b/csv_detective/formats/code_rna.py index 32bf5c89..c31b7194 100755 --- a/csv_detective/formats/code_rna.py 
+++ b/csv_detective/formats/code_rna.py @@ -3,10 +3,10 @@ proportion = 0.9 tags = ["fr"] labels = [ - "code rna", - "rna", - "n° inscription association", - "identifiant association", + "code rna", + "rna", + "n° inscription association", + "identifiant association", ] _code_rna = CodeRNA() @@ -17,13 +17,13 @@ def _is(val): _test_values = { - True: ["W751515517"], - False: [ - "W111111111111111111111111111111111111", - "w143788974", - "W12", - "678W23456", - "165789325", - "Wa1#89sf&h", - ], + True: ["W751515517"], + False: [ + "W111111111111111111111111111111111111", + "w143788974", + "W12", + "678W23456", + "165789325", + "Wa1#89sf&h", + ], } diff --git a/csv_detective/formats/code_waldec.py b/csv_detective/formats/code_waldec.py index b078035b..354068ac 100755 --- a/csv_detective/formats/code_waldec.py +++ b/csv_detective/formats/code_waldec.py @@ -2,8 +2,7 @@ proportion = 0.9 tags = ["fr"] -labels = ["code waldec", "waldec" -] +labels = ["code waldec", "waldec"] _code_rna = CodeRNA() @@ -13,6 +12,6 @@ def _is(val): _test_values = { - True: ["W123456789", "W2D1234567"], - False: ["AA751PEE00188854"], + True: ["W123456789", "W2D1234567"], + False: ["AA751PEE00188854"], } diff --git a/csv_detective/formats/commune.py b/csv_detective/formats/commune.py index 05c2f1a1..9037305f 100755 --- a/csv_detective/formats/commune.py +++ b/csv_detective/formats/commune.py @@ -3,9 +3,9 @@ proportion = 0.8 tags = ["fr", "geo"] labels = [ - "commune", - "ville", - "libelle commune", + "commune", + "ville", + "libelle commune", ] _options = Options( @@ -22,6 +22,6 @@ def _is(val): _test_values = { - True: ["saint denis"], - False: ["new york", "lion"], + True: ["saint denis"], + False: ["new york", "lion"], } diff --git a/csv_detective/formats/csp_insee.py b/csv_detective/formats/csp_insee.py index ea5bffd0..709cba9b 100755 --- a/csv_detective/formats/csp_insee.py +++ b/csv_detective/formats/csp_insee.py @@ -5,9 +5,9 @@ proportion = 1 tags = ["fr"] labels = [ - "csp insee", - 
"csp", - "categorie socioprofessionnelle", + "csp insee", + "csp", + "categorie socioprofessionnelle", ] f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r") @@ -26,6 +26,6 @@ def _is(val): _test_values = { - True: ["employes de la poste"], - False: ["super-heros"], + True: ["employes de la poste"], + False: ["super-heros"], } diff --git a/csv_detective/formats/date.py b/csv_detective/formats/date.py index 5819259e..d1a9e38c 100755 --- a/csv_detective/formats/date.py +++ b/csv_detective/formats/date.py @@ -8,27 +8,28 @@ proportion = 1 tags = ["temp"] labels = [ - "date", - "jour", - "date de mise a jour", - "sns date", - "date maj", - "rem date", - "periode", - "date de publication", - "dpc", - "extract date", - "date immatriculation", - "date jeu donnees", - "datemaj", - "dateouv", - "date der maj", - "dmaj", - "jour", - "yyyymmdd", - "aaaammjj", + "date", + "jour", + "date de mise a jour", + "sns date", + "date maj", + "rem date", + "periode", + "date de publication", + "dpc", + "extract date", + "date immatriculation", + "date jeu donnees", + "datemaj", + "dateouv", + "date der maj", + "dmaj", + "jour", + "yyyymmdd", + "aaaammjj", ] + def date_casting(val: str) -> datetime | None: """For performance reasons, we try first with dateutil and fallback on dateparser""" try: @@ -58,7 +59,6 @@ def date_casting(val: str) -> datetime | None: ).replace("SEP", seps + "?") - def _is(val): # early stops, to cut processing time if not isinstance(val, str) or len(val) > 20 or len(val) < 8: @@ -82,22 +82,22 @@ def _is(val): _test_values = { - True: [ - "1960-08-07", - "12/02/2007", - "15 jan 1985", - "15 décembre 1985", - "02 05 2003", - "20030502", - "1993-12/02", - ], - False: [ - "1993-1993-1993", - "39-10-1993", - "19-15-1993", - "15 tambour 1985", - "12152003", - "20031512", - "02052003", - ], + True: [ + "1960-08-07", + "12/02/2007", + "15 jan 1985", + "15 décembre 1985", + "02 05 2003", + "20030502", + "1993-12/02", + ], + False: [ + "1993-1993-1993", + 
"39-10-1993", + "19-15-1993", + "15 tambour 1985", + "12152003", + "20031512", + "02052003", + ], } diff --git a/csv_detective/formats/date_fr.py b/csv_detective/formats/date_fr.py index e12ba32c..121cdfc5 100755 --- a/csv_detective/formats/date_fr.py +++ b/csv_detective/formats/date_fr.py @@ -2,8 +2,7 @@ proportion = 1 tags = ["fr", "temp"] -labels = ["date" -] +labels = ["date"] pattern = ( r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" @@ -16,6 +15,6 @@ def _is(val): _test_values = { - True: ["13 février 1996"], - False: ["44 march 2025"], + True: ["13 février 1996"], + False: ["44 march 2025"], } diff --git a/csv_detective/formats/datetime_aware.py b/csv_detective/formats/datetime_aware.py index 4414bf03..0e4cc8dd 100755 --- a/csv_detective/formats/datetime_aware.py +++ b/csv_detective/formats/datetime_aware.py @@ -4,8 +4,7 @@ proportion = 1 tags = ["temp"] -labels = [ -] +labels = [] threshold = 0.7 pat = ( @@ -35,12 +34,12 @@ def _is(val): _test_values = { - True: [ - "2021-06-22 10:20:10-04:00", - "2030-06-22 00:00:00.0028+02:00", - "2000-12-21 10:20:10.1Z", - "2024-12-19T10:53:36.428000+00:00", - "1996/06/22 10:20:10 GMT", - ], - False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], + True: [ + "2021-06-22 10:20:10-04:00", + "2030-06-22 00:00:00.0028+02:00", + "2000-12-21 10:20:10.1Z", + "2024-12-19T10:53:36.428000+00:00", + "1996/06/22 10:20:10 GMT", + ], + False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], } diff --git a/csv_detective/formats/datetime_naive.py b/csv_detective/formats/datetime_naive.py index 491de27b..e6132486 100755 --- a/csv_detective/formats/datetime_naive.py +++ b/csv_detective/formats/datetime_naive.py @@ -5,8 +5,7 @@ proportion = 1 tags = ["temp"] -labels = [ -] +labels = [] threshold = 0.7 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR @@ -33,17 +32,17 @@ def _is(val: Any | None) 
-> bool: _test_values = { - True: [ - "2021-06-22 10:20:10", - "2030/06-22 00:00:00", - "2030/06/22 00:00:00.0028", - ], - False: [ - "2021-06-22T30:20:10", - "Sun, 06 Nov 1994 08:49:37 GMT", - "2021-06-44 10:20:10+02:00", - "1999-12-01T00:00:00Z", - "2021-06-44", - "15 décembre 1985", - ], + True: [ + "2021-06-22 10:20:10", + "2030/06-22 00:00:00", + "2030/06/22 00:00:00.0028", + ], + False: [ + "2021-06-22T30:20:10", + "Sun, 06 Nov 1994 08:49:37 GMT", + "2021-06-44 10:20:10+02:00", + "1999-12-01T00:00:00Z", + "2021-06-44", + "15 décembre 1985", + ], } diff --git a/csv_detective/formats/datetime_rfc822.py b/csv_detective/formats/datetime_rfc822.py index d044c475..6011c60a 100755 --- a/csv_detective/formats/datetime_rfc822.py +++ b/csv_detective/formats/datetime_rfc822.py @@ -3,16 +3,16 @@ proportion = 1 tags = ["temp"] labels = [ - "datetime", - "timestamp", - "osm_timestamp", - "date", - "created at", - "last update", - "date maj", - "createdat", - "date naissance", - "date donnees", + "datetime", + "timestamp", + "osm_timestamp", + "date", + "created at", + "last update", + "date maj", + "createdat", + "date naissance", + "date donnees", ] @@ -29,6 +29,6 @@ def _is(val): _test_values = { - True: ["Sun, 06 Nov 1994 08:49:37 GMT"], - False: ["2021-06-22T10:20:10"], + True: ["Sun, 06 Nov 1994 08:49:37 GMT"], + False: ["2021-06-22T10:20:10"], } diff --git a/csv_detective/formats/departement.py b/csv_detective/formats/departement.py index e2c4930b..e25d2224 100755 --- a/csv_detective/formats/departement.py +++ b/csv_detective/formats/departement.py @@ -3,19 +3,19 @@ proportion = 0.9 tags = ["fr", "geo"] labels = [ - "departement", - "libelle du departement", - "deplib", - "nom dept", - "dept", - "libdepartement", - "nom departement", - "libelle dep", - "libelle departement", - "lb departements", - "dep libusage", - "lb departement", - "nom dep", + "departement", + "libelle du departement", + "deplib", + "nom dept", + "dept", + "libdepartement", + "nom departement", + 
"libelle dep", + "libelle departement", + "lb departements", + "dep libusage", + "lb departement", + "nom dep", ] _options = Options( @@ -32,6 +32,6 @@ def _is(val): _test_values = { - True: ["essonne"], - False: ["alabama", "auvergne"], + True: ["essonne"], + False: ["alabama", "auvergne"], } diff --git a/csv_detective/formats/email.py b/csv_detective/formats/email.py index 329567f3..fc14a9db 100755 --- a/csv_detective/formats/email.py +++ b/csv_detective/formats/email.py @@ -3,17 +3,17 @@ proportion = 0.9 tags = [] labels = [ - "email", - "mail", - "courriel", - "contact", - "mel", - "lieucourriel", - "coordinates.emailcontact", - "e mail", - "mo mail", - "adresse mail", - "adresse email", + "email", + "mail", + "courriel", + "contact", + "mel", + "lieucourriel", + "coordinates.emailcontact", + "e mail", + "mo mail", + "adresse mail", + "adresse email", ] @@ -24,6 +24,6 @@ def _is(val): _test_values = { - True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"], - False: ["cdo@@gouv.sfd"], + True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"], + False: ["cdo@@gouv.sfd"], } diff --git a/csv_detective/formats/float.py b/csv_detective/formats/float.py index b903e2be..f5c878a5 100755 --- a/csv_detective/formats/float.py +++ b/csv_detective/formats/float.py @@ -24,6 +24,6 @@ def _is(val): _test_values = { - True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], - False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], + True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], + False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], } diff --git a/csv_detective/formats/geojson.py b/csv_detective/formats/geojson.py index 574c4168..48d28712 100755 --- a/csv_detective/formats/geojson.py +++ b/csv_detective/formats/geojson.py @@ -3,14 +3,14 @@ proportion = 1 tags = ["geo"] labels = [ - "json geojson", - "json", - "geojson", - "geo shape", - "geom", - "geometry", - "geo shape", - "geoshape", + "json 
geojson", + "json", + "geojson", + "geo shape", + "geom", + "geometry", + "geo shape", + "geoshape", ] @@ -28,9 +28,9 @@ def _is(val) -> bool: _test_values = { - True: [ - '{"coordinates": [45.783753, 3.049342], "type": "63870"}', - '{"geometry": {"coordinates": [45.783753, 3.049342]}}', - ], - False: ['{"pomme": "fruit", "reponse": 42}'], + True: [ + '{"coordinates": [45.783753, 3.049342], "type": "63870"}', + '{"geometry": {"coordinates": [45.783753, 3.049342]}}', + ], + False: ['{"pomme": "fruit", "reponse": 42}'], } diff --git a/csv_detective/formats/insee_ape700.py b/csv_detective/formats/insee_ape700.py index e2b16201..193fd963 100755 --- a/csv_detective/formats/insee_ape700.py +++ b/csv_detective/formats/insee_ape700.py @@ -5,12 +5,12 @@ proportion = 0.8 tags = ["fr"] labels = [ - "code ape", - "code activite (ape)", - "code naf", - "code naf organisme designe", - "code naf organisme designant", - "base sirene : code ape de l'etablissement siege", + "code ape", + "code activite (ape)", + "code naf", + "code naf organisme designe", + "code naf organisme designant", + "base sirene : code ape de l'etablissement siege", ] f = open(join(dirname(__file__), "data", "insee_ape700.txt"), "r") diff --git a/csv_detective/formats/insee_canton.py b/csv_detective/formats/insee_canton.py index 72c470da..cc622b2f 100755 --- a/csv_detective/formats/insee_canton.py +++ b/csv_detective/formats/insee_canton.py @@ -3,10 +3,10 @@ proportion = 0.9 tags = ["fr", "geo"] labels = [ - "insee canton", - "canton", - "cant", - "nom canton", + "insee canton", + "canton", + "cant", + "nom canton", ] _options = Options( @@ -23,6 +23,6 @@ def _is(val): _test_values = { - True: ["nantua"], - False: ["california"], + True: ["nantua"], + False: ["california"], } diff --git a/csv_detective/formats/iso_country_code_alpha2.py b/csv_detective/formats/iso_country_code_alpha2.py index 2633cff3..ce7f5e4d 100755 --- a/csv_detective/formats/iso_country_code_alpha2.py +++ 
b/csv_detective/formats/iso_country_code_alpha2.py @@ -4,13 +4,13 @@ proportion = 1 tags = ["geo"] labels = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", ] with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile: @@ -25,6 +25,6 @@ def _is(val): _test_values = { - True: ["FR"], - False: ["XX", "A", "FRA"], + True: ["FR"], + False: ["XX", "A", "FRA"], } diff --git a/csv_detective/formats/iso_country_code_alpha3.py b/csv_detective/formats/iso_country_code_alpha3.py index 50f745db..c44b6d1d 100755 --- a/csv_detective/formats/iso_country_code_alpha3.py +++ b/csv_detective/formats/iso_country_code_alpha3.py @@ -4,13 +4,13 @@ proportion = 1 tags = ["geo"] labels = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", ] with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile: @@ -25,6 +25,6 @@ def _is(val): _test_values = { - True: ["FRA"], - False: ["XXX", "FR", "A"], + True: ["FRA"], + False: ["XXX", "FR", "A"], } diff --git a/csv_detective/formats/iso_country_code_numeric.py b/csv_detective/formats/iso_country_code_numeric.py index 32c68d3b..ca2d298a 100755 --- a/csv_detective/formats/iso_country_code_numeric.py +++ b/csv_detective/formats/iso_country_code_numeric.py @@ -4,13 +4,13 @@ proportion = 1 tags = ["geo"] labels = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", ] with open(join(dirname(__file__), "data", "iso_country_code_numeric.txt"), "r") as iofile: @@ -26,6 +26,6 @@ def _is(val): _test_values = { 
- True: ["250"], - False: ["003"], + True: ["250"], + False: ["003"], } diff --git a/csv_detective/formats/jour_de_la_semaine.py b/csv_detective/formats/jour_de_la_semaine.py index 2f09ac5b..99de99ae 100755 --- a/csv_detective/formats/jour_de_la_semaine.py +++ b/csv_detective/formats/jour_de_la_semaine.py @@ -1,15 +1,13 @@ - - proportion = 0.8 tags = ["fr", "temp"] labels = [ - "jour semaine", - "type jour", - "jour de la semaine", - "saufjour", - "nomjour", - "jour", - "jour de fermeture", + "jour semaine", + "type jour", + "jour de la semaine", + "saufjour", + "nomjour", + "jour", + "jour de fermeture", ] jours = { @@ -38,6 +36,6 @@ def _is(val): _test_values = { - True: ["lundi"], - False: ["jour de la biere"], + True: ["lundi"], + False: ["jour de la biere"], } diff --git a/csv_detective/formats/json.py b/csv_detective/formats/json.py index d7f09722..3c0e249f 100755 --- a/csv_detective/formats/json.py +++ b/csv_detective/formats/json.py @@ -3,8 +3,7 @@ proportion = 1 tags = [] -labels = [ -] +labels = [] def _is(val): @@ -17,6 +16,6 @@ def _is(val): _test_values = { - True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], - False: ["5", '{"zefib":', '{"a"}'], + True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], + False: ["5", '{"zefib":', '{"a"}'], } diff --git a/csv_detective/formats/latitude_l93.py b/csv_detective/formats/latitude_l93.py index 475cc112..069f1737 100755 --- a/csv_detective/formats/latitude_l93.py +++ b/csv_detective/formats/latitude_l93.py @@ -6,26 +6,26 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "y l93", - "coordonnee y", - "latitude lb93", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", + "latitude", + "lat", + "y", + "yf", + "yd", + "y l93", + "coordonnee y", + "latitude lb93", + "coord y", + "ycoord", + 
"geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", ] _latitudel93 = LatitudeL93() @@ -43,6 +43,6 @@ def _is(val): _test_values = { - True: ["6037008", "7123528.5", "7124528,5"], - False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], + True: ["6037008", "7123528.5", "7124528,5"], + False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], } diff --git a/csv_detective/formats/latitude_wgs.py b/csv_detective/formats/latitude_wgs.py index ca3e700f..7ff9f116 100755 --- a/csv_detective/formats/latitude_wgs.py +++ b/csv_detective/formats/latitude_wgs.py @@ -3,27 +3,27 @@ proportion = 1 tags = ["geo"] labels = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", ] @@ -37,6 +37,6 @@ def _is(val): _test_values = { - True: ["43.2", "-22"], - False: ["100"], + True: ["43.2", "-22"], + False: ["100"], } diff --git a/csv_detective/formats/latitude_wgs_fr_metropole.py b/csv_detective/formats/latitude_wgs_fr_metropole.py index 1c77d04a..de98d047 100755 --- a/csv_detective/formats/latitude_wgs_fr_metropole.py +++ b/csv_detective/formats/latitude_wgs_fr_metropole.py @@ -3,27 +3,27 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - 
"ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", ] @@ -37,6 +37,6 @@ def _is(val): _test_values = { - True: ["42.5"], - False: ["22.5", "62.5"], + True: ["42.5"], + False: ["22.5", "62.5"], } diff --git a/csv_detective/formats/latlon_wgs.py b/csv_detective/formats/latlon_wgs.py index 1d535c62..d0358281 100755 --- a/csv_detective/formats/latlon_wgs.py +++ b/csv_detective/formats/latlon_wgs.py @@ -48,6 +48,6 @@ def _is(val): _test_values = { - True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], - False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], + True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], + False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], } diff --git a/csv_detective/formats/longitude_l93.py b/csv_detective/formats/longitude_l93.py index 8172a771..86f13e13 100755 --- a/csv_detective/formats/longitude_l93.py +++ b/csv_detective/formats/longitude_l93.py @@ -6,17 +6,17 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", ] _longitudel93 = LongitudeL93() @@ -34,6 +34,6 @@ def _is(val): _test_values = { - True: ["0", "-154", "1265783,45", "34723.4"], - False: ["1456669.8", "-776225", 
"346_3214"], + True: ["0", "-154", "1265783,45", "34723.4"], + False: ["1456669.8", "-776225", "346_3214"], } diff --git a/csv_detective/formats/longitude_wgs.py b/csv_detective/formats/longitude_wgs.py index c2de9425..f953edd7 100755 --- a/csv_detective/formats/longitude_wgs.py +++ b/csv_detective/formats/longitude_wgs.py @@ -3,17 +3,17 @@ proportion = 1 tags = ["geo"] labels = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", ] @@ -27,6 +27,6 @@ def _is(val): _test_values = { - True: ["120", "-20.2"], - False: ["-200"], + True: ["120", "-20.2"], + False: ["-200"], } diff --git a/csv_detective/formats/longitude_wgs_fr_metropole.py b/csv_detective/formats/longitude_wgs_fr_metropole.py index 7145abb3..20560e8c 100755 --- a/csv_detective/formats/longitude_wgs_fr_metropole.py +++ b/csv_detective/formats/longitude_wgs_fr_metropole.py @@ -3,17 +3,17 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", ] @@ -27,6 +27,6 @@ def _is(val): _test_values = { - True: ["-2.5"], - False: ["12.8"], + True: ["-2.5"], + False: ["12.8"], } diff --git a/csv_detective/formats/lonlat_wgs.py b/csv_detective/formats/lonlat_wgs.py index 54de3709..c8bcd80d 100755 --- a/csv_detective/formats/lonlat_wgs.py +++ b/csv_detective/formats/lonlat_wgs.py @@ -31,6 +31,6 @@ def _is(val): _test_values = { - True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], - False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], + True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], + 
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], } diff --git a/csv_detective/formats/money.py b/csv_detective/formats/money.py index 6539fcd4..d76855bc 100755 --- a/csv_detective/formats/money.py +++ b/csv_detective/formats/money.py @@ -1,8 +1,7 @@ from .float import _is as is_float proportion = 0.8 -labels = ["budget", "salaire", "euro", "euros", "prêt", "montant" -] +labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"] currencies = {"€", "$", "£", "¥"} @@ -14,6 +13,6 @@ def _is(val): _test_values = { - True: ["120€", "-20.2$"], - False: ["200", "100 euros"], + True: ["120€", "-20.2$"], + False: ["200", "100 euros"], } diff --git a/csv_detective/formats/mongo_object_id.py b/csv_detective/formats/mongo_object_id.py index bff75806..43c4ceb9 100755 --- a/csv_detective/formats/mongo_object_id.py +++ b/csv_detective/formats/mongo_object_id.py @@ -1,8 +1,7 @@ import re proportion = 0.8 -labels = ["id", "objectid" -] +labels = ["id", "objectid"] def _is(val): @@ -10,6 +9,6 @@ def _is(val): _test_values = { - True: ["62320e50f981bc2b57bcc044"], - False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], + True: ["62320e50f981bc2b57bcc044"], + False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], } diff --git a/csv_detective/formats/pays.py b/csv_detective/formats/pays.py index edb139f7..e52d778b 100755 --- a/csv_detective/formats/pays.py +++ b/csv_detective/formats/pays.py @@ -3,17 +3,17 @@ proportion = 0.6 tags = ["fr", "geo"] labels = [ - "pays", - "payslieu", - "paysorg", - "country", - "pays lib", - "lieupays", - "pays beneficiaire", - "nom du pays", - "journey start country", - "libelle pays", - "journey end country", + "pays", + "payslieu", + "paysorg", + "country", + "pays lib", + "lieupays", + "pays beneficiaire", + "nom du pays", + "journey start country", + "libelle pays", + "journey end country", ] _options = Options( @@ -30,6 +30,6 @@ def _is(val): _test_values = { - True: ["france", "italie"], - 
False: ["amerique", "paris"], + True: ["france", "italie"], + False: ["amerique", "paris"], } diff --git a/csv_detective/formats/percent.py b/csv_detective/formats/percent.py index 1f0c0231..0e26ea53 100755 --- a/csv_detective/formats/percent.py +++ b/csv_detective/formats/percent.py @@ -2,8 +2,7 @@ proportion = 0.8 tags = [] -labels = [ -] +labels = [] def _is(val): @@ -13,6 +12,6 @@ def _is(val): _test_values = { - True: ["120%", "-20.2%"], - False: ["200", "100 pourcents"], + True: ["120%", "-20.2%"], + False: ["200", "100 pourcents"], } diff --git a/csv_detective/formats/region.py b/csv_detective/formats/region.py index 0300ffbf..f336d6fc 100755 --- a/csv_detective/formats/region.py +++ b/csv_detective/formats/region.py @@ -3,17 +3,17 @@ proportion = 1 tags = ["fr", "geo"] labels = [ - "region", - "libelle region", - "nom region", - "libelle reg", - "nom reg", - "reg libusage", - "nom de la region", - "regionorg", - "regionlieu", - "reg", - "nom officiel region", + "region", + "libelle region", + "nom region", + "libelle reg", + "nom reg", + "reg libusage", + "nom de la region", + "regionorg", + "regionlieu", + "reg", + "nom officiel region", ] _extra_valid_values_set = frozenset( @@ -65,6 +65,6 @@ def _is(val): _test_values = { - True: ["bretagne", "ile-de-france"], - False: ["baviere", "overgne"], + True: ["bretagne", "ile-de-france"], + False: ["baviere", "overgne"], } diff --git a/csv_detective/formats/sexe.py b/csv_detective/formats/sexe.py index cab962b8..38c61084 100755 --- a/csv_detective/formats/sexe.py +++ b/csv_detective/formats/sexe.py @@ -2,8 +2,7 @@ proportion = 1 tags = ["fr"] -labels = ["sexe", "sex", "civilite", "genre", "id sexe" -] +labels = ["sexe", "sex", "civilite", "genre", "id sexe"] def _is(val): @@ -14,6 +13,6 @@ def _is(val): _test_values = { - True: ["hfemme", "H"], - False: ["adulte"], + True: ["hfemme", "H"], + False: ["adulte"], } diff --git a/csv_detective/formats/siren.py b/csv_detective/formats/siren.py index 6d73c169..175c948f 
100755 --- a/csv_detective/formats/siren.py +++ b/csv_detective/formats/siren.py @@ -3,14 +3,14 @@ proportion = 0.9 tags = ["fr"] labels = [ - "siren", - "siren organisme designe", - "siren organisme designant", - "n° siren", - "siren organisme", - "siren titulaire", - "numero siren", - "epci", + "siren", + "siren organisme designe", + "siren organisme designant", + "n° siren", + "siren organisme", + "siren titulaire", + "numero siren", + "epci", ] @@ -32,6 +32,6 @@ def _is(val): _test_values = { - True: ["552 100 554", "552100554"], - False: ["42"], + True: ["552 100 554", "552100554"], + False: ["42"], } diff --git a/csv_detective/formats/tel_fr.py b/csv_detective/formats/tel_fr.py index a09fb11b..bf5028a7 100755 --- a/csv_detective/formats/tel_fr.py +++ b/csv_detective/formats/tel_fr.py @@ -3,17 +3,17 @@ proportion = 0.7 tags = ["fr"] labels = [ - "telephone", - "tel", - "tel1", - "tel2", - "phone", - "num tel", - "tel mob", - "telephone sav", - "telephone1", - "coordinates.phone", - "telephone du lieu", + "telephone", + "tel", + "tel1", + "tel2", + "phone", + "num tel", + "tel mob", + "telephone sav", + "telephone1", + "coordinates.phone", + "telephone du lieu", ] @@ -31,6 +31,6 @@ def _is(val): _test_values = { - True: ["0134643467"], - False: ["6625388263", "01288398"], + True: ["0134643467"], + False: ["6625388263", "01288398"], } diff --git a/csv_detective/formats/uai.py b/csv_detective/formats/uai.py index 145f8021..f7dcf6d7 100755 --- a/csv_detective/formats/uai.py +++ b/csv_detective/formats/uai.py @@ -3,22 +3,22 @@ proportion = 0.8 tags = ["fr"] labels = [ - "uai", - "code etablissement", - "code uai", - "uai - identifiant", - "numero uai", - "rne", - "numero de l'etablissement", - "code rne", - "codeetab", - "code uai de l'etablissement", - "ref uai", - "cd rne", - "numerouai", - "numero d etablissement", - "code etablissement", - "numero etablissement", + "uai", + "code etablissement", + "code uai", + "uai - identifiant", + "numero uai", + "rne", + 
"numero de l'etablissement", + "code rne", + "codeetab", + "code uai de l'etablissement", + "ref uai", + "cd rne", + "numerouai", + "numero d etablissement", + "code etablissement", + "numero etablissement", ] @@ -31,6 +31,6 @@ def _is(val): _test_values = { - True: ["0422170F"], - False: ["04292E"], + True: ["0422170F"], + False: ["04292E"], } diff --git a/csv_detective/formats/url.py b/csv_detective/formats/url.py index eec411a3..10f0ac3f 100755 --- a/csv_detective/formats/url.py +++ b/csv_detective/formats/url.py @@ -3,20 +3,20 @@ proportion = 1 tags = [] labels = [ - "url", - "url source", - "site web", - "source url", - "site internet", - "remote url", - "web", - "site", - "lien", - "site data", - "lien url", - "lien vers le fichier", - "sitweb", - "interneturl", + "url", + "url source", + "site web", + "source url", + "site internet", + "remote url", + "web", + "site", + "lien", + "site data", + "lien url", + "lien vers le fichier", + "sitweb", + "interneturl", ] pattern = re.compile( @@ -32,15 +32,15 @@ def _is(val): _test_values = { - True: [ - "www.data.gouv.fr", - "http://data.gouv.fr", - "https://www.youtube.com/@data-gouv-fr", - ( - "https://tabular-api.data.gouv.fr/api/resources/" - "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" - "?score__greater=0.9&decompte__exact=13" - ), - ], - False: ["tmp@data.gouv.fr"], + True: [ + "www.data.gouv.fr", + "http://data.gouv.fr", + "https://www.youtube.com/@data-gouv-fr", + ( + "https://tabular-api.data.gouv.fr/api/resources/" + "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" + "?score__greater=0.9&decompte__exact=13" + ), + ], + False: ["tmp@data.gouv.fr"], } diff --git a/csv_detective/formats/username.py b/csv_detective/formats/username.py index cab1240b..f93b3cf5 100755 --- a/csv_detective/formats/username.py +++ b/csv_detective/formats/username.py @@ -2,8 +2,7 @@ proportion = 1 tags = [] -labels = ["account", "username", "user" -] +labels = ["account", "username", "user"] def _is(val): @@ -11,6 +10,6 @@ def 
_is(val): _test_values = { - True: ["@accueil1"], - False: ["adresse@mail"], + True: ["@accueil1"], + False: ["adresse@mail"], } diff --git a/csv_detective/formats/uuid.py b/csv_detective/formats/uuid.py index 4d4aaf6d..47871d58 100755 --- a/csv_detective/formats/uuid.py +++ b/csv_detective/formats/uuid.py @@ -2,8 +2,7 @@ proportion = 0.8 tags = [] -labels = [ -] +labels = [] def _is(val) -> bool: @@ -13,6 +12,6 @@ def _is(val) -> bool: _test_values = { - True: ["884762be-51f3-44c3-b811-1e14c5d89262"], - False: ["0610928327"], + True: ["884762be-51f3-44c3-b811-1e14c5d89262"], + False: ["0610928327"], } diff --git a/csv_detective/formats/year.py b/csv_detective/formats/year.py index b3a1bf20..4de3dd3c 100755 --- a/csv_detective/formats/year.py +++ b/csv_detective/formats/year.py @@ -1,16 +1,16 @@ proportion = 1 tags = ["temp"] labels = [ - "year", - "annee", - "annee depot", - "an nais", - "exercice", - "data year", - "annee de publication", - "exercice comptable", - "annee de naissance", - "annee ouverture", + "year", + "annee", + "annee depot", + "an nais", + "exercice", + "data year", + "annee de publication", + "exercice comptable", + "annee de naissance", + "annee ouverture", ] @@ -23,6 +23,6 @@ def _is(val): _test_values = { - True: ["2015"], - False: ["20166", "123"], + True: ["2015"], + False: ["20166", "123"], } diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 0a55ed3d..9e253541 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -112,10 +112,7 @@ def test_label( for idx, (label, format) in enumerate(formats.items()): if verbose: start_type = time() - return_table.loc[label] = [ - format.is_valid_label(col_name) - for col_name in columns - ] + return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns] if verbose: display_logs_depending_process_time( f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})', @@ -140,7 
+137,11 @@ def test_col_chunks( def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]: # returns a dict with the table's columns as keys and the list of remaining format labels to apply return { - col: [fmt_label for fmt_label in return_table.index if return_table.loc[fmt_label, col] > 0] + col: [ + fmt_label + for fmt_label in return_table.index + if return_table.loc[fmt_label, col] > 0 + ] for col in return_table.columns } From 4ef3ebe7d4928734b51fe66e736b500373993cc6 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Fri, 28 Nov 2025 18:15:14 +0100 Subject: [PATCH 04/21] chore: lint --- csv_detective/formats/__init__.py | 2 +- csv_detective/formats/lonlat_wgs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csv_detective/formats/__init__.py b/csv_detective/formats/__init__.py index 105e3b43..88bdf47e 100755 --- a/csv_detective/formats/__init__.py +++ b/csv_detective/formats/__init__.py @@ -1,5 +1,5 @@ -import os import importlib +import os for file in os.listdir(os.path.dirname(__file__)): if file.endswith(".py") and not file.startswith("_"): diff --git a/csv_detective/formats/lonlat_wgs.py b/csv_detective/formats/lonlat_wgs.py index c8bcd80d..d1c87215 100755 --- a/csv_detective/formats/lonlat_wgs.py +++ b/csv_detective/formats/lonlat_wgs.py @@ -1,6 +1,6 @@ from .latitude_wgs import _is as is_lat -from .longitude_wgs import _is as is_lon from .latlon_wgs import COMMON_COORDS_LABELS +from .longitude_wgs import _is as is_lon proportion = 1 tags = ["geo"] From 58ad98e9ef53f1ddf519e63c18ddb554af75ab2c Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 13:47:58 +0100 Subject: [PATCH 05/21] fix: typos in formats --- csv_detective/formats/code_waldec.py | 6 +++--- csv_detective/formats/date_fr.py | 10 ++++++---- csv_detective/formats/sexe.py | 5 ++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/csv_detective/formats/code_waldec.py b/csv_detective/formats/code_waldec.py index 
354068ac..bca1dfff 100755 --- a/csv_detective/formats/code_waldec.py +++ b/csv_detective/formats/code_waldec.py @@ -1,14 +1,14 @@ -from frformat import CodeRNA +import re proportion = 0.9 tags = ["fr"] labels = ["code waldec", "waldec"] -_code_rna = CodeRNA() +regex = r"^W\d[\dA-Z]\d{7}$" def _is(val): - return isinstance(val, str) and _code_rna.is_valid(val) + return isinstance(val, str) and bool(re.match(regex, val)) _test_values = { diff --git a/csv_detective/formats/date_fr.py b/csv_detective/formats/date_fr.py index 121cdfc5..b158a2f2 100755 --- a/csv_detective/formats/date_fr.py +++ b/csv_detective/formats/date_fr.py @@ -1,20 +1,22 @@ import re +from ..parsing.text import _process_text + proportion = 1 tags = ["fr", "temp"] labels = ["date"] pattern = ( - r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" - r"|octobre|novembre|decembre)[ \-]\d{4}$" + r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" + r"|octobre|novembre|decembre)[ \-/]\d{4}$" ) def _is(val): - return isinstance(val, str) and bool(re.match(pattern, val)) + return isinstance(val, str) and bool(re.match(pattern, _process_text(val))) _test_values = { - True: ["13 février 1996"], + True: ["13 février 1996", "15 decembre 2024"], False: ["44 march 2025"], } diff --git a/csv_detective/formats/sexe.py b/csv_detective/formats/sexe.py index 38c61084..f86e5791 100755 --- a/csv_detective/formats/sexe.py +++ b/csv_detective/formats/sexe.py @@ -8,11 +8,10 @@ def _is(val): if not isinstance(val, str): return False - val = _process_text(val) - return val in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} + return _process_text(val) in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} _test_values = { - True: ["hfemme", "H"], + True: ["femme", "H"], False: ["adulte"], } From cbde3d7a1bf04d047d043b00b6c5628c2c57ec60 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 13:48:24 +0100 Subject: [PATCH 06/21] chore: adapt 
tests on fields --- tests/test_fields.py | 402 ++++--------------------------------------- 1 file changed, 38 insertions(+), 364 deletions(-) diff --git a/tests/test_fields.py b/tests/test_fields.py index 3bef2cfc..4b38f433 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -6,87 +6,21 @@ import pytest from numpy import random -from csv_detective.detect_fields.FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from csv_detective.detect_fields.FR.other import ( - code_csp_insee, - code_import, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee -from csv_detective.detect_fields.geo import ( - iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from csv_detective.detect_fields.other import ( - booleen, - email, - json, - money, - mongo_object_id, - percent, - twitter, - url, - uuid, -) -from csv_detective.detect_fields.other import ( - float as test_float, -) -from csv_detective.detect_fields.other import ( - int as test_int, -) -from csv_detective.detect_fields.temp import ( - date, - datetime_aware, - datetime_naive, - datetime_rfc822, - year, -) from csv_detective.detection.variables import ( detect_categorical_variable, detect_continuous_variable, ) -from csv_detective.load_tests import return_all_tests +from csv_detective.format import FormatsManager from csv_detective.output.dataframe import cast from csv_detective.output.utils import prepare_output_dict from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it +fmtm = FormatsManager() -def 
test_all_tests_return_bool(): - all_tests = return_all_tests("ALL", "detect_fields") - for attr in all_tests.values(): +def test_all_format_funcs_return_bool(): + for format in fmtm.formats.values(): for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]: - assert isinstance(attr["func"](tmp), bool) + assert isinstance(format.func(tmp), bool) # categorical @@ -124,292 +58,37 @@ def test_detect_continuous_variable(): assert res2.values and res2.values[0] == "cont" -fields = { - adresse: { - True: ["rue du martyr"], - False: ["un batiment"], - }, - code_commune_insee: { - True: ["91471", "01053"], - False: ["914712", "01000"], - }, - code_departement: { - True: ["75", "2A", "2b", "974", "01"], - False: ["00", "96", "101"], - }, - code_fantoir: { - True: ["7755A", "B150B", "ZA04C", "ZB03D"], - False: ["7755", "ZA99A"], - }, - code_postal: { - True: ["75020", "01000"], - False: ["77777", "018339"], - }, - code_region: { - True: ["32"], - False: ["55"], - }, - commune: { - True: ["saint denis"], - False: ["new york", "lion"], - }, - departement: { - True: ["essonne"], - False: ["alabama", "auvergne"], - }, - insee_canton: { - True: ["nantua"], - False: ["california"], - }, - latitude_l93: { - True: ["6037008", "7123528.5", "7124528,5"], - False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], - }, - longitude_l93: { - True: ["0", "-154", "1265783,45", "34723.4"], - False: ["1456669.8", "-776225", "346_3214"], - }, - latitude_wgs_fr_metropole: { - True: ["42.5"], - False: ["22.5", "62.5"], - }, - longitude_wgs_fr_metropole: { - True: ["-2.5"], - False: ["12.8"], - }, - pays: { - True: ["france", "italie"], - False: ["amerique", "paris"], - }, - region: { - True: ["bretagne", "ile-de-france"], - False: ["baviere", "overgne"], - }, - code_csp_insee: { - True: ["121f"], - False: ["121x"], - }, - code_rna: { - True: ["W751515517"], - False: [ - "W111111111111111111111111111111111111", - "w143788974", - "W12", - "678W23456", - "165789325", - 
"Wa1#89sf&h", - ], - }, - code_import: { - True: ["123S1871092288"], - False: ["AA751PEE00188854", "W123456789"], - }, - code_waldec: { - True: ["W123456789", "W2D1234567"], - False: ["AA751PEE00188854"], - }, - csp_insee: { - True: ["employes de la poste"], - False: ["super-heros"], - }, - sexe: { - True: ["homme"], - False: ["hermaphrodite"], - }, - siren: { - True: ["552 100 554", "552100554"], - False: ["42"], - }, - siret: { - True: ["13002526500013", "130 025 265 00013"], - False: ["13002526500012"], - }, - uai: { - True: ["0422170F"], - False: ["04292E"], - }, - date_fr: { - True: ["13 fevrier 1996"], - False: ["44 march 2025"], - }, - insee_ape700: {True: ["0116Z"], False: ["0116A"]}, - tel_fr: { - True: ["0134643467"], - False: ["6625388263", "01288398"], - }, - jour_de_la_semaine: { - True: ["lundi"], - False: ["jour de la biere"], - }, - mois_de_annee: { - True: ["juin", "décembre"], - False: ["november"], - }, - iso_country_code_alpha2: { - True: ["FR"], - False: ["XX", "A", "FRA"], - }, - iso_country_code_alpha3: { - True: ["FRA"], - False: ["XXX", "FR", "A"], - }, - iso_country_code_numeric: { - True: ["250"], - False: ["003"], - }, - json_geojson: { - True: [ - '{"coordinates": [45.783753, 3.049342], "type": "63870"}', - '{"geometry": {"coordinates": [45.783753, 3.049342]}}', - ], - False: ['{"pomme": "fruit", "reponse": 42}'], - }, - latitude_wgs: { - True: ["43.2", "-22"], - False: ["100"], - }, - latlon_wgs: { - True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], - False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], - }, - longitude_wgs: { - True: ["120", "-20.2"], - False: ["-200"], - }, - lonlat_wgs: { - True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], - False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], - }, - booleen: { - True: ["oui", "0", "1", "yes", "false", "True"], - False: ["nein", "ja", "2", "-0"], - }, - email: { - True: ["cdo_intern@data.gouv.fr", 
"P.NOM@CIE.LONGDOMAIN"], - False: ["cdo@@gouv.sfd"], - }, - json: { - True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], - False: ["5", '{"zefib":', '{"a"}'], - }, - money: { - True: ["120€", "-20.2$"], - False: ["200", "100 euros"], - }, - mongo_object_id: { - True: ["62320e50f981bc2b57bcc044"], - False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], - }, - percent: { - True: ["120%", "-20.2%"], - False: ["200", "100 pourcents"], - }, - twitter: { - True: ["@accueil1"], - False: ["adresse@mail"], - }, - url: { - True: [ - "www.data.gouv.fr", - "http://data.gouv.fr", - "https://www.youtube.com/@data-gouv-fr", - ( - "https://tabular-api.data.gouv.fr/api/resources/" - "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" - "?score__greater=0.9&decompte__exact=13" - ), - ], - False: ["tmp@data.gouv.fr"], - }, - uuid: { - True: ["884762be-51f3-44c3-b811-1e14c5d89262"], - False: ["0610928327"], - }, - test_int: { - True: ["1", "0", "1764", "-24"], - False: ["01053", "1.2", "123_456", "+35"], - }, - test_float: { - True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], - False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], - }, - date: { - True: [ - "1960-08-07", - "12/02/2007", - "15 jan 1985", - "15 décembre 1985", - "02 05 2003", - "20030502", - "1993-12/02", - ], - False: [ - "1993-1993-1993", - "39-10-1993", - "19-15-1993", - "15 tambour 1985", - "12152003", - "20031512", - "02052003", - ], - }, - datetime_aware: { - True: [ - "2021-06-22 10:20:10-04:00", - "2030-06-22 00:00:00.0028+02:00", - "2000-12-21 10:20:10.1Z", - "2024-12-19T10:53:36.428000+00:00", - "1996/06/22 10:20:10 GMT", - ], - False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], - }, - datetime_naive: { - True: [ - "2021-06-22 10:20:10", - "2030/06-22 00:00:00", - "2030/06/22 00:00:00.0028", - ], - False: [ - "2021-06-22T30:20:10", - "Sun, 06 Nov 1994 08:49:37 GMT", - "2021-06-44 10:20:10+02:00", - 
"1999-12-01T00:00:00Z", - "2021-06-44", - "15 décembre 1985", - ], - }, - datetime_rfc822: { - True: ["Sun, 06 Nov 1994 08:49:37 GMT"], - False: ["2021-06-22T10:20:10"], - }, - year: { - True: ["2015"], - False: ["20166"], - }, -} - # we could also have a function here to add all True values of (almost) -# each field to the False values of all others +# each field to the False values of all others (to do when parenthood is added) def test_all_fields_have_tests(): - all_tests = return_all_tests("ALL", "detect_fields") - for attr in all_tests.values(): - assert fields.get(attr["module"]) - - + for format in fmtm.formats.values(): + valid = format._test_values + # checking structure + assert all( + isinstance(key, bool) + and isinstance(vals, list) + and all(isinstance(val, str) for val in vals) + for key, vals in valid.items() + ) + # checking that we have valid and invalid cases for each + assert all(b in valid.keys() for b in [True, False]) + + +# this is based on the _test_values of each .py file @pytest.mark.parametrize( "args", ( - (field, value, valid) - for field in fields + (format.func, value, valid) for valid in [True, False] - for value in fields[field][valid] + for format in fmtm.formats.values() + for value in format._test_values[valid] ), ) def test_fields_with_values(args): - field, value, valid = args - assert field._is(value) is valid + func, value, valid = args + assert func(value) is valid @pytest.mark.parametrize( @@ -456,37 +135,32 @@ def test_priority(args): @pytest.mark.parametrize( "args", ( - ("1996-02-13", date), - ("28/01/2000", date), - ("2025-08-20T14:30:00+02:00", datetime_aware), - ("2025/08/20 14:30:00.2763-12:00", datetime_aware), - ("1925_12_20T14:30:00.2763", datetime_naive), - ("1925 12 20 14:30:00Z", datetime_aware), + ("1996-02-13", fmtm.formats["date"]), + ("28/01/2000", fmtm.formats["date"]), + ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]), + ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]), + 
("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]), + ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]), ), ) def test_early_detection(args): - value, module = args - with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func: - res = module._is(value) + value, format = args + with patch("csv_detective.formats.date.date_casting") as mock_func: + res = format.func(value) assert res mock_func.assert_not_called() def test_all_proportion_1(): - all_tests = return_all_tests("ALL", "detect_fields") - prop_1 = { - name: eval(name if name not in ["int", "float"] else "test_" + name) - for name, attr in all_tests.items() - if attr["prop"] == 1 - } # building a table that uses only correct values for these formats, except on one row table = pd.DataFrame( { - test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"] - for test_name, test_module in prop_1.items() + name: (format._test_values[True] * 100)[:100] + ["not_suitable"] + for name, format in fmtm.formats.items() + if format.proportion == 1 } ) # testing columns for all formats - returned_table = col_test(table, all_tests, limited_output=True) + returned_table = col_test(table, fmtm.formats, limited_output=True) # the analysis should have found no match on any format assert all(returned_table[col].sum() == 0 for col in table.columns) From 66485f6c2989dfd0df2539c5e42fc81412875603 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 14:00:49 +0100 Subject: [PATCH 07/21] chore: adapt to geojson modification --- csv_detective/detection/formats.py | 2 +- csv_detective/output/schema.py | 4 ++-- tests/data/a_test_file.json | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index d83ed5aa..39647128 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -112,7 +112,7 @@ def detect_formats( "float": "float", "string": "string", "json": "json", - 
"json_geojson": "json", + "geojson": "json", "datetime_aware": "datetime", "datetime_naive": "datetime", "datetime_rfc822": "datetime", diff --git a/csv_detective/output/schema.py b/csv_detective/output/schema.py index c78f6284..51577a08 100755 --- a/csv_detective/output/schema.py +++ b/csv_detective/output/schema.py @@ -103,7 +103,7 @@ def get_validata_type(format: str) -> str: "datetime_aware": "datetime", "datetime_naive": "datetime", "datetime_rfc822": "datetime", - "json_geojson": "geojson", + "geojson": "geojson", "latitude": "number", "latitude_l93": "number", "latitude_wgs": "number", @@ -150,7 +150,7 @@ def get_example(format: str) -> str: "iso_country_code_alpha3": "FRA", "iso_country_code_numeric": 250, "jour_de_la_semaine": "lundi", - "json_geojson": '{"type": "Point", "coordinates": [0, 0]}', + "geojson": '{"type": "Point", "coordinates": [0, 0]}', "latitude": 42.42, "latitude_l93": 6037008, "latitude_wgs": 42.42, diff --git a/tests/data/a_test_file.json b/tests/data/a_test_file.json index a9b61062..6dc68571 100755 --- a/tests/data/a_test_file.json +++ b/tests/data/a_test_file.json @@ -81,7 +81,7 @@ }, "GEO_INFO": { "python_type": "json", - "format": "json_geojson", + "format": "geojson", "score": 1.0 } }, @@ -195,7 +195,7 @@ }, "GEO_INFO": { "python_type": "json", - "format": "json_geojson", + "format": "geojson", "score": 1.0 } }, @@ -226,7 +226,7 @@ "json": [ "STRUCTURED_INFO" ], - "json_geojson": [ + "geojson": [ "GEO_INFO" ] }, From 665bf9e98750f6c3f90cf8194ee489b494908388 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 14:01:29 +0100 Subject: [PATCH 08/21] chore: adapt file tests --- tests/test_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_file.py b/tests/test_file.py index ee670011..5e632fcd 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -49,7 +49,7 @@ def test_columns_output_on_file(chunk_size): assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json" assert 
output["columns"]["STRUCTURED_INFO"]["format"] == "json" assert output["columns"]["GEO_INFO"]["python_type"] == "json" - assert output["columns"]["GEO_INFO"]["format"] == "json_geojson" + assert output["columns"]["GEO_INFO"]["format"] == "geojson" def test_profile_output_on_file(): From 1bb40619bfae95e8847c0fcfc5be8e3a59ed318d Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 14:04:30 +0100 Subject: [PATCH 09/21] chore: adapt labels tests --- tests/test_labels.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_labels.py b/tests/test_labels.py index 0b8a7b3b..ff5300c5 100644 --- a/tests/test_labels.py +++ b/tests/test_labels.py @@ -1,12 +1,13 @@ import pytest -from csv_detective.detect_labels import latitude_wgs, money +from csv_detective.format import FormatsManager +fmtm = FormatsManager() # money labels def test_money_labels(): header = "Montant total" - assert money._is(header) == 0.5 + assert fmtm.formats["money"].is_valid_label(header) == 0.5 @pytest.mark.parametrize( @@ -21,4 +22,4 @@ def test_money_labels(): ) def test_latitude(params): header, expected = params - assert expected == latitude_wgs._is(header) + assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header) From 9d323ecd2c223b927210a0889bf7c60d56b2e402 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 15:05:15 +0100 Subject: [PATCH 10/21] chore: adapt structure test --- csv_detective/format.py | 1 - tests/test_structure.py | 56 ++++++++++++++++------------------------- 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/csv_detective/format.py b/csv_detective/format.py index 72e9a7eb..17baa863 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -31,7 +31,6 @@ def __init__(self) -> None: import csv_detective.formats as formats format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))] - assert len(format_labels) == len(set(format_labels)), "Format labels must be unique" 
self.formats = { label: Format( name=label, diff --git a/tests/test_structure.py b/tests/test_structure.py index e4a02591..aff6a410 100755 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -1,41 +1,27 @@ import os -from csv_detective import detect_fields, detect_labels # noqa -from csv_detective.load_tests import return_all_tests +from csv_detective.format import Format, FormatsManager -def tests_conformity(): - """ - Check that all tests are properly structured: - - an __init__.py file in the test folder - - an _is function in the __init__.py file - """ - for _type in ["fields", "labels"]: - _dir = f"csv_detective/detect_{_type}" - subfolders = [] - for dirpath, dirnames, _ in os.walk(_dir): - for dirname in dirnames: - if "__pycache__" not in dirname: - subfolders.append(os.path.join(dirpath, dirname)) - final_subfolders = [ - sf - for sf in subfolders - if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf) - ] - for f_sf in final_subfolders: - assert "__init__.py" in os.listdir(f_sf) - _package = eval( - f_sf.replace("csv_detective/", "") - # locally we have "\\", but in CI for instance there is "/" - .replace("\\", ".") - .replace("/", ".") - ) - assert "_is" in dir(_package) +def test_all_tests_have_unique_name(): + formats: list[str] = os.listdir("csv_detective/formats") + assert "__init__.py" in formats + assert len(formats) == len(set(formats)) -def test_all_tests_have_unique_name(): - names = [ - attr["module"].__name__.split(".")[-1] - for attr in return_all_tests("ALL", "detect_fields").values() - ] - assert len(names) == len(set(names)) +def tests_conformity(): + fmtm = FormatsManager() + for name, format in fmtm.formats.items(): + assert isinstance(name, str) + assert isinstance(format, Format) + assert all( + getattr(format, attr) is not None + for attr in [ + "name", + "func", + "_test_values", + "labels", + "proportion", + "tags", + ] + ) From 519fadbcef7a8d7b86b79e01f125c358fdfbd437 Mon Sep 17 00:00:00 2001 
From: Pierlou Date: Mon, 1 Dec 2025 15:28:21 +0100 Subject: [PATCH 11/21] refactor: clean up labels and tags --- csv_detective/formats/booleen.py | 1 + csv_detective/formats/code_import.py | 2 +- csv_detective/formats/date.py | 26 ++++++++++-------------- csv_detective/formats/datetime_aware.py | 6 +++--- csv_detective/formats/datetime_naive.py | 4 ++-- csv_detective/formats/datetime_rfc822.py | 16 +++------------ csv_detective/formats/email.py | 1 - csv_detective/formats/float.py | 2 +- csv_detective/formats/int.py | 1 + csv_detective/formats/json.py | 3 +-- csv_detective/formats/latlon_wgs.py | 6 +++--- csv_detective/formats/lonlat_wgs.py | 6 +++--- csv_detective/formats/percent.py | 1 - csv_detective/formats/url.py | 1 - csv_detective/formats/username.py | 1 - csv_detective/formats/uuid.py | 3 +-- 16 files changed, 31 insertions(+), 49 deletions(-) diff --git a/csv_detective/formats/booleen.py b/csv_detective/formats/booleen.py index f96343dd..0ef02282 100755 --- a/csv_detective/formats/booleen.py +++ b/csv_detective/formats/booleen.py @@ -1,4 +1,5 @@ proportion = 1 +tags = ["type"] labels = ["is ", "has ", "est "] bool_mapping = { diff --git a/csv_detective/formats/code_import.py b/csv_detective/formats/code_import.py index c500f1d7..43e39e84 100755 --- a/csv_detective/formats/code_import.py +++ b/csv_detective/formats/code_import.py @@ -2,7 +2,7 @@ proportion = 0.9 tags = ["fr"] -labels = [] +labels = ["code"] regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" diff --git a/csv_detective/formats/date.py b/csv_detective/formats/date.py index d1a9e38c..68f5a45b 100755 --- a/csv_detective/formats/date.py +++ b/csv_detective/formats/date.py @@ -6,25 +6,21 @@ from dateutil.parser import parse as dateutil_parser proportion = 1 -tags = ["temp"] -labels = [ +tags = ["temp", "type"] +SHARED_DATE_LABELS = [ "date", + "mise à jour", + "modifie", + "maj", + "datemaj", + "update", + "created", + "modified", +] +labels = 
SHARED_DATE_LABELS + [ "jour", - "date de mise a jour", - "sns date", - "date maj", - "rem date", "periode", - "date de publication", "dpc", - "extract date", - "date immatriculation", - "date jeu donnees", - "datemaj", - "dateouv", - "date der maj", - "dmaj", - "jour", "yyyymmdd", "aaaammjj", ] diff --git a/csv_detective/formats/datetime_aware.py b/csv_detective/formats/datetime_aware.py index 0e4cc8dd..4d783b21 100755 --- a/csv_detective/formats/datetime_aware.py +++ b/csv_detective/formats/datetime_aware.py @@ -1,10 +1,10 @@ import re -from .date import aaaammjj_pattern, date_casting +from .date import aaaammjj_pattern, date_casting, SHARED_DATE_LABELS proportion = 1 -tags = ["temp"] -labels = [] +tags = ["temp", "type"] +labels = SHARED_DATE_LABELS + ["datetime", "timestamp"] threshold = 0.7 pat = ( diff --git a/csv_detective/formats/datetime_naive.py b/csv_detective/formats/datetime_naive.py index e6132486..e09e230c 100755 --- a/csv_detective/formats/datetime_naive.py +++ b/csv_detective/formats/datetime_naive.py @@ -2,10 +2,10 @@ from typing import Any from .date import aaaammjj_pattern, date_casting +from .datetime_aware import labels # noqa proportion = 1 -tags = ["temp"] -labels = [] +tags = ["temp", "type"] threshold = 0.7 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR diff --git a/csv_detective/formats/datetime_rfc822.py b/csv_detective/formats/datetime_rfc822.py index 6011c60a..49629569 100755 --- a/csv_detective/formats/datetime_rfc822.py +++ b/csv_detective/formats/datetime_rfc822.py @@ -1,19 +1,9 @@ import re +from .datetime_aware import labels # noqa + proportion = 1 -tags = ["temp"] -labels = [ - "datetime", - "timestamp", - "osm_timestamp", - "date", - "created at", - "last update", - "date maj", - "createdat", - "date naissance", - "date donnees", -] +tags = ["temp", "type"] def _is(val): diff --git a/csv_detective/formats/email.py b/csv_detective/formats/email.py index fc14a9db..87e98f1d 
100755 --- a/csv_detective/formats/email.py +++ b/csv_detective/formats/email.py @@ -1,7 +1,6 @@ import re proportion = 0.9 -tags = [] labels = [ "email", "mail", diff --git a/csv_detective/formats/float.py b/csv_detective/formats/float.py index f5c878a5..46b33e22 100755 --- a/csv_detective/formats/float.py +++ b/csv_detective/formats/float.py @@ -1,5 +1,5 @@ proportion = 1 -tags = [] +tags = ["type"] labels = ["part", "ratio", "taux"] diff --git a/csv_detective/formats/int.py b/csv_detective/formats/int.py index b6ecf328..d2c72062 100755 --- a/csv_detective/formats/int.py +++ b/csv_detective/formats/int.py @@ -1,4 +1,5 @@ labels = ["nb", "nombre", "nbre"] +tag = ["type"] def _is(val): diff --git a/csv_detective/formats/json.py b/csv_detective/formats/json.py index 3c0e249f..634c4adb 100755 --- a/csv_detective/formats/json.py +++ b/csv_detective/formats/json.py @@ -2,8 +2,7 @@ from json import JSONDecodeError proportion = 1 -tags = [] -labels = [] +tags = ["type"] def _is(val): diff --git a/csv_detective/formats/latlon_wgs.py b/csv_detective/formats/latlon_wgs.py index d0358281..7d37b917 100755 --- a/csv_detective/formats/latlon_wgs.py +++ b/csv_detective/formats/latlon_wgs.py @@ -4,7 +4,7 @@ proportion = 1 tags = ["geo"] -COMMON_COORDS_LABELS = [ +SHARED_COORDS_LABELS = [ "ban", "coordinates", "coordonnees", @@ -31,9 +31,9 @@ # we aim wide to catch exact matches if possible for the highest possible score labels = ( - COMMON_COORDS_LABELS + SHARED_COORDS_LABELS + specific - + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] + + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]] ) diff --git a/csv_detective/formats/lonlat_wgs.py b/csv_detective/formats/lonlat_wgs.py index d1c87215..dcef794c 100755 --- a/csv_detective/formats/lonlat_wgs.py +++ b/csv_detective/formats/lonlat_wgs.py @@ -1,5 +1,5 @@ from .latitude_wgs import _is as is_lat -from .latlon_wgs import COMMON_COORDS_LABELS +from 
.latlon_wgs import SHARED_COORDS_LABELS from .longitude_wgs import _is as is_lon proportion = 1 @@ -14,9 +14,9 @@ # we aim wide to catch exact matches if possible for the highest possible score words = ( - COMMON_COORDS_LABELS + SHARED_COORDS_LABELS + specific - + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] + + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]] ) diff --git a/csv_detective/formats/percent.py b/csv_detective/formats/percent.py index 0e26ea53..3c3b7dca 100755 --- a/csv_detective/formats/percent.py +++ b/csv_detective/formats/percent.py @@ -1,7 +1,6 @@ from .float import _is as is_float proportion = 0.8 -tags = [] labels = [] diff --git a/csv_detective/formats/url.py b/csv_detective/formats/url.py index 10f0ac3f..de8c0b2c 100755 --- a/csv_detective/formats/url.py +++ b/csv_detective/formats/url.py @@ -1,7 +1,6 @@ import re proportion = 1 -tags = [] labels = [ "url", "url source", diff --git a/csv_detective/formats/username.py b/csv_detective/formats/username.py index f93b3cf5..4e4952ad 100755 --- a/csv_detective/formats/username.py +++ b/csv_detective/formats/username.py @@ -1,7 +1,6 @@ import re proportion = 1 -tags = [] labels = ["account", "username", "user"] diff --git a/csv_detective/formats/uuid.py b/csv_detective/formats/uuid.py index 47871d58..7aeaa017 100755 --- a/csv_detective/formats/uuid.py +++ b/csv_detective/formats/uuid.py @@ -1,8 +1,7 @@ import re proportion = 0.8 -tags = [] -labels = [] +labels = ["id", "identifiant"] def _is(val) -> bool: From 5a9c7c86ac69818051cbaba7bd71748f641e9149 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 15:58:25 +0100 Subject: [PATCH 12/21] fix: get from tags --- csv_detective/format.py | 1 - tests/test_structure.py | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/csv_detective/format.py b/csv_detective/format.py index 17baa863..7ae4c641 100755 --- a/csv_detective/format.py 
+++ b/csv_detective/format.py @@ -46,7 +46,6 @@ def __init__(self) -> None: } def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]: - # allowed to skip with -temp return { label: fmt for label, fmt in self.formats.items() diff --git a/tests/test_structure.py b/tests/test_structure.py index aff6a410..84d5976c 100755 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -1,7 +1,11 @@ import os +import pytest + from csv_detective.format import Format, FormatsManager +fmtm = FormatsManager() + def test_all_tests_have_unique_name(): formats: list[str] = os.listdir("csv_detective/formats") @@ -9,8 +13,7 @@ def test_all_tests_have_unique_name(): assert len(formats) == len(set(formats)) -def tests_conformity(): - fmtm = FormatsManager() +def test_conformity(): for name, format in fmtm.formats.items(): assert isinstance(name, str) assert isinstance(format, Format) @@ -25,3 +28,18 @@ def tests_conformity(): "tags", ] ) + + +@pytest.mark.parametrize( + "tags", + ( + ["type"], + ["temp", "fr"], + ), +) +def test_get_from_tags(tags): + fmts = fmtm.get_formats_from_tags(tags) + assert len(fmts) + for fmt in fmts.values(): + for tag in tags: + assert tag in fmt.tags \ No newline at end of file From 4d030c5f16cf290add9ef617c098e93991b2e824 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 15:59:39 +0100 Subject: [PATCH 13/21] chore: lint --- csv_detective/formats/datetime_aware.py | 2 +- tests/test_fields.py | 1 + tests/test_labels.py | 1 + tests/test_structure.py | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/csv_detective/formats/datetime_aware.py b/csv_detective/formats/datetime_aware.py index 4d783b21..2a153e6a 100755 --- a/csv_detective/formats/datetime_aware.py +++ b/csv_detective/formats/datetime_aware.py @@ -1,6 +1,6 @@ import re -from .date import aaaammjj_pattern, date_casting, SHARED_DATE_LABELS +from .date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting proportion = 1 tags = ["temp", "type"] diff 
--git a/tests/test_fields.py b/tests/test_fields.py index 4b38f433..c4cc49cd 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -17,6 +17,7 @@ fmtm = FormatsManager() + def test_all_format_funcs_return_bool(): for format in fmtm.formats.values(): for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]: diff --git a/tests/test_labels.py b/tests/test_labels.py index ff5300c5..a6370614 100644 --- a/tests/test_labels.py +++ b/tests/test_labels.py @@ -4,6 +4,7 @@ fmtm = FormatsManager() + # money labels def test_money_labels(): header = "Montant total" diff --git a/tests/test_structure.py b/tests/test_structure.py index 84d5976c..458dc11c 100755 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -42,4 +42,4 @@ def test_get_from_tags(tags): assert len(fmts) for fmt in fmts.values(): for tag in tags: - assert tag in fmt.tags \ No newline at end of file + assert tag in fmt.tags From ad31b86f2c530d623ff72872803461d0c4efd38d Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 16:11:03 +0100 Subject: [PATCH 14/21] chore: adapt chunk validation --- csv_detective/validate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csv_detective/validate.py b/csv_detective/validate.py index c779ccc3..24921705 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -101,8 +101,7 @@ def validate( continue test_result: float = test_col_val( serie=chunk[col_name], - test_func=formats[args["format"]].func, - proportion=formats[args["format"]].proportion, + format=formats[args["format"]], skipna=skipna, ) if not bool(test_result): From 687ff9b56d48ea0ababcdb7f95d614ba0a75962a Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 16:16:54 +0100 Subject: [PATCH 15/21] docs: update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa2908bd..38341e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - Process big csv files in 
chunks (/!\ breaking changes) [#159](https://github.com/datagouv/csv-detective/pull/159) - Handle column named `count` in profile creation [#171](https://github.com/datagouv/csv-detective/pull/171) - Prevent `NaN` in headers [#173](https://github.com/datagouv/csv-detective/pull/173) +- Refactor formats structure in the repo (/!\ breaking changes) [#176](https://github.com/datagouv/csv-detective/pull/176) ## 0.9.2 (2025-08-26) From 1529f524d490d93b033f29842d21d2fc84f805ab Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 17:50:14 +0100 Subject: [PATCH 16/21] chore: absolute paths --- csv_detective/format.py | 2 +- csv_detective/formats/date_fr.py | 2 +- csv_detective/formats/datetime_aware.py | 2 +- csv_detective/formats/datetime_rfc822.py | 2 +- csv_detective/formats/latitude_l93.py | 4 ++-- csv_detective/formats/latitude_wgs.py | 2 +- csv_detective/formats/latitude_wgs_fr_metropole.py | 2 +- csv_detective/formats/latlon_wgs.py | 4 ++-- csv_detective/formats/longitude_l93.py | 4 ++-- csv_detective/formats/longitude_wgs.py | 2 +- csv_detective/formats/longitude_wgs_fr_metropole.py | 2 +- csv_detective/formats/lonlat_wgs.py | 6 +++--- csv_detective/formats/money.py | 2 +- csv_detective/formats/percent.py | 2 +- csv_detective/output/__init__.py | 6 +++--- csv_detective/parsing/load.py | 6 +++--- 16 files changed, 25 insertions(+), 25 deletions(-) diff --git a/csv_detective/format.py b/csv_detective/format.py index 7ae4c641..527abf94 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -1,6 +1,6 @@ from typing import Callable -from .parsing.text import header_score +from csv_detective.parsing.text import header_score class Format: diff --git a/csv_detective/formats/date_fr.py b/csv_detective/formats/date_fr.py index b158a2f2..2ba04928 100755 --- a/csv_detective/formats/date_fr.py +++ b/csv_detective/formats/date_fr.py @@ -1,6 +1,6 @@ import re -from ..parsing.text import _process_text +from csv_detective.parsing.text import _process_text 
proportion = 1 tags = ["fr", "temp"] diff --git a/csv_detective/formats/datetime_aware.py b/csv_detective/formats/datetime_aware.py index 2a153e6a..426e9e48 100755 --- a/csv_detective/formats/datetime_aware.py +++ b/csv_detective/formats/datetime_aware.py @@ -1,6 +1,6 @@ import re -from .date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting +from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting proportion = 1 tags = ["temp", "type"] diff --git a/csv_detective/formats/datetime_rfc822.py b/csv_detective/formats/datetime_rfc822.py index 49629569..4cd9d8fd 100755 --- a/csv_detective/formats/datetime_rfc822.py +++ b/csv_detective/formats/datetime_rfc822.py @@ -1,6 +1,6 @@ import re -from .datetime_aware import labels # noqa +from csv_detective.formats.datetime_aware import labels # noqa proportion = 1 tags = ["temp", "type"] diff --git a/csv_detective/formats/latitude_l93.py b/csv_detective/formats/latitude_l93.py index 069f1737..4866c1bb 100755 --- a/csv_detective/formats/latitude_l93.py +++ b/csv_detective/formats/latitude_l93.py @@ -1,7 +1,7 @@ from frformat import LatitudeL93 -from .float import _is as is_float -from .float import float_casting +from csv_detective.formats.float import _is as is_float +from csv_detective.formats.float import float_casting proportion = 1 tags = ["fr", "geo"] diff --git a/csv_detective/formats/latitude_wgs.py b/csv_detective/formats/latitude_wgs.py index 7ff9f116..58701e7a 100755 --- a/csv_detective/formats/latitude_wgs.py +++ b/csv_detective/formats/latitude_wgs.py @@ -1,4 +1,4 @@ -from .float import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 1 tags = ["geo"] diff --git a/csv_detective/formats/latitude_wgs_fr_metropole.py b/csv_detective/formats/latitude_wgs_fr_metropole.py index de98d047..d7489831 100755 --- a/csv_detective/formats/latitude_wgs_fr_metropole.py +++ b/csv_detective/formats/latitude_wgs_fr_metropole.py @@ -1,4 +1,4 @@ -from .float 
import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 1 tags = ["fr", "geo"] diff --git a/csv_detective/formats/latlon_wgs.py b/csv_detective/formats/latlon_wgs.py index 7d37b917..8a486d03 100755 --- a/csv_detective/formats/latlon_wgs.py +++ b/csv_detective/formats/latlon_wgs.py @@ -1,5 +1,5 @@ -from .latitude_wgs import _is as is_lat -from .longitude_wgs import _is as is_lon +from csv_detective.formats.latitude_wgs import _is as is_lat +from csv_detective.formats.longitude_wgs import _is as is_lon proportion = 1 tags = ["geo"] diff --git a/csv_detective/formats/longitude_l93.py b/csv_detective/formats/longitude_l93.py index 86f13e13..d44d812a 100755 --- a/csv_detective/formats/longitude_l93.py +++ b/csv_detective/formats/longitude_l93.py @@ -1,7 +1,7 @@ from frformat import LongitudeL93 -from .float import _is as is_float -from .float import float_casting +from csv_detective.formats.float import _is as is_float +from csv_detective.formats.float import float_casting proportion = 1 tags = ["fr", "geo"] diff --git a/csv_detective/formats/longitude_wgs.py b/csv_detective/formats/longitude_wgs.py index f953edd7..b0ded2e1 100755 --- a/csv_detective/formats/longitude_wgs.py +++ b/csv_detective/formats/longitude_wgs.py @@ -1,4 +1,4 @@ -from .float import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 1 tags = ["geo"] diff --git a/csv_detective/formats/longitude_wgs_fr_metropole.py b/csv_detective/formats/longitude_wgs_fr_metropole.py index 20560e8c..20a3be05 100755 --- a/csv_detective/formats/longitude_wgs_fr_metropole.py +++ b/csv_detective/formats/longitude_wgs_fr_metropole.py @@ -1,4 +1,4 @@ -from .float import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 1 tags = ["fr", "geo"] diff --git a/csv_detective/formats/lonlat_wgs.py b/csv_detective/formats/lonlat_wgs.py index dcef794c..293851fd 100755 --- a/csv_detective/formats/lonlat_wgs.py +++ 
b/csv_detective/formats/lonlat_wgs.py @@ -1,6 +1,6 @@ -from .latitude_wgs import _is as is_lat -from .latlon_wgs import SHARED_COORDS_LABELS -from .longitude_wgs import _is as is_lon +from csv_detective.formats.latitude_wgs import _is as is_lat +from csv_detective.formats.latlon_wgs import SHARED_COORDS_LABELS +from csv_detective.formats.longitude_wgs import _is as is_lon proportion = 1 tags = ["geo"] diff --git a/csv_detective/formats/money.py b/csv_detective/formats/money.py index d76855bc..81bbbc58 100755 --- a/csv_detective/formats/money.py +++ b/csv_detective/formats/money.py @@ -1,4 +1,4 @@ -from .float import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 0.8 labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"] diff --git a/csv_detective/formats/percent.py b/csv_detective/formats/percent.py index 3c3b7dca..31529e7c 100755 --- a/csv_detective/formats/percent.py +++ b/csv_detective/formats/percent.py @@ -1,4 +1,4 @@ -from .float import _is as is_float +from csv_detective.formats.float import _is as is_float proportion = 0.8 labels = [] diff --git a/csv_detective/output/__init__.py b/csv_detective/output/__init__.py index 9a0271b6..980163a3 100755 --- a/csv_detective/output/__init__.py +++ b/csv_detective/output/__init__.py @@ -6,9 +6,9 @@ from csv_detective.utils import is_url -from .dataframe import cast_df_chunks -from .profile import create_profile -from .schema import generate_table_schema +from csv_detective.output.dataframe import cast_df_chunks +from csv_detective.output.profile import create_profile +from csv_detective.output.schema import generate_table_schema def generate_output( diff --git a/csv_detective/parsing/load.py b/csv_detective/parsing/load.py index 4ad7d6d8..3e204790 100755 --- a/csv_detective/parsing/load.py +++ b/csv_detective/parsing/load.py @@ -14,9 +14,9 @@ from csv_detective.detection.separator import detect_separator from csv_detective.utils import is_url -from .compression import 
unzip -from .csv import parse_csv -from .excel import ( +from csv_detective.parsing.compression import unzip +from csv_detective.parsing.csv import parse_csv +from csv_detective.parsing.excel import ( XLS_LIKE_EXT, parse_excel, ) From 4497df53c549fd9f8737ae5c962e4a7ac0f154d1 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 17:56:27 +0100 Subject: [PATCH 17/21] chore: absolute paths --- csv_detective/formats/datetime_naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csv_detective/formats/datetime_naive.py b/csv_detective/formats/datetime_naive.py index e09e230c..28b94d98 100755 --- a/csv_detective/formats/datetime_naive.py +++ b/csv_detective/formats/datetime_naive.py @@ -1,8 +1,8 @@ import re from typing import Any -from .date import aaaammjj_pattern, date_casting -from .datetime_aware import labels # noqa +from csv_detective.formats.date import aaaammjj_pattern, date_casting +from csv_detective.formats.datetime_aware import labels # noqa proportion = 1 tags = ["temp", "type"] From 12bb7e6c0e7290ff7c81b47989f613a28172c930 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 17:56:53 +0100 Subject: [PATCH 18/21] docs: add insights for Format's attributes --- csv_detective/format.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csv_detective/format.py b/csv_detective/format.py index 527abf94..bd271d07 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -14,10 +14,16 @@ def __init__( tags: list[str] = [], ) -> None: self.name: str = name + # func is the value test for the format (returns whether a string is valid) self.func: Callable = func + # _test_values are lists of valid and invalid values, used in the tests self._test_values: dict[bool, list[str]] = _test_values + # labels is the list of hint headers for the header score self.labels: list[str] = labels + # proportion is the tolerance (between 0 and 1) to say a column is valid for a format + # (1 => 100% of the column has to pass the 
func check for the column to be considered valid) self.proportion: float = proportion + # tags are to allow users to submit a file to only a subset of formats self.tags: list[str] = tags def is_valid_label(self, val: str) -> float: From 0383f564bbd1520482955d8c28625ac72b6f928f Mon Sep 17 00:00:00 2001 From: Pierlou Date: Mon, 1 Dec 2025 17:57:36 +0100 Subject: [PATCH 19/21] chore: lint --- csv_detective/output/__init__.py | 3 +-- csv_detective/parsing/load.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/csv_detective/output/__init__.py b/csv_detective/output/__init__.py index 980163a3..c5f90a92 100755 --- a/csv_detective/output/__init__.py +++ b/csv_detective/output/__init__.py @@ -4,11 +4,10 @@ import pandas as pd -from csv_detective.utils import is_url - from csv_detective.output.dataframe import cast_df_chunks from csv_detective.output.profile import create_profile from csv_detective.output.schema import generate_table_schema +from csv_detective.utils import is_url def generate_output( diff --git a/csv_detective/parsing/load.py b/csv_detective/parsing/load.py index 3e204790..5c20567b 100755 --- a/csv_detective/parsing/load.py +++ b/csv_detective/parsing/load.py @@ -12,14 +12,13 @@ ) from csv_detective.detection.headers import detect_headers from csv_detective.detection.separator import detect_separator -from csv_detective.utils import is_url - from csv_detective.parsing.compression import unzip from csv_detective.parsing.csv import parse_csv from csv_detective.parsing.excel import ( XLS_LIKE_EXT, parse_excel, ) +from csv_detective.utils import is_url def load_file( From 6e255ba5b7f3244750e46f9739d1c4ffaf178dd9 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Tue, 2 Dec 2025 14:32:05 +0100 Subject: [PATCH 20/21] docs: improve docstrings --- csv_detective/explore_csv.py | 29 ++++++++++++++++++++++++----- csv_detective/format.py | 21 +++++++++++++-------- csv_detective/validate.py | 6 ++++++ 3 files changed, 43 insertions(+), 13 deletions(-) 
diff --git a/csv_detective/explore_csv.py b/csv_detective/explore_csv.py index 241d04a0..8dc59886 100644 --- a/csv_detective/explore_csv.py +++ b/csv_detective/explore_csv.py @@ -28,14 +28,13 @@ def routine( verbose: bool = False, sheet_name: str | int | None = None, ) -> dict | tuple[dict, pd.DataFrame]: - """Returns a dict with information about the table and possible - column contents, and if requested the DataFrame with columns cast according to analysis. + """ + Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis. Args: file_path: local path or URL to file - num_rows: number of rows to sample from the file for analysis ; -1 for analysis - of the whole file - tags: tags to filter formats + num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file + tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats) limited_output: whether or not to return all possible types or only the most likely one for each column save_results: whether or not to save the results in a json file, or the path where to dump the output output_profile: whether or not to add the 'profile' field to the output @@ -117,6 +116,26 @@ def validate_then_detect( cast_json: bool = True, verbose: bool = False, ): + """ + Performs a validation of the given file against the given analysis. + If the validation fails, performs a full analysis and returns it. + Otherwise returns the previous analysis (which is therefore still valid). + NB: if asked, the profile is recreated in both cases. + + Args: + file_path: the path of the file to validate.
+ previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine) + num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file + tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats) + limited_output: whether or not to return all possible types or only the most likely one for each column + save_results: whether or not to save the results in a json file, or the path where to dump the output + skipna: whether to ignore NaN values in the checks + output_profile: whether or not to add the 'profile' field to the output + output_schema: whether or not to add the 'schema' field to the output (tableschema) + output_df: whether or not to return the loaded DataFrame along with the analysis report + cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings) + verbose: whether the code displays the steps it's going through + """ if verbose: start_routine = time() if is_url(file_path): diff --git a/csv_detective/format.py b/csv_detective/format.py index bd271d07..cb0a1a16 100755 --- a/csv_detective/format.py +++ b/csv_detective/format.py @@ -1,4 +1,4 @@ -from typing import Callable +from typing import Any, Callable from csv_detective.parsing.text import header_score @@ -7,23 +7,28 @@ class Format: def __init__( self, name: str, - func: Callable, + func: Callable[[Any], bool], _test_values: dict[bool, list[str]], labels: list[str] = [], proportion: float = 1, tags: list[str] = [], ) -> None: + """ + Instantiates a Format object. + + Args: + name: the name of the format. + func: the value test for the format (returns whether a string is valid). + _test_values: lists of valid and invalid values, used in the tests + labels: the list of hint headers for the header score + proportion: the tolerance (between 0 and 1) to say a column is valid for a format.
(1 => 100% of the column has to pass the func check for the column to be considered valid) + tags: to allow users to submit a file to only a subset of formats + """ self.name: str = name - # func is the value test for the format (returns whether a string is valid) self.func: Callable = func - # _test_values are lists of valid and invalid values, used in the tests self._test_values: dict[bool, list[str]] = _test_values - # labels is the list of hint headers for the header score self.labels: list[str] = labels - # proportion is the tolerance (between 0 and 1) to say a column is valid for a format - # (1 => 100% of the column has to pass the func check for the column to be considered valid) self.proportion: float = proportion - # tags are to allow users to submit a file to only a subset of formats self.tags: list[str] = tags def is_valid_label(self, val: str) -> float: diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 24921705..90a49e6e 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -19,6 +19,12 @@ def validate( ) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]: """ Verify is the given file has the same fields and types as in the given analysis. 
+ + Args: + file_path: the path of the file to validate + previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine) + verbose: whether the code displays the steps it's going through + skipna: whether to ignore NaN values in the checks """ try: if previous_analysis.get("separator"): From 1ed1aa48c529e50d388c4da1eb6320c840c138f0 Mon Sep 17 00:00:00 2001 From: Pierlou Date: Tue, 2 Dec 2025 14:46:28 +0100 Subject: [PATCH 21/21] docs: update readme --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9598aa5c..c64a23c9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types. -Currently supported file types: csv, xls, xlsx, ods. +Currently supported file types: csv(.gz), xls, xlsx, ods. You can also directly feed the URL of a remote file (from data.gouv.fr for instance). @@ -34,7 +34,8 @@ inspection_results = routine( num_rows=-1, # Value -1 will analyze all lines of your file, you can change with the number of lines you wish to analyze save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of you csv - output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be use to validate structure of other csv which should match same structure. 
+ output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate structure of other csv which should match same structure. + tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags()) ) ``` @@ -42,7 +43,7 @@ inspection_results = routine( ### Output -The program creates a `Python` dictionnary with the following information : +The program creates a `python` dictionary with the following information : ``` { @@ -185,7 +186,7 @@ Only the format with highest score is present in the output. ## Improvement suggestions - Smarter refactors -- Improve performances +- Performance improvements - Test other ways to load and process data (`pandas` alternatives) - Add more and more detection modules...