diff --git a/CHANGELOG.md b/CHANGELOG.md index fa2908bd..38341e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - Process big csv files in chunks (/!\ breaking changes) [#159](https://github.com/datagouv/csv-detective/pull/159) - Handle column named `count` in profile creation [#171](https://github.com/datagouv/csv-detective/pull/171) - Prevent `NaN` in headers [#173](https://github.com/datagouv/csv-detective/pull/173) +- Refactor formats structure in the repo (/!\ breaking changes) [#176](https://github.com/datagouv/csv-detective/pull/176) ## 0.9.2 (2025-08-26) diff --git a/README.md b/README.md index 9598aa5c..c64a23c9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types. -Currently supported file types: csv, xls, xlsx, ods. +Currently supported file types: csv(.gz), xls, xlsx, ods. You can also directly feed the URL of a remote file (from data.gouv.fr for instance). @@ -34,7 +34,8 @@ inspection_results = routine( num_rows=-1, # Value -1 will analyze all lines of your file, you can change with the number of lines you wish to analyze save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of you csv - output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be use to validate structure of other csv which should match same structure. 
+ output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate structure of other csv which should match same structure. + tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags()) ) ``` @@ -42,7 +43,7 @@ inspection_results = routine( ### Output -The program creates a `Python` dictionnary with the following information : +The program creates a `python` dictionary with the following information : ``` { @@ -185,7 +186,7 @@ Only the format with highest score is present in the output. ## Improvement suggestions - Smarter refactors -- Improve performances +- Performance improvements - Test other ways to load and process data (`pandas` alternatives) - Add more and more detection modules... diff --git a/csv_detective/detect_fields/FR/README.md b/csv_detective/detect_fields/FR/README.md deleted file mode 100644 index 588ffa52..00000000 --- a/csv_detective/detect_fields/FR/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for French standards. 
diff --git a/csv_detective/detect_fields/FR/__init__.py b/csv_detective/detect_fields/FR/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/geo/__init__.py b/csv_detective/detect_fields/FR/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py b/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py deleted file mode 100644 index 7975c9a2..00000000 --- a/csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeCommuneInsee, Millesime - -PROPORTION = 0.75 - -_code_commune_insee = CodeCommuneInsee(Millesime.LATEST) - - -def _is(val): - return _code_commune_insee.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py b/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py deleted file mode 100644 index 31be7a03..00000000 --- a/csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeFantoir - -PROPORTION = 1 - -_code_fantoir = CodeFantoir() - - -def _is(val): - return isinstance(val, str) and _code_fantoir.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_postal/__init__.py b/csv_detective/detect_fields/FR/geo/code_postal/__init__.py deleted file mode 100644 index fdaf9590..00000000 --- a/csv_detective/detect_fields/FR/geo/code_postal/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodePostal - -PROPORTION = 0.9 - -_code_postal = CodePostal() - - -def _is(val): - return _code_postal.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/code_region/__init__.py b/csv_detective/detect_fields/FR/geo/code_region/__init__.py deleted file mode 100644 index 3a7c20dd..00000000 --- a/csv_detective/detect_fields/FR/geo/code_region/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from frformat import CodeRegion, Millesime - 
-PROPORTION = 1 - -_code_region = CodeRegion(Millesime.LATEST) - - -def _is(val): - """Renvoie True si val peut être un code_région, False sinon""" - return isinstance(val, str) and _code_region.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/departement/__init__.py b/csv_detective/detect_fields/FR/geo/departement/__init__.py deleted file mode 100644 index 9df01681..00000000 --- a/csv_detective/detect_fields/FR/geo/departement/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from frformat import Departement, Millesime, Options - -PROPORTION = 0.9 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_departement = Departement(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des departements""" - return isinstance(val, str) and _departement.is_valid(val) diff --git a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py deleted file mode 100644 index 445ee164..00000000 --- a/csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from frformat import LatitudeL93 - -from csv_detective.detect_fields.other.float import _is as is_float -from csv_detective.detect_fields.other.float import float_casting - -PROPORTION = 1 - -_latitudel93 = LatitudeL93() - - -def _is(val): - try: - if isinstance(val, str) and is_float(val): - return _latitudel93.is_valid(float_casting(val)) - - return False - - except (ValueError, OverflowError): - return False diff --git a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index 9608e74b..00000000 --- a/csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - 
-def _is(val): - """Renvoie True si val peut etre une latitude en métropole""" - try: - return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py deleted file mode 100644 index dc1baf22..00000000 --- a/csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from frformat import LongitudeL93 - -from csv_detective.detect_fields.other.float import _is as is_float -from csv_detective.detect_fields.other.float import float_casting - -PROPORTION = 1 - -_longitudel93 = LongitudeL93() - - -def _is(val): - try: - if isinstance(val, str) and is_float(val): - return _longitudel93.is_valid(float_casting(val)) - - return False - - except (ValueError, OverflowError): - return False diff --git a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index 8684398e..00000000 --- a/csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude en métropole""" - try: - return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/FR/geo/pays/__init__.py b/csv_detective/detect_fields/FR/geo/pays/__init__.py deleted file mode 100644 index 637f630c..00000000 --- a/csv_detective/detect_fields/FR/geo/pays/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from frformat import Millesime, Options, Pays - -PROPORTION = 0.6 - -_options = Options( - ignore_case=True, - ignore_accents=True, - 
replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_pays = Pays(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des pays""" - return isinstance(val, str) and _pays.is_valid(val) diff --git a/csv_detective/detect_fields/FR/other/__init__.py b/csv_detective/detect_fields/FR/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt b/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt deleted file mode 100644 index 9bd8128e..00000000 --- a/csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +++ /dev/null @@ -1,498 +0,0 @@ -111a -111b -111c -111d -111e -111f -121a -121b -121c -121d -121e -121f -122a -122b -122c -131a -131b -131c -131d -131e -131f -211a -211b -211c -211d -211e -211f -211g -211h -211j -212a -212b -212c -212d -213a -214a -214b -214c -214d -214e -214f -215a -215b -215c -215d -216a -216b -216c -217a -217b -217c -217d -217e -218a -219a -221a -221b -222a -222b -223a -223b -223c -223d -223e -223f -223g -223h -224a -224b -224c -224d -225a -226a -226b -226c -227a -227b -227c -227d -231a -232a -233a -233b -233c -233d -311a -311b -311c -311d -311e -311f -312a -312b -312c -312d -312e -312f -312g -313a -331a -332a -332b -333a -333b -333c -333d -333e -333f -334a -335a -341a -341b -342a -342e -343a -344a -344b -344c -344d -351a -352a -352b -353a -353b -353c -354a -354b -354c -354d -354g -371a -372a -372b -372c -372d -372e -372f -373a -373b -373c -373d -374a -374b -374c -374d -375a -375b -376a -376b -376c -376d -376e -376f -376g -377a -380a -381a -382a -382b -382c -382d -383a -383b -383c -384a -384b -384c -385a -385b -385c -386a -386d -386e -387a -387b -387c -387d -387e -387f -388a -388b -388c -388d -388e -389a -389b -389c -421a -421b -422a -422b -422c -422d -422e -423a -423b -424a -425a -431a -431b -431c -431d -431e -431f -431g -432a -432b -432c -432d -433a -433b -433c -433d -434a -434b 
-434c -434d -434e -434f -434g -435a -435b -441a -441b -451a -451b -451c -451d -451e -451f -452a -452b -461a -461d -461e -461f -462a -462b -462c -462d -462e -463a -463b -463c -463d -463e -464a -464b -465a -465b -465c -466a -466b -466c -467a -467b -467c -467d -468a -468b -471a -471b -472a -472b -472c -472d -473a -473b -473c -474a -474b -474c -475a -475b -476a -476b -477a -477b -477c -477d -478a -478b -478c -478d -479a -479b -480a -480b -481a -481b -482a -483a -484a -484b -485a -485b -486a -486d -486e -487a -487b -488a -488b -521a -521b -522a -523a -524a -525a -525b -525c -525d -526a -526b -526c -526d -526e -531a -531b -531c -532a -532b -532c -533a -533b -533c -534a -534b -541a -541d -542a -542b -543a -543d -544a -545a -545b -545c -545d -546a -546b -546c -546d -546e -551a -552a -553a -554a -554b -554c -554d -554e -554f -554g -554h -554j -555a -556a -561a -561d -561e -561f -562a -562b -563a -563b -563c -564a -564b -621a -621b -621c -621d -621e -621f -621g -622a -622b -622g -623a -623b -623c -623f -623g -624a -624d -624e -624f -624g -625a -625b -625c -625d -625e -625h -626a -626b -626c -627a -627b -627c -627d -627e -627f -628a -628b -628c -628d -628e -628f -628g -631a -632a -632b -632c -632d -632e -632f -632g -632h -632j -632k -633a -633b -633c -633d -634a -634b -634c -634d -635a -636a -636b -636c -636d -637a -637b -637c -637d -641a -641b -642a -642b -643a -644a -651a -651b -652a -652b -653a -654a -655a -656a -671a -671b -672a -673a -673b -673c -674a -674b -674c -674d -674e -675a -675b -675c -676a -676b -676c -676d -676e -681a -681b -682a -683a -684a -684b -685a -691a -691b -691c -691d -691e -691f -692a -7100 -7200 -7400 -7500 -7700 -7800 -8100 -8300 -8400 -8500 -8600 - diff --git a/csv_detective/detect_fields/FR/other/code_rna/__init__.py b/csv_detective/detect_fields/FR/other/code_rna/__init__.py deleted file mode 100644 index 4d725f33..00000000 --- a/csv_detective/detect_fields/FR/other/code_rna/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from frformat import CodeRNA 
- -PROPORTION = 0.9 - -_code_rna = CodeRNA() - - -def _is(val): - return isinstance(val, str) and _code_rna.is_valid(val) diff --git a/csv_detective/detect_fields/FR/other/code_waldec/__init__.py b/csv_detective/detect_fields/FR/other/code_waldec/__init__.py deleted file mode 100644 index 5595d869..00000000 --- a/csv_detective/detect_fields/FR/other/code_waldec/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -import re - -PROPORTION = 0.9 -regex = r"^W\d[\dA-Z]\d{7}$" - - -def _is(val): - """Repere le code Waldec""" - return isinstance(val, str) and bool(re.match(regex, val)) diff --git a/csv_detective/detect_fields/FR/other/date_fr/__init__.py b/csv_detective/detect_fields/FR/other/date_fr/__init__.py deleted file mode 100644 index 1d234dda..00000000 --- a/csv_detective/detect_fields/FR/other/date_fr/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import re - -PROPORTION = 1 -regex = ( - r"^\d{1,2}[ \-](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" - r"|octobre|novembre|decembre)[ \-]\d{4}$" -) - - -def _is(val): - """Repere les dates textuelles FR""" - return isinstance(val, str) and bool(re.match(regex, val)) diff --git a/csv_detective/detect_fields/FR/other/sexe/__init__.py b/csv_detective/detect_fields/FR/other/sexe/__init__.py deleted file mode 100644 index 185b65cf..00000000 --- a/csv_detective/detect_fields/FR/other/sexe/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 - - -def _is(val): - """Repère le sexe""" - if not isinstance(val, str): - return False - val = _process_text(val) - return val in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} diff --git a/csv_detective/detect_fields/FR/other/tel_fr/__init__.py b/csv_detective/detect_fields/FR/other/tel_fr/__init__.py deleted file mode 100644 index a232c9b9..00000000 --- a/csv_detective/detect_fields/FR/other/tel_fr/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - -PROPORTION = 0.7 - - -def _is(val): - """Repère les 
numeros de telephone francais""" - if not isinstance(val, str): - return False - - if len(val) < 10: - return False - - val = val.replace(".", "").replace("-", "").replace(" ", "") - - match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val)) - return match_1 diff --git a/csv_detective/detect_fields/FR/other/uai/__init__.py b/csv_detective/detect_fields/FR/other/uai/__init__.py deleted file mode 100644 index 26bf3beb..00000000 --- a/csv_detective/detect_fields/FR/other/uai/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Repere les codes UAI de l'éducation nationale""" - - # test sur la longueur - if not isinstance(val, str) or len(val) != 8: - return False - - if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)): - return False - return True diff --git a/csv_detective/detect_fields/FR/temp/__init__.py b/csv_detective/detect_fields/FR/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/README.md b/csv_detective/detect_fields/README.md deleted file mode 100644 index 163031fa..00000000 --- a/csv_detective/detect_fields/README.md +++ /dev/null @@ -1,5 +0,0 @@ -Each country (indicated by ISO-code) folder corresponds to fields specifc to a country. Other folders contain tests for international standards (e-mails, dates ...). 
- -## TODO - -Update file "code_postal.txt" in FR.geo.code_postal and reset PROPORTION to 1 \ No newline at end of file diff --git a/csv_detective/detect_fields/__init__.py b/csv_detective/detect_fields/__init__.py deleted file mode 100644 index c47c0019..00000000 --- a/csv_detective/detect_fields/__init__.py +++ /dev/null @@ -1,112 +0,0 @@ -from .FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from .FR.other import ( - code_csp_insee, - code_import, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from .FR.temp import jour_de_la_semaine, mois_de_annee -from .geo import ( - iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from .other import ( - booleen, - email, - float, - int, - json, - money, - mongo_object_id, - percent, - twitter, - url, - uuid, -) -from .temp import date, datetime_aware, datetime_naive, datetime_rfc822, year - -__all__ = [ - "adresse", - "code_commune_insee", - "code_departement", - "code_fantoir", - "code_postal", - "code_region", - "commune", - "departement", - "insee_canton", - "latitude_l93", - "latitude_wgs_fr_metropole", - "longitude_l93", - "longitude_wgs_fr_metropole", - "pays", - "region", - "code_csp_insee", - "code_import", - "code_rna", - "code_waldec", - "csp_insee", - "date_fr", - "insee_ape700", - "sexe", - "siren", - "siret", - "tel_fr", - "uai", - "jour_de_la_semaine", - "mois_de_annee", - "iso_country_code_alpha2", - "iso_country_code_alpha3", - "iso_country_code_numeric", - "json_geojson", - "latitude_wgs", - "latlon_wgs", - "longitude_wgs", - "lonlat_wgs", - "booleen", - "email", - "float", - "int", - "json", - "money", - 
"mongo_object_id", - "percent", - "twitter", - "url", - "uuid", - "date", - "datetime_aware", - "datetime_naive", - "datetime_rfc822", - "year", -] diff --git a/csv_detective/detect_fields/geo/README.md b/csv_detective/detect_fields/geo/README.md deleted file mode 100644 index 2801b48e..00000000 --- a/csv_detective/detect_fields/geo/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for international spacial variables (international codes, spatial coordinates, etc.). diff --git a/csv_detective/detect_fields/geo/__init__.py b/csv_detective/detect_fields/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py deleted file mode 100644 index 916d6352..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") -liste_pays = set(liste_pays) - - -def _is(val): - """Renvoie True si val peut etre un code iso pays alpha-2, False sinon""" - if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)): - return False - return val in liste_pays diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py deleted file mode 100644 index 9d89c15b..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") - - -def _is(val): - """Renvoie True si val peut etre un code iso pays alpha-3, False sinon""" - if not isinstance(val, str) or not 
bool(re.match(r"[A-Z]{3}$", val)): - return False - return val in set(liste_pays) diff --git a/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py b/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py deleted file mode 100644 index a420ba4f..00000000 --- a/csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import re -from os.path import dirname, join - -PROPORTION = 1 - -with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile: - liste_pays = iofile.read().split("\n") -liste_pays = set(liste_pays) - - -def _is(val): - """Renvoie True si val peut etre un code iso pays numerique, False sinon""" - if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)): - return False - return val in liste_pays diff --git a/csv_detective/detect_fields/geo/json_geojson/__init__.py b/csv_detective/detect_fields/geo/json_geojson/__init__.py deleted file mode 100644 index 2f7a06bd..00000000 --- a/csv_detective/detect_fields/geo/json_geojson/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -import json - -PROPORTION = 0.9 - - -def _is(val): - """Renvoie True si val peut etre un geojson""" - - try: - j = json.loads(val) - if isinstance(j, dict): - if "type" in j and "coordinates" in j: - return True - if "geometry" in j and "coordinates" in j["geometry"]: - return True - except Exception: - pass - return False diff --git a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py b/csv_detective/detect_fields/geo/latitude_wgs/__init__.py deleted file mode 100644 index 90a1ed7f..00000000 --- a/csv_detective/detect_fields/geo/latitude_wgs/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une latitude""" - try: - return is_float(val) and float(val) >= -90 and float(val) <= 90 - except ValueError: - return False - except OverflowError: - return False 
diff --git a/csv_detective/detect_fields/geo/latlon_wgs/__init__.py b/csv_detective/detect_fields/geo/latlon_wgs/__init__.py deleted file mode 100644 index 5bcc6fc3..00000000 --- a/csv_detective/detect_fields/geo/latlon_wgs/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from ..latitude_wgs import _is as is_lat -from ..longitude_wgs import _is as is_lon - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une latitude,longitude""" - - if not isinstance(val, str) or val.count(",") != 1: - return False - lat, lon = val.split(",") - # handling [lat,lon] - if lat.startswith("[") and lon.endswith("]"): - lat, lon = lat[1:], lon[:-1] - return is_lat(lat) and is_lon(lon.replace(" ", "")) diff --git a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py b/csv_detective/detect_fields/geo/longitude_wgs/__init__.py deleted file mode 100644 index 584e8906..00000000 --- a/csv_detective/detect_fields/geo/longitude_wgs/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.detect_fields.other.float import _is as is_float - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude""" - try: - return is_float(val) and float(val) >= -180 and float(val) <= 180 - except ValueError: - return False - except OverflowError: - return False diff --git a/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py b/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py deleted file mode 100644 index 05580850..00000000 --- a/csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from ..latitude_wgs import _is as is_lat -from ..longitude_wgs import _is as is_lon - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut etre une longitude,latitude""" - - if not isinstance(val, str) or val.count(",") != 1: - return False - lon, lat = val.split(",") - # handling [lon,lat] - if lon.startswith("[") and lat.endswith("]"): - lon, lat = lon[1:], lat[:-1] - return is_lon(lon) and is_lat(lat.replace(" ", "")) diff 
--git a/csv_detective/detect_fields/other/__init__.py b/csv_detective/detect_fields/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/other/email/__init__.py b/csv_detective/detect_fields/other/email/__init__.py deleted file mode 100644 index 667dd9f3..00000000 --- a/csv_detective/detect_fields/other/email/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -import re - -PROPORTION = 0.9 - - -def _is(val): - """Detects e-mails""" - return isinstance(val, str) and bool( - re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE) - ) diff --git a/csv_detective/detect_fields/other/money/__init__.py b/csv_detective/detect_fields/other/money/__init__.py deleted file mode 100644 index ad9c1ef2..00000000 --- a/csv_detective/detect_fields/other/money/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from ..float import _is as is_float - -currencies = set(["€", "$", "£", "¥"]) - -PROPORTION = 0.8 - - -def _is(val: str): - if not isinstance(val, str) or val[-1] not in currencies: - return False - return is_float(val[:-1]) diff --git a/csv_detective/detect_fields/other/mongo_object_id/__init__.py b/csv_detective/detect_fields/other/mongo_object_id/__init__.py deleted file mode 100644 index 4aca7ec2..00000000 --- a/csv_detective/detect_fields/other/mongo_object_id/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Detects Mongo ObjectIds""" - return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val)) diff --git a/csv_detective/detect_fields/other/percent/__init__.py b/csv_detective/detect_fields/other/percent/__init__.py deleted file mode 100644 index 9d2620ad..00000000 --- a/csv_detective/detect_fields/other/percent/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from ..float import _is as is_float - -PROPORTION = 0.8 - - -def _is(val: str): - if not isinstance(val, str) or val[-1] != "%": - return False - return is_float(val[:-1]) diff --git 
a/csv_detective/detect_fields/other/twitter/__init__.py b/csv_detective/detect_fields/other/twitter/__init__.py deleted file mode 100644 index d63c541f..00000000 --- a/csv_detective/detect_fields/other/twitter/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Detects twitter accounts""" - return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val)) diff --git a/csv_detective/detect_fields/other/url/__init__.py b/csv_detective/detect_fields/other/url/__init__.py deleted file mode 100644 index 72bb178f..00000000 --- a/csv_detective/detect_fields/other/url/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -import re - -PROPORTION = 1 -url_pattern = re.compile( - r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})" - r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$" -) - - -def _is(val): - """Detects urls""" - if not isinstance(val, str): - return False - return bool(url_pattern.match(val)) diff --git a/csv_detective/detect_fields/other/uuid/__init__.py b/csv_detective/detect_fields/other/uuid/__init__.py deleted file mode 100644 index 75f39bdd..00000000 --- a/csv_detective/detect_fields/other/uuid/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Detects UUIDs""" - return isinstance(val, str) and bool( - re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val) - ) diff --git a/csv_detective/detect_fields/temp/README.md b/csv_detective/detect_fields/temp/README.md deleted file mode 100644 index e9e3c5b7..00000000 --- a/csv_detective/detect_fields/temp/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder for international temporal variables (date, time, time zone, etc.). 
diff --git a/csv_detective/detect_fields/temp/__init__.py b/csv_detective/detect_fields/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_fields/temp/year/__init__.py b/csv_detective/detect_fields/temp/year/__init__.py deleted file mode 100644 index 79a68e1f..00000000 --- a/csv_detective/detect_fields/temp/year/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -PROPORTION = 1 - - -def _is(val): - """Returns True if val can be a year""" - try: - val = int(val) - except ValueError: - return False - return (1800 <= val) and (val <= 2100) diff --git a/csv_detective/detect_labels/FR/__init__.py b/csv_detective/detect_labels/FR/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/geo/__init__.py b/csv_detective/detect_labels/FR/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/geo/adresse/__init__.py b/csv_detective/detect_labels/FR/geo/adresse/__init__.py deleted file mode 100644 index 281f2499..00000000 --- a/csv_detective/detect_labels/FR/geo/adresse/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "adresse", - "adresse postale", - "adresse geographique", - "adr", - "adresse complete", - "adresse station", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py b/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py deleted file mode 100644 index 9421cd4e..00000000 --- a/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code commune insee", - "code insee", - "codes insee", - "code commune", - "code insee 
commune", - "insee", - "code com", - "com", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_departement/__init__.py b/csv_detective/detect_labels/FR/geo/code_departement/__init__.py deleted file mode 100644 index 1eaacb9b..00000000 --- a/csv_detective/detect_labels/FR/geo/code_departement/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # "dep": Possible confusion with dep name? - words_combinations_list = [ - "code departement", - "code_departement", - "dep", - "departement", - "dept", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py b/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py deleted file mode 100644 index 78230a42..00000000 --- a/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "cadastre1", - "code fantoir", - "fantoir", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_postal/__init__.py b/csv_detective/detect_labels/FR/geo/code_postal/__init__.py deleted file mode 100644 index 7c8cfff6..00000000 --- a/csv_detective/detect_labels/FR/geo/code_postal/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code postal", - "postal code", - "postcode", - "post code", - "cp", - "codes postaux", - "location postcode", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/code_region/__init__.py b/csv_detective/detect_labels/FR/geo/code_region/__init__.py deleted file mode 100644 index 
a254e7d7..00000000 --- a/csv_detective/detect_labels/FR/geo/code_region/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # "reg" : possible confusion with region name? - words_combinations_list = [ - "code region", - "reg", - "code insee region", - "region", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/commune/__init__.py b/csv_detective/detect_labels/FR/geo/commune/__init__.py deleted file mode 100644 index de106e9b..00000000 --- a/csv_detective/detect_labels/FR/geo/commune/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "commune", - "ville", - "libelle commune", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/departement/__init__.py b/csv_detective/detect_labels/FR/geo/departement/__init__.py deleted file mode 100644 index 7d1cd08c..00000000 --- a/csv_detective/detect_labels/FR/geo/departement/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "departement", - "libelle du departement", - "deplib", - "nom dept", - "dept", - "libdepartement", - "nom departement", - "libelle dep", - "libelle departement", - "lb departements", - "dep libusage", - "lb departement", - "nom dep", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py b/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py deleted file mode 100644 index 451f8e1c..00000000 --- a/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 
0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "insee canton", - "canton", - "cant", - "nom canton", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py deleted file mode 100644 index 0da69fc3..00000000 --- a/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not always detect CRS - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "y l93", - "coordonnee y", - "latitude lb93", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index ae4b6afb..00000000 --- a/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py 
b/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py deleted file mode 100644 index ce92b90a..00000000 --- a/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py deleted file mode 100644 index ce92b90a..00000000 --- a/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/pays/__init__.py b/csv_detective/detect_labels/FR/geo/pays/__init__.py deleted file mode 100644 index fb83bac1..00000000 --- a/csv_detective/detect_labels/FR/geo/pays/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "pays", - "payslieu", - "paysorg", - "country", - "pays lib", - "lieupays", - "pays beneficiaire", - "nom du pays", - "journey start country", - "libelle pays", - "journey end country", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/geo/region/__init__.py 
b/csv_detective/detect_labels/FR/geo/region/__init__.py deleted file mode 100644 index c65603d7..00000000 --- a/csv_detective/detect_labels/FR/geo/region/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "region", - "libelle region", - "nom region", - "libelle reg", - "nom reg", - "reg libusage", - "nom de la region", - "regionorg", - "regionlieu", - "reg", - "nom officiel region", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/__init__.py b/csv_detective/detect_labels/FR/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py deleted file mode 100644 index f11ff450..00000000 --- a/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["code csp insee", "code csp"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/code_rna/__init__.py b/csv_detective/detect_labels/FR/other/code_rna/__init__.py deleted file mode 100644 index cf69f302..00000000 --- a/csv_detective/detect_labels/FR/other/code_rna/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code rna", - "rna", - "n° inscription association", - "identifiant association", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/code_waldec/__init__.py b/csv_detective/detect_labels/FR/other/code_waldec/__init__.py deleted file mode 100644 index 
9450733d..00000000 --- a/csv_detective/detect_labels/FR/other/code_waldec/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["code waldec", "waldec"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/csp_insee/__init__.py deleted file mode 100644 index 0cae8075..00000000 --- a/csv_detective/detect_labels/FR/other/csp_insee/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # To improve? No specific header found in data - words_combinations_list = [ - "csp insee", - "csp", - "categorie socioprofessionnelle", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/date_fr/__init__.py b/csv_detective/detect_labels/FR/other/date_fr/__init__.py deleted file mode 100644 index 10a10891..00000000 --- a/csv_detective/detect_labels/FR/other/date_fr/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # To improve: no header specific to "fr" found in data - words_combinations_list = ["date"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py b/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py deleted file mode 100644 index 58dfb26f..00000000 --- a/csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "code ape", - "code activite (ape)", - "code naf", - "code naf organisme designe", - "code naf organisme designant", - 
"base sirene : code ape de l'etablissement siege", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/sexe/__init__.py b/csv_detective/detect_labels/FR/other/sexe/__init__.py deleted file mode 100644 index f4583170..00000000 --- a/csv_detective/detect_labels/FR/other/sexe/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["sexe", "sex", "civilite", "genre", "id sexe"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/siren/__init__.py b/csv_detective/detect_labels/FR/other/siren/__init__.py deleted file mode 100644 index e57aa56a..00000000 --- a/csv_detective/detect_labels/FR/other/siren/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "siren", - "siren organisme designe", - "siren organisme designant", - "n° siren", - "siren organisme", - "siren titulaire", - "numero siren", - "epci", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/siret/__init__.py b/csv_detective/detect_labels/FR/other/siret/__init__.py deleted file mode 100644 index 7741596e..00000000 --- a/csv_detective/detect_labels/FR/other/siret/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "siret", - "siret d", - "num siret", - "siretacheteur", - "n° siret", - "coll siret", - "epci", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/tel_fr/__init__.py b/csv_detective/detect_labels/FR/other/tel_fr/__init__.py deleted file mode 100644 index 2cb895a4..00000000 --- 
a/csv_detective/detect_labels/FR/other/tel_fr/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "telephone", - "tel", - "tel1", - "tel2", - "phone", - "num tel", - "tel mob", - "telephone sav", - "telephone1", - "coordinates.phone", - "telephone du lieu", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/other/uai/__init__.py b/csv_detective/detect_labels/FR/other/uai/__init__.py deleted file mode 100644 index 58860339..00000000 --- a/csv_detective/detect_labels/FR/other/uai/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "uai", - "code etablissement", - "code uai", - "uai - identifiant", - "numero uai", - "rne", - "numero de l'etablissement", - "code rne", - "codeetab", - "code uai de l'etablissement", - "ref uai", - "cd rne", - "numerouai", - "numero d etablissement", - "code etablissement", - "numero etablissement", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/FR/temp/__init__.py b/csv_detective/detect_labels/FR/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py b/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py deleted file mode 100644 index db06549a..00000000 --- a/csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "jour semaine", - "type jour", - "jour de la semaine", - "saufjour", - "nomjour", - "jour", - "jour de fermeture", - ] - return header_score(header, words_combinations_list) diff 
--git a/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py b/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py deleted file mode 100644 index dd6aca72..00000000 --- a/csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["mois de annee", "mois", "month"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/__init__.py b/csv_detective/detect_labels/__init__.py deleted file mode 100644 index c78d34cb..00000000 --- a/csv_detective/detect_labels/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -from .FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from .FR.other import ( - code_csp_insee, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from .FR.temp import jour_de_la_semaine, mois_de_annee -from .geo import ( - iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid -from .temp import date, datetime_rfc822, year - -__all__ = [ - "adresse", - "code_commune_insee", - "code_departement", - "code_fantoir", - "code_postal", - "code_region", - "commune", - "departement", - "insee_canton", - "latitude_l93", - "latitude_wgs_fr_metropole", - "longitude_l93", - "longitude_wgs_fr_metropole", - "pays", - "region", - "code_csp_insee", - "code_rna", - "code_waldec", - "csp_insee", - "date_fr", - "insee_ape700", - "sexe", - "siren", - "siret", - "tel_fr", - "uai", - 
"iso_country_code_alpha2", - "iso_country_code_alpha3", - "iso_country_code_numeric", - "json_geojson", - "latitude_wgs", - "latlon_wgs", - "longitude_wgs", - "lonlat_wgs", - "jour_de_la_semaine", - "mois_de_annee", - "booleen", - "email", - "float", - "int", - "money", - "mongo_object_id", - "twitter", - "url", - "uuid", - "date", - "datetime_rfc822", - "year", -] diff --git a/csv_detective/detect_labels/geo/__init__.py b/csv_detective/detect_labels/geo/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py deleted file mode 100644 index 55077b29..00000000 --- a/csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py deleted file mode 100644 index 55077b29..00000000 --- a/csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py b/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py deleted file mode 100644 index 55077b29..00000000 --- 
a/csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "iso country code", - "code pays", - "pays", - "country", - "nation", - "pays code", - "code pays (iso)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/json_geojson/__init__.py b/csv_detective/detect_labels/geo/json_geojson/__init__.py deleted file mode 100644 index f1b298f4..00000000 --- a/csv_detective/detect_labels/geo/json_geojson/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "json geojson", - "json", - "geojson", - "geo shape", - "geom", - "geometry", - "geo shape", - "geoshape", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/latitude_wgs/__init__.py b/csv_detective/detect_labels/geo/latitude_wgs/__init__.py deleted file mode 100644 index ae4b6afb..00000000 --- a/csv_detective/detect_labels/geo/latitude_wgs/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "latitude", - "lat", - "y", - "yf", - "yd", - "coordonnee y", - "coord y", - "ycoord", - "geocodage y gps", - "location latitude", - "ylatitude", - "ylat", - "latitude (y)", - "latitudeorg", - "coordinates.latitude", - "googlemap latitude", - "latitudelieu", - "latitude googlemap", - "latitude wgs84", - "y wgs84", - "latitude (wgs84)", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/latlon_wgs/__init__.py b/csv_detective/detect_labels/geo/latlon_wgs/__init__.py deleted file mode 100644 index c78c3535..00000000 --- 
a/csv_detective/detect_labels/geo/latlon_wgs/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - -COMMON_COORDS_LABELS = [ - "ban", - "coordinates", - "coordonnees", - "coordonnees insee", - "geo", - "geopoint", - "geoloc", - "geolocalisation", - "geom", - "geometry", - "gps", - "localisation", - "point", - "position", - "wgs84", -] - -specific = [ - "latlon", - "lat lon", - "x y", - "xy", -] - -# we aim wide to catch exact matches if possible for the highest possible score -words = ( - COMMON_COORDS_LABELS - + specific - + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] -) - - -def _is(header: str) -> float: - return header_score(header, words) diff --git a/csv_detective/detect_labels/geo/longitude_wgs/__init__.py b/csv_detective/detect_labels/geo/longitude_wgs/__init__.py deleted file mode 100644 index a30b67b7..00000000 --- a/csv_detective/detect_labels/geo/longitude_wgs/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - # Does not detect CRS - words_combinations_list = [ - "longitude", - "lon", - "long", - "geocodage x gps", - "location longitude", - "xlongitude", - "lng", - "xlong", - "x", - "xf", - "xd", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py b/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py deleted file mode 100644 index ef529c82..00000000 --- a/csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from csv_detective.parsing.text import header_score - -from ..latlon_wgs import COMMON_COORDS_LABELS - -PROPORTION = 0.5 - -specific = [ - "lonlat", - "lon lat", - "y x", - "yx", -] - -# we aim wide to catch exact matches if possible for the highest possible score -words = ( - COMMON_COORDS_LABELS - + specific - + [w + sep + suf for 
suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]] -) - - -def _is(header: str) -> float: - return header_score(header, words) diff --git a/csv_detective/detect_labels/other/__init__.py b/csv_detective/detect_labels/other/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/other/booleen/__init__.py b/csv_detective/detect_labels/other/booleen/__init__.py deleted file mode 100644 index 307378ea..00000000 --- a/csv_detective/detect_labels/other/booleen/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["is ", "has ", "est "] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/email/__init__.py b/csv_detective/detect_labels/other/email/__init__.py deleted file mode 100644 index de771fe8..00000000 --- a/csv_detective/detect_labels/other/email/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "email", - "mail", - "courriel", - "contact", - "mel", - "lieucourriel", - "coordinates.emailcontact", - "e mail", - "mo mail", - "adresse mail", - "adresse email", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/float/__init__.py b/csv_detective/detect_labels/other/float/__init__.py deleted file mode 100644 index 354814c2..00000000 --- a/csv_detective/detect_labels/other/float/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["part", "ratio", "taux"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/int/__init__.py b/csv_detective/detect_labels/other/int/__init__.py 
deleted file mode 100644 index 74b3586c..00000000 --- a/csv_detective/detect_labels/other/int/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["nb", "nombre", "nbre"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/money/__init__.py b/csv_detective/detect_labels/other/money/__init__.py deleted file mode 100644 index 8944b79d..00000000 --- a/csv_detective/detect_labels/other/money/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["budget", "salaire", "euro", "euros", "prêt", "montant"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/mongo_object_id/__init__.py b/csv_detective/detect_labels/other/mongo_object_id/__init__.py deleted file mode 100644 index b110538f..00000000 --- a/csv_detective/detect_labels/other/mongo_object_id/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["id", "objectid"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/twitter/__init__.py b/csv_detective/detect_labels/other/twitter/__init__.py deleted file mode 100644 index 9b6c5a31..00000000 --- a/csv_detective/detect_labels/other/twitter/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["twitter", "twitter account", "twitter username"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/url/__init__.py b/csv_detective/detect_labels/other/url/__init__.py deleted file mode 
100644 index cc51d569..00000000 --- a/csv_detective/detect_labels/other/url/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "url", - "url source", - "site web", - "source url", - "site internet", - "remote url", - "web", - "site", - "lien", - "site data", - "lien url", - "lien vers le fichier", - "sitweb", - "interneturl", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/other/uuid/__init__.py b/csv_detective/detect_labels/other/uuid/__init__.py deleted file mode 100644 index c05eeed9..00000000 --- a/csv_detective/detect_labels/other/uuid/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = ["id", "uuid", "guid"] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/__init__.py b/csv_detective/detect_labels/temp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/csv_detective/detect_labels/temp/date/__init__.py b/csv_detective/detect_labels/temp/date/__init__.py deleted file mode 100644 index ffe2673b..00000000 --- a/csv_detective/detect_labels/temp/date/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "date", - "jour", - "date de mise a jour", - "sns date", - "date maj", - "rem date", - "periode", - "date de publication", - "dpc", - "extract date", - "date immatriculation", - "date jeu donnees", - "datemaj", - "dateouv", - "date der maj", - "dmaj", - "jour", - "yyyymmdd", - "aaaammjj", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py 
b/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py deleted file mode 100644 index ea968b46..00000000 --- a/csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "datetime", - "timestamp", - "osm_timestamp", - "date", - "created at", - "last update", - "date maj", - "createdat", - "date naissance", - "date donnees", - ] # Almost same as IS0, no example in data - return header_score(header, words_combinations_list) diff --git a/csv_detective/detect_labels/temp/year/__init__.py b/csv_detective/detect_labels/temp/year/__init__.py deleted file mode 100644 index 24976e6e..00000000 --- a/csv_detective/detect_labels/temp/year/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from csv_detective.parsing.text import header_score - -PROPORTION = 0.5 - - -def _is(header: str) -> float: - words_combinations_list = [ - "year", - "annee", - "annee depot", - "an nais", - "exercice", - "data year", - "annee de publication", - "exercice comptable", - "annee de naissance", - "annee ouverture", - ] - return header_score(header, words_combinations_list) diff --git a/csv_detective/detection/formats.py b/csv_detective/detection/formats.py index 1fe6e66d..39647128 100755 --- a/csv_detective/detection/formats.py +++ b/csv_detective/detection/formats.py @@ -7,7 +7,7 @@ detect_categorical_variable, # detect_continuous_variable, ) -from csv_detective.load_tests import return_all_tests +from csv_detective.format import Format, FormatsManager from csv_detective.output.utils import prepare_output_dict from csv_detective.parsing.columns import ( MAX_NUMBER_CATEGORICAL_VALUES, @@ -16,12 +16,14 @@ test_label, ) +fmtm = FormatsManager() + def detect_formats( table: pd.DataFrame, analysis: dict, file_path: str, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, 
skipna: bool = True, verbose: bool = False, @@ -29,15 +31,12 @@ def detect_formats( in_chunks = analysis.get("total_lines") is None # list testing to be performed - all_tests_fields = return_all_tests( - user_input_tests, detect_type="detect_fields" - ) # list all tests for the fields - all_tests_labels = return_all_tests( - user_input_tests, detect_type="detect_labels" - ) # list all tests for the labels + formats: dict[str, Format] = ( + fmtm.get_formats_from_tags(tags) if tags is not None else fmtm.formats + ) # if no testing then return - if not all_tests_fields and not all_tests_labels: + if len(formats) == 0: return analysis, None # Perform testing on fields @@ -45,7 +44,7 @@ def detect_formats( # table is small enough to be tested in one go scores_table_fields = test_col( table=table, - all_tests=all_tests_fields, + formats=formats, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -62,7 +61,7 @@ def detect_formats( table=table, file_path=file_path, analysis=analysis, - all_tests=all_tests_fields, + formats=formats, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -70,9 +69,7 @@ def detect_formats( analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output) # Perform testing on labels - scores_table_labels = test_label( - analysis["header"], all_tests_labels, limited_output, verbose=verbose - ) + scores_table_labels = test_label(analysis["header"], formats, limited_output, verbose=verbose) analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output) # Multiply the results of the fields by 1 + 0.5 * the results of the labels. 
@@ -115,7 +112,7 @@ def detect_formats( "float": "float", "string": "string", "json": "json", - "json_geojson": "json", + "geojson": "json", "datetime_aware": "datetime", "datetime_naive": "datetime", "datetime_rfc822": "datetime", diff --git a/csv_detective/explore_csv.py b/csv_detective/explore_csv.py index afd791a9..8dc59886 100644 --- a/csv_detective/explore_csv.py +++ b/csv_detective/explore_csv.py @@ -15,7 +15,7 @@ def routine( file_path: str, num_rows: int = 500, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, save_results: bool | str = True, encoding: str | None = None, @@ -28,14 +28,13 @@ def routine( verbose: bool = False, sheet_name: str | int | None = None, ) -> dict | tuple[dict, pd.DataFrame]: - """Returns a dict with information about the table and possible - column contents, and if requested the DataFrame with columns cast according to analysis. + """ + Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis. 
Args: file_path: local path or URL to file - num_rows: number of rows to sample from the file for analysis ; -1 for analysis - of the whole file - user_input_tests: tests to run on the file + num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file + tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats) limited_output: whether or not to return all possible types or only the most likely one for each column save_results: whether or not to save the results in a json file, or the path where to dump the output output_profile: whether or not to add the 'profile' field to the output @@ -74,7 +73,7 @@ def routine( table=table, analysis=analysis, file_path=file_path, - user_input_tests=user_input_tests, + tags=tags, limited_output=limited_output, skipna=skipna, verbose=verbose, @@ -107,7 +106,7 @@ def validate_then_detect( file_path: str, previous_analysis: dict, num_rows: int = 500, - user_input_tests: str | list[str] = "ALL", + tags: list[str] | None = None, limited_output: bool = True, save_results: bool | str = True, skipna: bool = True, @@ -117,6 +116,26 @@ def validate_then_detect( cast_json: bool = True, verbose: bool = False, ): + """ + Performs a validation of the given file against the given analysis. + If the validation fails, performs a full analysis and returns it. + Otherwise returns the previous analysis (which is therefore still valid). + NB: if asked, the profile is recreated in both cases. + + Args: + file_path: the path of the file to validate.
+ previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine) + num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file + tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats) + limited_output: whether or not to return all possible types or only the most likely one for each column + save_results: whether or not to save the results in a json file, or the path where to dump the output + skipna: whether to ignore NaN values in the checks + output_profile: whether or not to add the 'profile' field to the output + output_schema: whether or not to add the 'schema' field to the output (tableschema) + output_df: whether or not to return the loaded DataFrame along with the analysis report + cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings) + verbose: whether the code displays the steps it's going through + """ if verbose: start_routine = time() if is_url(file_path): @@ -140,7 +159,7 @@ def validate_then_detect( table=table, analysis=analysis, file_path=file_path, - user_input_tests=user_input_tests, + tags=tags, limited_output=limited_output, skipna=skipna, verbose=verbose, diff --git a/csv_detective/format.py b/csv_detective/format.py new file mode 100755 index 00000000..cb0a1a16 --- /dev/null +++ b/csv_detective/format.py @@ -0,0 +1,67 @@ +from typing import Any, Callable + +from csv_detective.parsing.text import header_score + + +class Format: + def __init__( + self, + name: str, + func: Callable[[Any], bool], + _test_values: dict[bool, list[str]], + labels: list[str] = [], + proportion: float = 1, + tags: list[str] = [], + ) -> None: + """ + Instantiates a Format object. + + Args: + name: the name of the format. + func: the value test for the format (returns whether a string is valid).
+ _test_values: lists of valid and invalid values, used in the tests + labels: the list of hint headers for the header score + proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid) + tags: to allow users to submit a file to only a subset of formats + """ + self.name: str = name + self.func: Callable = func + self._test_values: dict[bool, list[str]] = _test_values + self.labels: list[str] = labels + self.proportion: float = proportion + self.tags: list[str] = tags + + def is_valid_label(self, val: str) -> float: + return header_score(val, self.labels) + + +class FormatsManager: + formats: dict[str, Format] + + def __init__(self) -> None: + import csv_detective.formats as formats + + format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))] + self.formats = { + label: Format( + name=label, + func=(module := getattr(formats, label))._is, + _test_values=module._test_values, + **{ + attr: val + for attr in ["labels", "proportion", "tags"] + if (val := getattr(module, attr, None)) + }, + ) + for label in format_labels + } + + def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]: + return { + label: fmt + for label, fmt in self.formats.items() + if all(tag in fmt.tags for tag in tags) + } + + def available_tags(self) -> set[str]: + return set(tag for format in self.formats.values() for tag in format.tags) diff --git a/csv_detective/formats/__init__.py b/csv_detective/formats/__init__.py new file mode 100755 index 00000000..88bdf47e --- /dev/null +++ b/csv_detective/formats/__init__.py @@ -0,0 +1,9 @@ +import importlib +import os + +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith(".py") and not file.startswith("_"): + module_name = file[:-3] + module = importlib.import_module(f"csv_detective.formats.{module_name}") + globals()[module_name] = module + del module diff --git 
a/csv_detective/detect_fields/FR/geo/adresse/__init__.py b/csv_detective/formats/adresse.py old mode 100644 new mode 100755 similarity index 79% rename from csv_detective/detect_fields/FR/geo/adresse/__init__.py rename to csv_detective/formats/adresse.py index 9233ca2f..fae6ee47 --- a/csv_detective/detect_fields/FR/geo/adresse/__init__.py +++ b/csv_detective/formats/adresse.py @@ -1,100 +1,116 @@ -from csv_detective.parsing.text import _process_text - -PROPORTION = 0.55 -# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long -voies = { - "aire ", - "allee ", - "avenue ", - "base ", - "boulevard ", - "cami ", - "carrefour ", - "chemin ", - "cheminement ", - "chaussee ", - "cite ", - "clos ", - "coin ", - "corniche ", - "cote ", - "cour ", - "cours ", - "domaine ", - "descente ", - "ecart ", - "esplanade ", - "faubourg ", - "gare ", - "grande rue", - "hameau ", - "halle ", - "ilot ", - "impasse ", - "lieu dit", - "lotissement ", - "marche ", - "montee ", - "parc ", - "passage ", - "place ", - "plan ", - "plaine ", - "plateau ", - "pont ", - "port ", - "promenade ", - "parvis ", - "quartier ", - "quai ", - "residence ", - "ruelle ", - "rocade ", - "rond point", - "route ", - "rue ", - # 'sente - sentier', - "square ", - "tour ", - # 'terre-plein', - "traverse ", - "villa ", - "village ", - "voie ", - "zone artisanale", - "zone d’amenagement concerte", - "zone d’amenagement differe", - "zone industrielle", - "zone ", - # 'r', - "av ", - "pl ", - "bd ", - "cami ", - # 'che', - "chs ", - "dom ", - "ham ", - "ld ", - # 'pro', - # 'rte', - "vlge ", - "za ", - "zac ", - "zad ", - "zi ", - # 'car', - "fg ", - # 'lot', - "imp ", - # 'qu', - "mte", -} - - -def _is(val): - """Repere des adresses""" - if not isinstance(val, str) or len(val) > 150: - return False - val = _process_text(val) - return any(x in val for x in voies) +from csv_detective.parsing.text import _process_text + +proportion = 0.55 +tags = ["fr", "geo"] +labels 
= [ + "adresse", + "localisation", + "adresse postale", + "adresse geographique", + "adr", + "adresse complete", + "adresse station", +] + +voies = { + "aire ", + "allee ", + "avenue ", + "base ", + "boulevard ", + "cami ", + "carrefour ", + "chemin ", + "cheminement ", + "chaussee ", + "cite ", + "clos ", + "coin ", + "corniche ", + "cote ", + "cour ", + "cours ", + "domaine ", + "descente ", + "ecart ", + "esplanade ", + "faubourg ", + "gare ", + "grande rue", + "hameau ", + "halle ", + "ilot ", + "impasse ", + "lieu dit", + "lotissement ", + "marche ", + "montee ", + "parc ", + "passage ", + "place ", + "plan ", + "plaine ", + "plateau ", + "pont ", + "port ", + "promenade ", + "parvis ", + "quartier ", + "quai ", + "residence ", + "ruelle ", + "rocade ", + "rond point", + "route ", + "rue ", + # 'sente - sentier', + "square ", + "tour ", + # 'terre-plein', + "traverse ", + "villa ", + "village ", + "voie ", + "zone artisanale", + "zone d’amenagement concerte", + "zone d’amenagement differe", + "zone industrielle", + "zone ", + # 'r', + "av ", + "pl ", + "bd ", + "cami ", + # 'che', + "chs ", + "dom ", + "ham ", + "ld ", + # 'pro', + # 'rte', + "vlge ", + "za ", + "zac ", + "zad ", + "zi ", + # 'car', + "fg ", + # 'lot', + "imp ", + # 'qu', + "mte", +} + + +def _is(val): + """Repere des adresses""" + if not isinstance(val, str) or len(val) > 150: + return False + val = _process_text(val) + return any(x in val for x in voies) + + +_test_values = { + True: ["rue du martyr"], + False: ["un batiment"], +} diff --git a/csv_detective/detect_fields/other/booleen/__init__.py b/csv_detective/formats/booleen.py old mode 100644 new mode 100755 similarity index 65% rename from csv_detective/detect_fields/other/booleen/__init__.py rename to csv_detective/formats/booleen.py index a1e7426c..0ef02282 --- a/csv_detective/detect_fields/other/booleen/__init__.py +++ b/csv_detective/formats/booleen.py @@ -1,27 +1,35 @@ -PROPORTION = 1 -bool_mapping = { - "1": True, - "0": False, - 
"vrai": True, - "faux": False, - "true": True, - "false": False, - "oui": True, - "non": False, - "yes": True, - "no": False, - "y": True, - "n": False, - "o": True, -} - -liste_bool = set(bool_mapping.keys()) - - -def bool_casting(val: str) -> bool: - return bool_mapping.get(val.lower()) - - -def _is(val: str) -> bool: - """Détecte les booléens""" - return isinstance(val, str) and val.lower() in liste_bool +proportion = 1 +tags = ["type"] +labels = ["is ", "has ", "est "] + +bool_mapping = { + "1": True, + "0": False, + "vrai": True, + "faux": False, + "true": True, + "false": False, + "oui": True, + "non": False, + "yes": True, + "no": False, + "y": True, + "n": False, + "o": True, +} + +liste_bool = set(bool_mapping.keys()) + + +def bool_casting(val: str) -> bool: + return bool_mapping.get(val.lower()) + + +def _is(val): + return isinstance(val, str) and val.lower() in liste_bool + + +_test_values = { + True: ["oui", "0", "1", "yes", "false", "True"], + False: ["nein", "ja", "2", "-0"], +} diff --git a/csv_detective/formats/code_commune_insee.py b/csv_detective/formats/code_commune_insee.py new file mode 100755 index 00000000..70d59d64 --- /dev/null +++ b/csv_detective/formats/code_commune_insee.py @@ -0,0 +1,26 @@ +from frformat import CodeCommuneInsee, Millesime + +proportion = 0.75 +tags = ["fr", "geo"] +labels = [ + "code commune insee", + "code insee", + "codes insee", + "code commune", + "code insee commune", + "insee", + "code com", + "com", +] + +_code_commune_insee = CodeCommuneInsee(Millesime.LATEST) + + +def _is(val): + return isinstance(val, str) and _code_commune_insee.is_valid(val) + + +_test_values = { + True: ["91471", "01053"], + False: ["914712", "01000"], +} diff --git a/csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py b/csv_detective/formats/code_csp_insee.py old mode 100644 new mode 100755 similarity index 74% rename from csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py rename to 
csv_detective/formats/code_csp_insee.py index abedb1cf..b9c931b0 --- a/csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +++ b/csv_detective/formats/code_csp_insee.py @@ -1,29 +1,36 @@ -import re - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 - - -def _is(val): - """Repère les code csp telles que définies par l'INSEE""" - if not isinstance(val, str): - return False - val = _process_text(val) - if len(val) != 4: - return False - a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val)) - b = val in { - "7100", - "7200", - "7400", - "7500", - "7700", - "7800", - "8100", - "8300", - "8400", - "8500", - "8600", - } - return a or b +import re + +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = ["code csp insee", "code csp"] + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val) + if len(val) != 4: + return False + a = bool(re.match(r"^[123456][0-9]{2}[abcdefghijkl]$", val)) + b = val in { + "7100", + "7200", + "7400", + "7500", + "7700", + "7800", + "8100", + "8300", + "8400", + "8500", + "8600", + } + return a or b + + +_test_values = { + True: ["121f"], + False: ["121x"], +} diff --git a/csv_detective/detect_fields/FR/geo/code_departement/__init__.py b/csv_detective/formats/code_departement.py old mode 100644 new mode 100755 similarity index 57% rename from csv_detective/detect_fields/FR/geo/code_departement/__init__.py rename to csv_detective/formats/code_departement.py index 2edb50b8..a9358a2e --- a/csv_detective/detect_fields/FR/geo/code_departement/__init__.py +++ b/csv_detective/formats/code_departement.py @@ -1,15 +1,29 @@ -from frformat import Millesime, NumeroDepartement, Options - -PROPORTION = 1 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_numero_departement = NumeroDepartement(Millesime.LATEST, _options) - - -def _is(val): - return 
isinstance(val, str) and _numero_departement.is_valid(val) +from frformat import Millesime, NumeroDepartement, Options + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "code departement", + "code_departement", + "dep", + "departement", + "dept", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_numero_departement = NumeroDepartement(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _numero_departement.is_valid(val) + + +_test_values = { + True: ["75", "2A", "2b", "974", "01"], + False: ["00", "96", "101"], +} diff --git a/csv_detective/formats/code_fantoir.py b/csv_detective/formats/code_fantoir.py new file mode 100755 index 00000000..dc262fba --- /dev/null +++ b/csv_detective/formats/code_fantoir.py @@ -0,0 +1,21 @@ +from frformat import CodeFantoir + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "cadastre1", + "code fantoir", + "fantoir", +] + +_code_fantoir = CodeFantoir() + + +def _is(val): + return isinstance(val, str) and _code_fantoir.is_valid(val) + + +_test_values = { + True: ["7755A", "B150B", "ZA04C", "ZB03D"], + False: ["7755", "ZA99A"], +} diff --git a/csv_detective/detect_fields/FR/other/code_import/__init__.py b/csv_detective/formats/code_import.py old mode 100644 new mode 100755 similarity index 52% rename from csv_detective/detect_fields/FR/other/code_import/__init__.py rename to csv_detective/formats/code_import.py index 023442c4..43e39e84 --- a/csv_detective/detect_fields/FR/other/code_import/__init__.py +++ b/csv_detective/formats/code_import.py @@ -1,9 +1,17 @@ -import re - -PROPORTION = 0.9 -regex = r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" - - -def _is(val): - """Repere le code Import (ancien RNA)""" - return isinstance(val, str) and bool(re.match(regex, val)) +import re + +proportion = 0.9 +tags = ["fr"] +labels = ["code"] + +regex = 
r"^(\d{3}[SP]\d{4,10}(.\w{1,3}\d{0,5})?|\d[A-Z0-9]\d[SP]\w(\w-?\w{0,2}\d{0,6})?)$" + + +def _is(val): + return isinstance(val, str) and bool(re.match(regex, val)) + + +_test_values = { + True: ["123S1871092288"], + False: ["AA751PEE00188854", "W123456789"], +} diff --git a/csv_detective/formats/code_postal.py b/csv_detective/formats/code_postal.py new file mode 100755 index 00000000..4b0ec994 --- /dev/null +++ b/csv_detective/formats/code_postal.py @@ -0,0 +1,25 @@ +from frformat import CodePostal + +proportion = 0.9 +tags = ["fr", "geo"] +labels = [ + "code postal", + "postal code", + "postcode", + "post code", + "cp", + "codes postaux", + "location postcode", +] + +_code_postal = CodePostal() + + +def _is(val): + return isinstance(val, str) and _code_postal.is_valid(val) + + +_test_values = { + True: ["75020", "01000"], + False: ["77777", "018339"], +} diff --git a/csv_detective/formats/code_region.py b/csv_detective/formats/code_region.py new file mode 100755 index 00000000..92aff448 --- /dev/null +++ b/csv_detective/formats/code_region.py @@ -0,0 +1,22 @@ +from frformat import CodeRegion, Millesime + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "code region", + "reg", + "code insee region", + "region", +] + +_code_region = CodeRegion(Millesime.LATEST) + + +def _is(val): + return isinstance(val, str) and _code_region.is_valid(val) + + +_test_values = { + True: ["32"], + False: ["55"], +} diff --git a/csv_detective/formats/code_rna.py b/csv_detective/formats/code_rna.py new file mode 100755 index 00000000..c31b7194 --- /dev/null +++ b/csv_detective/formats/code_rna.py @@ -0,0 +1,29 @@ +from frformat import CodeRNA + +proportion = 0.9 +tags = ["fr"] +labels = [ + "code rna", + "rna", + "n° inscription association", + "identifiant association", +] + +_code_rna = CodeRNA() + + +def _is(val): + return isinstance(val, str) and _code_rna.is_valid(val) + + +_test_values = { + True: ["W751515517"], + False: [ + "W111111111111111111111111111111111111", + 
"w143788974", + "W12", + "678W23456", + "165789325", + "Wa1#89sf&h", + ], +} diff --git a/csv_detective/formats/code_waldec.py b/csv_detective/formats/code_waldec.py new file mode 100755 index 00000000..bca1dfff --- /dev/null +++ b/csv_detective/formats/code_waldec.py @@ -0,0 +1,17 @@ +import re + +proportion = 0.9 +tags = ["fr"] +labels = ["code waldec", "waldec"] + +regex = r"^W\d[\dA-Z]\d{7}$" + + +def _is(val): + return isinstance(val, str) and bool(re.match(regex, val)) + + +_test_values = { + True: ["W123456789", "W2D1234567"], + False: ["AA751PEE00188854"], +} diff --git a/csv_detective/detect_fields/FR/geo/commune/__init__.py b/csv_detective/formats/commune.py old mode 100644 new mode 100755 similarity index 60% rename from csv_detective/detect_fields/FR/geo/commune/__init__.py rename to csv_detective/formats/commune.py index e27bdf97..9037305f --- a/csv_detective/detect_fields/FR/geo/commune/__init__.py +++ b/csv_detective/formats/commune.py @@ -1,16 +1,27 @@ -from frformat import Commune, Millesime, Options - -PROPORTION = 0.9 - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, -) -_commune = Commune(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des communes""" - return isinstance(val, str) and _commune.is_valid(val) +from frformat import Commune, Millesime, Options + +proportion = 0.8 +tags = ["fr", "geo"] +labels = [ + "commune", + "ville", + "libelle commune", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_commune = Commune(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _commune.is_valid(val) + + +_test_values = { + True: ["saint denis"], + False: ["new york", "lion"], +} diff --git a/csv_detective/detect_fields/FR/other/csp_insee/__init__.py b/csv_detective/formats/csp_insee.py old mode 100644 new mode 
100755 similarity index 55% rename from csv_detective/detect_fields/FR/other/csp_insee/__init__.py rename to csv_detective/formats/csp_insee.py index f2801895..709cba9b --- a/csv_detective/detect_fields/FR/other/csp_insee/__init__.py +++ b/csv_detective/formats/csp_insee.py @@ -1,19 +1,31 @@ -from os.path import dirname, join - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 -f = open(join(dirname(__file__), "csp_insee.txt"), "r") -codes_insee = f.read().split("\n") -# removing empty str due to additionnal line in file -del codes_insee[-1] -codes_insee = set(codes_insee) -f.close() - - -def _is(val): - """Repère les csp telles que définies par l'INSEE""" - if not isinstance(val, str): - return False - val = _process_text(val) - return val in codes_insee +from os.path import dirname, join + +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = [ + "csp insee", + "csp", + "categorie socioprofessionnelle", +] + +f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r") +codes_insee = f.read().split("\n") +# removing empty str due to additionnal line in file +del codes_insee[-1] +codes_insee = set(codes_insee) +f.close() + + +def _is(val): + if not isinstance(val, str): + return False + val = _process_text(val) + return val in codes_insee + + +_test_values = { + True: ["employes de la poste"], + False: ["super-heros"], +} diff --git a/csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt b/csv_detective/formats/data/csp_insee.txt similarity index 100% rename from csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt rename to csv_detective/formats/data/csp_insee.txt diff --git a/csv_detective/detect_fields/FR/other/insee_ape700/insee_ape700.txt b/csv_detective/formats/data/insee_ape700.txt old mode 100644 new mode 100755 similarity index 100% rename from csv_detective/detect_fields/FR/other/insee_ape700/insee_ape700.txt rename to csv_detective/formats/data/insee_ape700.txt diff --git 
a/csv_detective/detect_fields/geo/iso_country_code_alpha2/iso_country_code_alpha2.txt b/csv_detective/formats/data/iso_country_code_alpha2.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_alpha2/iso_country_code_alpha2.txt rename to csv_detective/formats/data/iso_country_code_alpha2.txt diff --git a/csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt b/csv_detective/formats/data/iso_country_code_alpha3.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt rename to csv_detective/formats/data/iso_country_code_alpha3.txt diff --git a/csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt b/csv_detective/formats/data/iso_country_code_numeric.txt similarity index 100% rename from csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt rename to csv_detective/formats/data/iso_country_code_numeric.txt diff --git a/csv_detective/detect_fields/temp/date/__init__.py b/csv_detective/formats/date.py old mode 100644 new mode 100755 similarity index 72% rename from csv_detective/detect_fields/temp/date/__init__.py rename to csv_detective/formats/date.py index b6a66cfb..68f5a45b --- a/csv_detective/detect_fields/temp/date/__init__.py +++ b/csv_detective/formats/date.py @@ -1,62 +1,99 @@ -import re -from datetime import datetime - -from dateparser import parse as date_parser -from dateutil.parser import ParserError -from dateutil.parser import parse as dateutil_parser - -PROPORTION = 1 -# /!\ this is only for dates, not datetimes which are handled by other utils - - -def date_casting(val: str) -> datetime | None: - """For performance reasons, we try first with dateutil and fallback on dateparser""" - try: - return dateutil_parser(val) - except ParserError: - return date_parser(val) - except Exception: - return None - - -seps = r"[\s/\-\*_\|;.,]" -# matches JJ-MM-AAAA with any of the 
listed separators -jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace( - "SEP", seps -) -# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR -aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace( - "SEP", seps + "?" -) -# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR -string_month_pattern = ( - r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr" - r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|" - r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP" - r"([0-9]{2}$|(19|20)[0-9]{2}$)" -).replace("SEP", seps + "?") - -threshold = 0.3 - - -def _is(val): - """Renvoie True si val peut être une date, False sinon""" - # early stops, to cut processing time - if not isinstance(val, str) or len(val) > 20 or len(val) < 8: - return False - # if it's a usual date pattern - if any( - # with this syntax, if any of the first value is True, the next ones are not computed - [ - bool(re.match(jjmmaaaa_pattern, val)) - or bool(re.match(aaaammjj_pattern, val)) - or bool(re.match(string_month_pattern, val, re.IGNORECASE)) - ] - ): - return True - if sum([char.isdigit() for char in val]) / len(val) < threshold: - return False - res = date_casting(val) - if not res or res.hour or res.minute or res.second: - return False - return True +import re +from datetime import datetime + +from dateparser import parse as date_parser +from dateutil.parser import ParserError +from dateutil.parser import parse as dateutil_parser + +proportion = 1 +tags = ["temp", "type"] +SHARED_DATE_LABELS = [ + "date", + "mise à jour", + "modifie", + "maj", + "datemaj", + "update", + "created", + "modified", +] +labels = SHARED_DATE_LABELS + [ + "jour", + "periode", + "dpc", + "yyyymmdd", + "aaaammjj", +] + + +def date_casting(val: str) -> datetime | None: + """For performance reasons, we try first with dateutil and fallback on 
dateparser""" + try: + return dateutil_parser(val) + except ParserError: + return date_parser(val) + except Exception: + return None + + +threshold = 0.3 +seps = r"[\s/\-\*_\|;.,]" +# matches JJ-MM-AAAA with any of the listed separators +jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace( + "SEP", seps +) +# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR +aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace( + "SEP", seps + "?" +) +# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR +string_month_pattern = ( + r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr" + r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|" + r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP" + r"([0-9]{2}$|(19|20)[0-9]{2}$)" +).replace("SEP", seps + "?") + + +def _is(val): + # early stops, to cut processing time + if not isinstance(val, str) or len(val) > 20 or len(val) < 8: + return False + # if it's a usual date pattern + if any( + # with this syntax, if any of the first value is True, the next ones are not computed + [ + bool(re.match(jjmmaaaa_pattern, val)) + or bool(re.match(aaaammjj_pattern, val)) + or bool(re.match(string_month_pattern, val, re.IGNORECASE)) + ] + ): + return True + if sum([char.isdigit() for char in val]) / len(val) < threshold: + return False + res = date_casting(val) + if not res or res.hour or res.minute or res.second: + return False + return True + + +_test_values = { + True: [ + "1960-08-07", + "12/02/2007", + "15 jan 1985", + "15 décembre 1985", + "02 05 2003", + "20030502", + "1993-12/02", + ], + False: [ + "1993-1993-1993", + "39-10-1993", + "19-15-1993", + "15 tambour 1985", + "12152003", + "20031512", + "02052003", + ], +} diff --git a/csv_detective/formats/date_fr.py b/csv_detective/formats/date_fr.py new file mode 100755 index 00000000..2ba04928 --- /dev/null +++ 
b/csv_detective/formats/date_fr.py @@ -0,0 +1,22 @@ +import re + +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr", "temp"] +labels = ["date"] + +pattern = ( + r"^(0?[1-9]|[12][0-9]|3[01])[ \-/](janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre" + r"|octobre|novembre|decembre)[ \-/]\d{4}$" +) + + +def _is(val): + return isinstance(val, str) and bool(re.match(pattern, _process_text(val))) + + +_test_values = { + True: ["13 février 1996", "15 decembre 2024"], + False: ["44 march 2025"], +} diff --git a/csv_detective/detect_fields/temp/datetime_aware/__init__.py b/csv_detective/formats/datetime_aware.py similarity index 61% rename from csv_detective/detect_fields/temp/datetime_aware/__init__.py rename to csv_detective/formats/datetime_aware.py index 5f7470a6..426e9e48 100755 --- a/csv_detective/detect_fields/temp/datetime_aware/__init__.py +++ b/csv_detective/formats/datetime_aware.py @@ -1,12 +1,12 @@ import re -from typing import Any -from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting +from csv_detective.formats.date import SHARED_DATE_LABELS, aaaammjj_pattern, date_casting -PROPORTION = 1 -threshold = 0.7 +proportion = 1 +tags = ["temp", "type"] +labels = SHARED_DATE_LABELS + ["datetime", "timestamp"] -# matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR +threshold = 0.7 pat = ( aaaammjj_pattern.replace("$", "") + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})" @@ -14,8 +14,7 @@ ) -def _is(val: Any | None) -> bool: - """Detects timezone-aware datetimes only""" +def _is(val): # early stops, to cut processing time # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack @@ -32,3 +31,15 @@ def _is(val: Any | None) -> bool: and bool(res.hour or res.minute or res.second or res.microsecond) and bool(res.tzinfo) 
) + + +_test_values = { + True: [ + "2021-06-22 10:20:10-04:00", + "2030-06-22 00:00:00.0028+02:00", + "2000-12-21 10:20:10.1Z", + "2024-12-19T10:53:36.428000+00:00", + "1996/06/22 10:20:10 GMT", + ], + False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], +} diff --git a/csv_detective/detect_fields/temp/datetime_naive/__init__.py b/csv_detective/formats/datetime_naive.py similarity index 62% rename from csv_detective/detect_fields/temp/datetime_naive/__init__.py rename to csv_detective/formats/datetime_naive.py index 464cb2ad..28b94d98 100755 --- a/csv_detective/detect_fields/temp/datetime_naive/__init__.py +++ b/csv_detective/formats/datetime_naive.py @@ -1,9 +1,11 @@ import re from typing import Any -from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting +from csv_detective.formats.date import aaaammjj_pattern, date_casting +from csv_detective.formats.datetime_aware import labels # noqa -PROPORTION = 1 +proportion = 1 +tags = ["temp", "type"] threshold = 0.7 # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR @@ -27,3 +29,20 @@ def _is(val: Any | None) -> bool: return False res = date_casting(val) return res is not None and not bool(res.tzinfo) + + +_test_values = { + True: [ + "2021-06-22 10:20:10", + "2030/06-22 00:00:00", + "2030/06/22 00:00:00.0028", + ], + False: [ + "2021-06-22T30:20:10", + "Sun, 06 Nov 1994 08:49:37 GMT", + "2021-06-44 10:20:10+02:00", + "1999-12-01T00:00:00Z", + "2021-06-44", + "15 décembre 1985", + ], +} diff --git a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py b/csv_detective/formats/datetime_rfc822.py old mode 100644 new mode 100755 similarity index 61% rename from csv_detective/detect_fields/temp/datetime_rfc822/__init__.py rename to csv_detective/formats/datetime_rfc822.py index ea2f6078..4cd9d8fd --- a/csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +++ 
b/csv_detective/formats/datetime_rfc822.py @@ -1,18 +1,24 @@ -import re - -PROPORTION = 1 - - -def _is(val): - """Renvoie True si val peut être une date au format rfc822, False sinon - Exemple: Tue, 19 Dec 2023 15:30:45 +0000""" - - return isinstance(val, str) and bool( - re.match( - r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} " - r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) " - r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$", - val.lower(), - re.IGNORECASE, - ) - ) +import re + +from csv_detective.formats.datetime_aware import labels # noqa + +proportion = 1 +tags = ["temp", "type"] + + +def _is(val): + return isinstance(val, str) and bool( + re.match( + r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} " + r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) " + r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$", + val.lower(), + re.IGNORECASE, + ) + ) + + +_test_values = { + True: ["Sun, 06 Nov 1994 08:49:37 GMT"], + False: ["2021-06-22T10:20:10"], +} diff --git a/csv_detective/formats/departement.py b/csv_detective/formats/departement.py new file mode 100755 index 00000000..e25d2224 --- /dev/null +++ b/csv_detective/formats/departement.py @@ -0,0 +1,37 @@ +from frformat import Departement, Millesime, Options + +proportion = 0.9 +tags = ["fr", "geo"] +labels = [ + "departement", + "libelle du departement", + "deplib", + "nom dept", + "dept", + "libdepartement", + "nom departement", + "libelle dep", + "libelle departement", + "lb departements", + "dep libusage", + "lb departement", + "nom dep", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_departement = Departement(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _departement.is_valid(val) + + +_test_values = { + True: ["essonne"], + False: ["alabama", "auvergne"], +} diff --git a/csv_detective/formats/email.py 
b/csv_detective/formats/email.py new file mode 100755 index 00000000..87e98f1d --- /dev/null +++ b/csv_detective/formats/email.py @@ -0,0 +1,28 @@ +import re + +proportion = 0.9 +labels = [ + "email", + "mail", + "courriel", + "contact", + "mel", + "lieucourriel", + "coordinates.emailcontact", + "e mail", + "mo mail", + "adresse mail", + "adresse email", +] + + +def _is(val): + return isinstance(val, str) and bool( + re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE) + ) + + +_test_values = { + True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"], + False: ["cdo@@gouv.sfd"], +} diff --git a/csv_detective/detect_fields/other/float/__init__.py b/csv_detective/formats/float.py old mode 100644 new mode 100755 similarity index 67% rename from csv_detective/detect_fields/other/float/__init__.py rename to csv_detective/formats/float.py index f11606e7..46b33e22 --- a/csv_detective/detect_fields/other/float/__init__.py +++ b/csv_detective/formats/float.py @@ -1,21 +1,29 @@ -PROPORTION = 1 - - -def float_casting(val: str) -> float: - return float(val.replace(",", ".")) - - -def _is(val): - """Detects floats, assuming that tables will not have scientific - notations (3e6) or "+" in the string. "-" is still accepted.""" - try: - if ( - not isinstance(val, str) - or any([k in val for k in ["_", "+", "e", "E"]]) - or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","]) - ): - return False - float_casting(val) - return True - except ValueError: - return False +proportion = 1 +tags = ["type"] +labels = ["part", "ratio", "taux"] + + +def float_casting(val: str) -> float: + return float(val.replace(",", ".")) + + +def _is(val): + """Detects floats, assuming that tables will not have scientific + notations (3e6) or "+" in the string. 
"-" is still accepted.""" + try: + if ( + not isinstance(val, str) + or any([k in val for k in ["_", "+", "e", "E"]]) + or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","]) + ): + return False + float_casting(val) + return True + except ValueError: + return False + + +_test_values = { + True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], + False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], +} diff --git a/csv_detective/formats/geojson.py b/csv_detective/formats/geojson.py new file mode 100755 index 00000000..48d28712 --- /dev/null +++ b/csv_detective/formats/geojson.py @@ -0,0 +1,36 @@ +import json + +proportion = 1 +tags = ["geo"] +labels = [ + "json geojson", + "json", + "geojson", + "geo shape", + "geom", + "geometry", + "geo shape", + "geoshape", +] + + +def _is(val) -> bool: + try: + j = json.loads(val) + if isinstance(j, dict): + if "type" in j and "coordinates" in j: + return True + if "geometry" in j and "coordinates" in j["geometry"]: + return True + except Exception: + pass + return False + + +_test_values = { + True: [ + '{"coordinates": [45.783753, 3.049342], "type": "63870"}', + '{"geometry": {"coordinates": [45.783753, 3.049342]}}', + ], + False: ['{"pomme": "fruit", "reponse": 42}'], +} diff --git a/csv_detective/detect_fields/FR/other/insee_ape700/__init__.py b/csv_detective/formats/insee_ape700.py old mode 100644 new mode 100755 similarity index 51% rename from csv_detective/detect_fields/FR/other/insee_ape700/__init__.py rename to csv_detective/formats/insee_ape700.py index 2accd788..193fd963 --- a/csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +++ b/csv_detective/formats/insee_ape700.py @@ -1,19 +1,31 @@ -from os.path import dirname, join - -from csv_detective.parsing.text import _process_text - -PROPORTION = 1 -f = open(join(dirname(__file__), "insee_ape700.txt"), "r") -condes_insee_ape = f.read().split("\n") -# removing empty str due to additionnal line in file -del 
# Load the reference codes shipped in data/insee_ape700.txt (one code per line).
# The context manager closes the handle even if reading fails, and empty lines
# are filtered out instead of blindly deleting the last entry, which would drop
# a valid code if the file ever lost its trailing newline.
with open(join(dirname(__file__), "data", "insee_ape700.txt"), "r") as f:
    condes_insee_ape = {code for code in f.read().split("\n") if code}


def _is(val):
    """Return True if ``val`` is one of the codes listed in insee_ape700.txt.

    Args:
        val: the cell value to check; anything that is not a ``str`` is rejected.

    Returns:
        bool: True when the value, normalised with ``_process_text`` and
        upper-cased, is in the reference set.
    """
    if not isinstance(val, str):
        return False
    val = _process_text(val).upper()
    return val in condes_insee_ape
ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_canton = Canton(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _canton.is_valid(val) + + +_test_values = { + True: ["nantua"], + False: ["california"], +} diff --git a/csv_detective/detect_fields/other/int/__init__.py b/csv_detective/formats/int.py old mode 100644 new mode 100755 similarity index 63% rename from csv_detective/detect_fields/other/int/__init__.py rename to csv_detective/formats/int.py index 37b82c83..d2c72062 --- a/csv_detective/detect_fields/other/int/__init__.py +++ b/csv_detective/formats/int.py @@ -1,16 +1,23 @@ -PROPORTION = 1 - - -def _is(val): - """Detects integers""" - if ( - not isinstance(val, str) - or any([v in val for v in [".", "_", "+"]]) - or (val.startswith("0") and len(val) > 1) - ): - return False - try: - int(val) - return True - except ValueError: - return False +labels = ["nb", "nombre", "nbre"] +tag = ["type"] + + +def _is(val): + """Detects integers""" + if ( + not isinstance(val, str) + or any([v in val for v in [".", "_", "+"]]) + or (val.startswith("0") and len(val) > 1) + ): + return False + try: + int(val) + return True + except ValueError: + return False + + +_test_values = { + True: ["1", "0", "1764", "-24"], + False: ["01053", "1.2", "123_456", "+35"], +} diff --git a/csv_detective/formats/iso_country_code_alpha2.py b/csv_detective/formats/iso_country_code_alpha2.py new file mode 100755 index 00000000..ce7f5e4d --- /dev/null +++ b/csv_detective/formats/iso_country_code_alpha2.py @@ -0,0 +1,30 @@ +import re +from os.path import dirname, join + +proportion = 1 +tags = ["geo"] +labels = [ + "iso country code", + "code pays", + "pays", + "country", + "nation", + "pays code", + "code pays (iso)", +] + +with open(join(dirname(__file__), "data", "iso_country_code_alpha2.txt"), "r") as iofile: + liste_pays = iofile.read().split("\n") +liste_pays = set(liste_pays) + + +def 
# Build the lookup set once at import time, like the alpha-2 and numeric
# modules do, instead of rebuilding it on every call to _is.
with open(join(dirname(__file__), "data", "iso_country_code_alpha3.txt"), "r") as iofile:
    liste_pays = set(iofile.read().split("\n"))


def _is(val):
    """Return True if ``val`` can be an ISO alpha-3 country code, False otherwise.

    Args:
        val: the cell value to check; anything that is not a ``str`` is rejected.

    Returns:
        bool: True when ``val`` is three upper-case ASCII letters and is listed
        in data/iso_country_code_alpha3.txt.
    """
    # cheap shape check first, so that most values never reach the set lookup
    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
        return False
    return val in liste_pays
a/csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py b/csv_detective/formats/jour_de_la_semaine.py old mode 100644 new mode 100755 similarity index 50% rename from csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py rename to csv_detective/formats/jour_de_la_semaine.py index cc711605..99de99ae --- a/csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +++ b/csv_detective/formats/jour_de_la_semaine.py @@ -1,25 +1,41 @@ -PROPORTION = 1 -jours = { - "lundi", - "mardi", - "mercredi", - "jeudi", - "vendredi", - "samedi", - "dimanche", - "lun", - "mar", - "mer", - "jeu", - "ven", - "sam", - "dim", -} - - -def _is(val): - """Renvoie True si les champs peuvent être des jours de la semaine""" - if not isinstance(val, str): - return False - val = val.lower() - return val in jours +proportion = 0.8 +tags = ["fr", "temp"] +labels = [ + "jour semaine", + "type jour", + "jour de la semaine", + "saufjour", + "nomjour", + "jour", + "jour de fermeture", +] + +jours = { + "lundi", + "mardi", + "mercredi", + "jeudi", + "vendredi", + "samedi", + "dimanche", + "lun", + "mar", + "mer", + "jeu", + "ven", + "sam", + "dim", +} + + +def _is(val): + if not isinstance(val, str): + return False + val = val.lower() + return val in jours + + +_test_values = { + True: ["lundi"], + False: ["jour de la biere"], +} diff --git a/csv_detective/detect_fields/other/json/__init__.py b/csv_detective/formats/json.py old mode 100644 new mode 100755 similarity index 60% rename from csv_detective/detect_fields/other/json/__init__.py rename to csv_detective/formats/json.py index e4051b1f..634c4adb --- a/csv_detective/detect_fields/other/json/__init__.py +++ b/csv_detective/formats/json.py @@ -1,14 +1,20 @@ -import json -from json import JSONDecodeError - -PROPORTION = 1 - - -def _is(val): - """Detects json""" - try: - loaded = json.loads(val) - # we don't want to consider integers for instance - return isinstance(loaded, (list, dict)) - except (JSONDecodeError, 
TypeError): - return False +import json +from json import JSONDecodeError + +proportion = 1 +tags = ["type"] + + +def _is(val): + try: + loaded = json.loads(val) + # we don't want to consider integers for instance + return isinstance(loaded, (list, dict)) + except (JSONDecodeError, TypeError): + return False + + +_test_values = { + True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], + False: ["5", '{"zefib":', '{"a"}'], +} diff --git a/csv_detective/formats/latitude_l93.py b/csv_detective/formats/latitude_l93.py new file mode 100755 index 00000000..4866c1bb --- /dev/null +++ b/csv_detective/formats/latitude_l93.py @@ -0,0 +1,48 @@ +from frformat import LatitudeL93 + +from csv_detective.formats.float import _is as is_float +from csv_detective.formats.float import float_casting + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "y l93", + "coordonnee y", + "latitude lb93", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", +] + +_latitudel93 = LatitudeL93() + + +def _is(val): + try: + if isinstance(val, str) and is_float(val): + return _latitudel93.is_valid(float_casting(val)) + + return False + + except (ValueError, OverflowError): + return False + + +_test_values = { + True: ["6037008", "7123528.5", "7124528,5"], + False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], +} diff --git a/csv_detective/formats/latitude_wgs.py b/csv_detective/formats/latitude_wgs.py new file mode 100755 index 00000000..58701e7a --- /dev/null +++ b/csv_detective/formats/latitude_wgs.py @@ -0,0 +1,42 @@ +from csv_detective.formats.float import _is as is_float + +proportion = 1 +tags = ["geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", 
+ "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= -90 and float(val) <= 90 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["43.2", "-22"], + False: ["100"], +} diff --git a/csv_detective/formats/latitude_wgs_fr_metropole.py b/csv_detective/formats/latitude_wgs_fr_metropole.py new file mode 100755 index 00000000..d7489831 --- /dev/null +++ b/csv_detective/formats/latitude_wgs_fr_metropole.py @@ -0,0 +1,42 @@ +from csv_detective.formats.float import _is as is_float + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "latitude", + "lat", + "y", + "yf", + "yd", + "coordonnee y", + "coord y", + "ycoord", + "geocodage y gps", + "location latitude", + "ylatitude", + "ylat", + "latitude (y)", + "latitudeorg", + "coordinates.latitude", + "googlemap latitude", + "latitudelieu", + "latitude googlemap", + "latitude wgs84", + "y wgs84", + "latitude (wgs84)", +] + + +def _is(val): + try: + return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3 + except ValueError: + return False + except OverflowError: + return False + + +_test_values = { + True: ["42.5"], + False: ["22.5", "62.5"], +} diff --git a/csv_detective/formats/latlon_wgs.py b/csv_detective/formats/latlon_wgs.py new file mode 100755 index 00000000..8a486d03 --- /dev/null +++ b/csv_detective/formats/latlon_wgs.py @@ -0,0 +1,53 @@ +from csv_detective.formats.latitude_wgs import _is as is_lat +from csv_detective.formats.longitude_wgs import _is as is_lon + +proportion = 1 +tags = ["geo"] + +SHARED_COORDS_LABELS = [ + "ban", + "coordinates", + "coordonnees", + "coordonnees insee", + "geo", + "geopoint", + "geoloc", + "geolocalisation", + "geom", + "geometry", + "gps", + "localisation", + "point", + "position", + "wgs84", +] + +specific = [ + "latlon", + 
"lat lon", + "x y", + "xy", +] + +# we aim wide to catch exact matches if possible for the highest possible score +labels = ( + SHARED_COORDS_LABELS + + specific + + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]] +) + + +def _is(val): + if not isinstance(val, str) or val.count(",") != 1: + return False + lat, lon = val.split(",") + # handling [lat,lon] + if lat.startswith("[") and lon.endswith("]"): + lat, lon = lat[1:], lon[:-1] + return is_lat(lat) and is_lon(lon.replace(" ", "")) + + +_test_values = { + True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], + False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], +} diff --git a/csv_detective/formats/longitude_l93.py b/csv_detective/formats/longitude_l93.py new file mode 100755 index 00000000..d44d812a --- /dev/null +++ b/csv_detective/formats/longitude_l93.py @@ -0,0 +1,39 @@ +from frformat import LongitudeL93 + +from csv_detective.formats.float import _is as is_float +from csv_detective.formats.float import float_casting + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + "xlongitude", + "lng", + "xlong", + "x", + "xf", + "xd", +] + +_longitudel93 = LongitudeL93() + + +def _is(val): + try: + if isinstance(val, str) and is_float(val): + return _longitudel93.is_valid(float_casting(val)) + + return False + + except (ValueError, OverflowError): + return False + + +_test_values = { + True: ["0", "-154", "1265783,45", "34723.4"], + False: ["1456669.8", "-776225", "346_3214"], +} diff --git a/csv_detective/formats/longitude_wgs.py b/csv_detective/formats/longitude_wgs.py new file mode 100755 index 00000000..b0ded2e1 --- /dev/null +++ b/csv_detective/formats/longitude_wgs.py @@ -0,0 +1,32 @@ +from csv_detective.formats.float import _is as is_float + +proportion = 1 +tags = ["geo"] +labels = [ + "longitude", + "lon", + "long", + "geocodage x gps", + "location longitude", + 
# NOTE(review): the single-axis modules label longitude as "x" and latitude as
# "y", so "y x" for a lon/lat pair looks swapped with latlon_wgs ("x y") —
# left unchanged here, to be confirmed.
specific = [
    "lonlat",
    "lon lat",
    "y x",
    "yx",
]

# we aim wide to catch exact matches if possible for the highest possible score
# Bound to "labels" (it was "words"), the name used by latlon_wgs and every
# other format module in this diff.
labels = (
    SHARED_COORDS_LABELS
    + specific
    + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
)
is_lat(lat.replace(" ", "")) + + +_test_values = { + True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], + False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], +} diff --git a/csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py b/csv_detective/formats/mois_de_lannee.py old mode 100644 new mode 100755 similarity index 74% rename from csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py rename to csv_detective/formats/mois_de_lannee.py index 26a7449b..d0320bdd --- a/csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +++ b/csv_detective/formats/mois_de_lannee.py @@ -1,39 +1,48 @@ -from unidecode import unidecode - -PROPORTION = 1 -mois = { - "janvier", - "fevrier", - "mars", - "avril", - "mai", - "juin", - "juillet", - "aout", - "septembre", - "octobre", - "novembre", - "decembre", - "jan", - "fev", - "mar", - "avr", - "mai", - "jun", - "jui", - "juil", - "aou", - "sep", - "sept", - "oct", - "nov", - "dec", -} - - -def _is(val): - """Renvoie True si les champs peuvent être des mois de l'année""" - if not isinstance(val, str): - return False - val = unidecode(val.lower()) - return val in mois +from unidecode import unidecode + +proportion = 1 +tags = ["fr", "temp"] +labels = ["mois", "month"] + +mois = { + "janvier", + "fevrier", + "mars", + "avril", + "mai", + "juin", + "juillet", + "aout", + "septembre", + "octobre", + "novembre", + "decembre", + "jan", + "fev", + "mar", + "avr", + "mai", + "jun", + "jui", + "juil", + "aou", + "sep", + "sept", + "oct", + "nov", + "dec", +} + + +def _is(val): + """Renvoie True si les champs peuvent être des mois de l'année""" + if not isinstance(val, str): + return False + val = unidecode(val.lower()) + return val in mois + + +_test_values = { + True: ["JUIN", "décembre"], + False: ["november"], +} diff --git a/csv_detective/formats/money.py b/csv_detective/formats/money.py new file mode 100755 index 00000000..81bbbc58 --- /dev/null +++ b/csv_detective/formats/money.py @@ 
-0,0 +1,18 @@ +from csv_detective.formats.float import _is as is_float + +proportion = 0.8 +labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"] + +currencies = {"€", "$", "£", "¥"} + + +def _is(val): + if not isinstance(val, str) or val[-1] not in currencies: + return False + return is_float(val[:-1]) + + +_test_values = { + True: ["120€", "-20.2$"], + False: ["200", "100 euros"], +} diff --git a/csv_detective/formats/mongo_object_id.py b/csv_detective/formats/mongo_object_id.py new file mode 100755 index 00000000..43c4ceb9 --- /dev/null +++ b/csv_detective/formats/mongo_object_id.py @@ -0,0 +1,14 @@ +import re + +proportion = 0.8 +labels = ["id", "objectid"] + + +def _is(val): + return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val)) + + +_test_values = { + True: ["62320e50f981bc2b57bcc044"], + False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], +} diff --git a/csv_detective/formats/pays.py b/csv_detective/formats/pays.py new file mode 100755 index 00000000..e52d778b --- /dev/null +++ b/csv_detective/formats/pays.py @@ -0,0 +1,35 @@ +from frformat import Millesime, Options, Pays + +proportion = 0.6 +tags = ["fr", "geo"] +labels = [ + "pays", + "payslieu", + "paysorg", + "country", + "pays lib", + "lieupays", + "pays beneficiaire", + "nom du pays", + "journey start country", + "libelle pays", + "journey end country", +] + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, +) +_pays = Pays(Millesime.LATEST, _options) + + +def _is(val): + return isinstance(val, str) and _pays.is_valid(val) + + +_test_values = { + True: ["france", "italie"], + False: ["amerique", "paris"], +} diff --git a/csv_detective/formats/percent.py b/csv_detective/formats/percent.py new file mode 100755 index 00000000..31529e7c --- /dev/null +++ b/csv_detective/formats/percent.py @@ -0,0 +1,16 @@ +from csv_detective.formats.float import _is as is_float + 
+proportion = 0.8 +labels = [] + + +def _is(val): + if not isinstance(val, str) or val[-1] != "%": + return False + return is_float(val[:-1]) + + +_test_values = { + True: ["120%", "-20.2%"], + False: ["200", "100 pourcents"], +} diff --git a/csv_detective/detect_fields/FR/geo/region/__init__.py b/csv_detective/formats/region.py old mode 100644 new mode 100755 similarity index 71% rename from csv_detective/detect_fields/FR/geo/region/__init__.py rename to csv_detective/formats/region.py index dbcbc8db..f336d6fc --- a/csv_detective/detect_fields/FR/geo/region/__init__.py +++ b/csv_detective/formats/region.py @@ -1,50 +1,70 @@ -from frformat import Millesime, Options, Region - -PROPORTION = 1 - -_extra_valid_values_set = frozenset( - { - "alsace", - "aquitaine", - "ara", - "aura", - "auvergne", - "auvergne et rhone alpes", - "basse normandie", - "bfc", - "bourgogne", - "bourgogne et franche comte", - "centre", - "champagne ardenne", - "franche comte", - "ge", - "haute normandie", - "hdf", - "languedoc roussillon", - "limousin", - "lorraine", - "midi pyrenees", - "nord pas de calais", - "npdc", - "paca", - "picardie", - "poitou charentes", - "reunion", - "rhone alpes", - } -) - - -_options = Options( - ignore_case=True, - ignore_accents=True, - replace_non_alphanumeric_with_space=True, - ignore_extra_whitespace=True, - extra_valid_values=_extra_valid_values_set, -) -_region = Region(Millesime.LATEST, _options) - - -def _is(val): - """Match avec le nom des regions""" - return isinstance(val, str) and _region.is_valid(val) +from frformat import Millesime, Options, Region + +proportion = 1 +tags = ["fr", "geo"] +labels = [ + "region", + "libelle region", + "nom region", + "libelle reg", + "nom reg", + "reg libusage", + "nom de la region", + "regionorg", + "regionlieu", + "reg", + "nom officiel region", +] + +_extra_valid_values_set = frozenset( + { + "alsace", + "aquitaine", + "ara", + "aura", + "auvergne", + "auvergne et rhone alpes", + "basse normandie", + "bfc", + 
"bourgogne", + "bourgogne et franche comte", + "centre", + "champagne ardenne", + "franche comte", + "ge", + "haute normandie", + "hdf", + "languedoc roussillon", + "limousin", + "lorraine", + "midi pyrenees", + "nord pas de calais", + "npdc", + "paca", + "picardie", + "poitou charentes", + "reunion", + "rhone alpes", + } +) + + +_options = Options( + ignore_case=True, + ignore_accents=True, + replace_non_alphanumeric_with_space=True, + ignore_extra_whitespace=True, + extra_valid_values=_extra_valid_values_set, +) +_region = Region(Millesime.LATEST, _options) + + +def _is(val): + """Match avec le nom des regions""" + return isinstance(val, str) and _region.is_valid(val) + + +_test_values = { + True: ["bretagne", "ile-de-france"], + False: ["baviere", "overgne"], +} diff --git a/csv_detective/formats/sexe.py b/csv_detective/formats/sexe.py new file mode 100755 index 00000000..f86e5791 --- /dev/null +++ b/csv_detective/formats/sexe.py @@ -0,0 +1,17 @@ +from csv_detective.parsing.text import _process_text + +proportion = 1 +tags = ["fr"] +labels = ["sexe", "sex", "civilite", "genre", "id sexe"] + + +def _is(val): + if not isinstance(val, str): + return False + return _process_text(val) in {"homme", "femme", "h", "f", "m", "masculin", "feminin"} + + +_test_values = { + True: ["femme", "H"], + False: ["adulte"], +} diff --git a/csv_detective/detect_fields/FR/other/siren/__init__.py b/csv_detective/formats/siren.py old mode 100644 new mode 100755 similarity index 56% rename from csv_detective/detect_fields/FR/other/siren/__init__.py rename to csv_detective/formats/siren.py index 58f4426e..175c948f --- a/csv_detective/detect_fields/FR/other/siren/__init__.py +++ b/csv_detective/formats/siren.py @@ -1,20 +1,37 @@ -import re - -PROPORTION = 0.9 - - -def _is(val): - """Repere les codes SIREN""" - if not isinstance(val, str): - return False - val = val.replace(" ", "") - if not bool(re.match(r"^[0-9]{9}$", val)): - return False - # Vérification par clé propre aux codes siren 
- cle = 0 - pair = False - for x in val: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - return cle % 10 == 0 +import re + +proportion = 0.9 +tags = ["fr"] +labels = [ + "siren", + "siren organisme designe", + "siren organisme designant", + "n° siren", + "siren organisme", + "siren titulaire", + "numero siren", + "epci", +] + + +def _is(val): + """Repere les codes SIREN""" + if not isinstance(val, str): + return False + val = val.replace(" ", "") + if not bool(re.match(r"^[0-9]{9}$", val)): + return False + # Vérification par clé propre aux codes siren + cle = 0 + pair = False + for x in val: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + return cle % 10 == 0 + + +_test_values = { + True: ["552 100 554", "552100554"], + False: ["42"], +} diff --git a/csv_detective/detect_fields/FR/other/siret/__init__.py b/csv_detective/formats/siret.py old mode 100644 new mode 100755 similarity index 69% rename from csv_detective/detect_fields/FR/other/siret/__init__.py rename to csv_detective/formats/siret.py index e8ac0e98..7cee4a78 --- a/csv_detective/detect_fields/FR/other/siret/__init__.py +++ b/csv_detective/formats/siret.py @@ -1,31 +1,47 @@ -import re - -PROPORTION = 0.8 - - -def _is(val): - """Détection des identifiants SIRET (SIRENE)""" - if not isinstance(val, str): - return False - val = val.replace(" ", "") - if not bool(re.match(r"^[0-9]{14}$", val)): - return False - - # Vérification par clé de luhn du SIREN - cle = 0 - pair = False - for x in val[:9]: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - if cle % 10 != 0: - return cle % 10 == 0 - - # Vérification par clé de luhn du SIRET - cle = 0 - pair = len(val) % 2 == 0 - for x in val: - y = int(x) * (1 + pair) - cle += y // 10 + y % 10 - pair = not pair - return cle % 10 == 0 +import re + +proportion = 0.8 +tags = ["fr"] +labels = [ + "siret", + "siret d", + "num siret", + "siretacheteur", + "n° siret", + "coll siret", + "epci", +] + + +def 
_is(val): + """Détection des identifiants SIRET (SIRENE)""" + if not isinstance(val, str): + return False + val = val.replace(" ", "") + if not bool(re.match(r"^[0-9]{14}$", val)): + return False + + # Vérification par clé de luhn du SIREN + cle = 0 + pair = False + for x in val[:9]: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + if cle % 10 != 0: + return cle % 10 == 0 + + # Vérification par clé de luhn du SIRET + cle = 0 + pair = len(val) % 2 == 0 + for x in val: + y = int(x) * (1 + pair) + cle += y // 10 + y % 10 + pair = not pair + return cle % 10 == 0 + + +_test_values = { + True: ["13002526500013", "130 025 265 00013"], + False: ["13002526500012"], +} diff --git a/csv_detective/formats/tel_fr.py b/csv_detective/formats/tel_fr.py new file mode 100755 index 00000000..bf5028a7 --- /dev/null +++ b/csv_detective/formats/tel_fr.py @@ -0,0 +1,36 @@ +import re + +proportion = 0.7 +tags = ["fr"] +labels = [ + "telephone", + "tel", + "tel1", + "tel2", + "phone", + "num tel", + "tel mob", + "telephone sav", + "telephone1", + "coordinates.phone", + "telephone du lieu", +] + + +def _is(val): + if not isinstance(val, str): + return False + + if len(val) < 10: + return False + + val = val.replace(".", "").replace("-", "").replace(" ", "") + + match_1 = bool(re.match(r"^(0|\+33|0033)?[0-9]{9}$", val)) + return match_1 + + +_test_values = { + True: ["0134643467"], + False: ["6625388263", "01288398"], +} diff --git a/csv_detective/formats/uai.py b/csv_detective/formats/uai.py new file mode 100755 index 00000000..f7dcf6d7 --- /dev/null +++ b/csv_detective/formats/uai.py @@ -0,0 +1,36 @@ +import re + +proportion = 0.8 +tags = ["fr"] +labels = [ + "uai", + "code etablissement", + "code uai", + "uai - identifiant", + "numero uai", + "rne", + "numero de l'etablissement", + "code rne", + "codeetab", + "code uai de l'etablissement", + "ref uai", + "cd rne", + "numerouai", + "numero d etablissement", + "code etablissement", + "numero etablissement", +] + + 
+def _is(val): + if not isinstance(val, str) or len(val) != 8: + return False + if not bool(re.match(r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$", val)): + return False + return True + + +_test_values = { + True: ["0422170F"], + False: ["04292E"], +} diff --git a/csv_detective/formats/url.py b/csv_detective/formats/url.py new file mode 100755 index 00000000..de8c0b2c --- /dev/null +++ b/csv_detective/formats/url.py @@ -0,0 +1,45 @@ +import re + +proportion = 1 +labels = [ + "url", + "url source", + "site web", + "source url", + "site internet", + "remote url", + "web", + "site", + "lien", + "site data", + "lien url", + "lien vers le fichier", + "sitweb", + "interneturl", +] + +pattern = re.compile( + r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})" + r"(/[A-Za-z0-9._~:/?#[@!$&'()*+,;=%-]*)?$" +) + + +def _is(val): + if not isinstance(val, str): + return False + return bool(pattern.match(val)) + + +_test_values = { + True: [ + "www.data.gouv.fr", + "http://data.gouv.fr", + "https://www.youtube.com/@data-gouv-fr", + ( + "https://tabular-api.data.gouv.fr/api/resources/" + "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" + "?score__greater=0.9&decompte__exact=13" + ), + ], + False: ["tmp@data.gouv.fr"], +} diff --git a/csv_detective/formats/username.py b/csv_detective/formats/username.py new file mode 100755 index 00000000..4e4952ad --- /dev/null +++ b/csv_detective/formats/username.py @@ -0,0 +1,14 @@ +import re + +proportion = 1 +labels = ["account", "username", "user"] + + +def _is(val): + return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val)) + + +_test_values = { + True: ["@accueil1"], + False: ["adresse@mail"], +} diff --git a/csv_detective/formats/uuid.py b/csv_detective/formats/uuid.py new file mode 100755 index 00000000..7aeaa017 --- /dev/null +++ b/csv_detective/formats/uuid.py @@ -0,0 +1,16 @@ +import re + +proportion = 0.8 +labels = ["id", "identifiant"] + + +def _is(val) -> bool: + return isinstance(val, str) and 
bool( + re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val) + ) + + +_test_values = { + True: ["884762be-51f3-44c3-b811-1e14c5d89262"], + False: ["0610928327"], +} diff --git a/csv_detective/formats/year.py b/csv_detective/formats/year.py new file mode 100755 index 00000000..4de3dd3c --- /dev/null +++ b/csv_detective/formats/year.py @@ -0,0 +1,28 @@ +proportion = 1 +tags = ["temp"] +labels = [ + "year", + "annee", + "annee depot", + "an nais", + "exercice", + "data year", + "annee de publication", + "exercice comptable", + "annee de naissance", + "annee ouverture", +] + + +def _is(val): + try: + val = int(val) + except ValueError: + return False + return (1800 <= val) and (val <= 2100) + + +_test_values = { + True: ["2015"], + False: ["20166", "123"], +} diff --git a/csv_detective/load_tests.py b/csv_detective/load_tests.py deleted file mode 100755 index 36a5c032..00000000 --- a/csv_detective/load_tests.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -from csv_detective import detect_fields, detect_labels # noqa - - -def get_all_packages(detect_type) -> list: - root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type - modules = [] - for dirpath, _, filenames in os.walk(root_dir): - for filename in filenames: - file = os.path.join(dirpath, filename).replace(root_dir, "") - if file.endswith("__init__.py"): - module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1] - if module: - modules.append(detect_type + module) - return modules - - -def return_all_tests( - user_input_tests: str | list, - detect_type: str, -) -> dict[str, dict]: - """ - returns all tests that have a method _is and are listed in the user_input_tests - the function can select a sub_package from csv_detective - user_input_tests may look like this: - - "ALL": all possible tests are made - - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"): - this specifc (group of) 
test(s) only - - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip - specific (groups of) tests by add "-" at the start (e.g "-temp.date") - """ - assert detect_type in ["detect_fields", "detect_labels"] - all_packages = get_all_packages(detect_type=detect_type) - - if isinstance(user_input_tests, str): - user_input_tests = [user_input_tests] - if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests): - tests_to_do = [detect_type] - else: - tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"] - tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"] - # removing specified (groups of) tests - all_tests = [ - # this is why we need to import detect_fields/labels - eval(x) - for x in all_packages - if any([y == x[: len(y)] for y in tests_to_do]) - and all([y != x[: len(y)] for y in tests_skipped]) - ] - return { - test.__name__.split(".")[-1]: { - "func": test._is, - "prop": test.PROPORTION, - "module": test, - } - for test in all_tests - if "_is" in dir(test) - } diff --git a/csv_detective/output/__init__.py b/csv_detective/output/__init__.py index 9a0271b6..c5f90a92 100755 --- a/csv_detective/output/__init__.py +++ b/csv_detective/output/__init__.py @@ -4,12 +4,11 @@ import pandas as pd +from csv_detective.output.dataframe import cast_df_chunks +from csv_detective.output.profile import create_profile +from csv_detective.output.schema import generate_table_schema from csv_detective.utils import is_url -from .dataframe import cast_df_chunks -from .profile import create_profile -from .schema import generate_table_schema - def generate_output( table: pd.DataFrame, diff --git a/csv_detective/output/dataframe.py b/csv_detective/output/dataframe.py index 50b6909b..6981e9e6 100755 --- a/csv_detective/output/dataframe.py +++ b/csv_detective/output/dataframe.py @@ -5,9 +5,9 @@ import pandas as pd -from csv_detective.detect_fields.other.booleen import 
bool_casting -from csv_detective.detect_fields.other.float import float_casting -from csv_detective.detect_fields.temp.date import date_casting +from csv_detective.formats.booleen import bool_casting +from csv_detective.formats.date import date_casting +from csv_detective.formats.float import float_casting from csv_detective.parsing.csv import CHUNK_SIZE from csv_detective.utils import display_logs_depending_process_time diff --git a/csv_detective/output/profile.py b/csv_detective/output/profile.py index 5b45216a..2f3bd452 100755 --- a/csv_detective/output/profile.py +++ b/csv_detective/output/profile.py @@ -1,12 +1,11 @@ import logging from collections import defaultdict from time import time -from typing import Optional import numpy as np import pandas as pd -from csv_detective.detect_fields.other.float import float_casting +from csv_detective.formats.float import float_casting from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time @@ -17,7 +16,7 @@ def create_profile( limited_output: bool = True, cast_json: bool = True, verbose: bool = False, - _col_values: Optional[dict[str, pd.Series]] = None, + _col_values: dict[str, pd.Series] | None = None, ) -> dict: if verbose: start = time() diff --git a/csv_detective/output/schema.py b/csv_detective/output/schema.py index c78f6284..51577a08 100755 --- a/csv_detective/output/schema.py +++ b/csv_detective/output/schema.py @@ -103,7 +103,7 @@ def get_validata_type(format: str) -> str: "datetime_aware": "datetime", "datetime_naive": "datetime", "datetime_rfc822": "datetime", - "json_geojson": "geojson", + "geojson": "geojson", "latitude": "number", "latitude_l93": "number", "latitude_wgs": "number", @@ -150,7 +150,7 @@ def get_example(format: str) -> str: "iso_country_code_alpha3": "FRA", "iso_country_code_numeric": 250, "jour_de_la_semaine": "lundi", - "json_geojson": '{"type": "Point", "coordinates": [0, 0]}', + "geojson": '{"type": "Point", "coordinates": [0, 0]}', "latitude": 42.42, 
"latitude_l93": 6037008, "latitude_wgs": 42.42, diff --git a/csv_detective/parsing/columns.py b/csv_detective/parsing/columns.py index 4d4bee32..9e253541 100755 --- a/csv_detective/parsing/columns.py +++ b/csv_detective/parsing/columns.py @@ -5,6 +5,7 @@ import pandas as pd from more_itertools import peekable +from csv_detective.format import Format from csv_detective.parsing.csv import CHUNK_SIZE from csv_detective.utils import display_logs_depending_process_time @@ -14,15 +15,13 @@ def test_col_val( serie: pd.Series, - test_func: Callable, - proportion: float = 0.9, + format: Format, skipna: bool = True, limited_output: bool = False, verbose: bool = False, ) -> float: """Tests values of the serie using test_func. - - skipna : if True indicates that NaNs are not counted as False - - proportion : indicates the proportion of values that have to pass the test + - skipna : if True indicates that NaNs are considered True for the serie to be detected as a certain format """ if verbose: @@ -34,28 +33,28 @@ def apply_test_func(serie: pd.Series, test_func: Callable, _range: int): try: if skipna: - serie = serie[serie.notnull()] + serie = serie.loc[serie.notnull()] ser_len = len(serie) if ser_len == 0: # being here means the whole column is NaN, so if skipna it's a pass return 1.0 if skipna else 0.0 if not limited_output: - result = apply_test_func(serie, test_func, ser_len).sum() / ser_len - return result if result >= proportion else 0.0 + result = apply_test_func(serie, format.func, ser_len).sum() / ser_len + return result if result >= format.proportion else 0.0 else: - if proportion == 1: + if format.proportion == 1: # early stops (1 then 5 rows) to not waste time if directly unsuccessful for _range in [ min(1, ser_len), min(5, ser_len), ser_len, ]: - if not all(apply_test_func(serie, test_func, _range)): + if not all(apply_test_func(serie, format.func, _range)): return 0.0 return 1.0 else: - result = apply_test_func(serie, test_func, ser_len).sum() / ser_len - return 
result if result >= proportion else 0.0 + result = apply_test_func(serie, format.func, ser_len).sum() / ser_len + return result if result >= format.proportion else 0.0 finally: if verbose and time() - start > 3: display_logs_depending_process_time( @@ -64,42 +63,27 @@ def apply_test_func(serie: pd.Series, test_func: Callable, _range: int): ) -def test_col_label( - label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False -): - """Tests label (from header) using test_func. - - proportion : indicates the minimum score to pass the test for the serie - to be detected as a certain format - """ - if not limited_output: - return test_func(label) - else: - result = test_func(label) - return result if result >= proportion else 0 - - def test_col( table: pd.DataFrame, - all_tests: dict[str, dict], + formats: dict[str, Format], limited_output: bool, skipna: bool = True, verbose: bool = False, ): if verbose: start = time() - logging.info("Testing columns to get types") + logging.info("Testing columns to get formats") return_table = pd.DataFrame(columns=table.columns) - for idx, (name, attributes) in enumerate(all_tests.items()): + for idx, (label, format) in enumerate(formats.items()): if verbose: start_type = time() - logging.info(f"\t- Starting with type '{name}'") + logging.info(f"\t- Starting with format '{label}'") # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory # => the following needs to change, "apply" means all columns are tested for one type at once - return_table.loc[name] = table.apply( + return_table.loc[label] = table.apply( lambda serie: test_col_val( serie, - attributes["func"], - attributes["prop"], + format, skipna=skipna, limited_output=limited_output, verbose=verbose, @@ -107,7 +91,7 @@ def test_col( ) if verbose: display_logs_depending_process_time( - f'\t> Done with type "{name}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})', + f'\t> Done with type 
"{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})', time() - start_type, ) if verbose: @@ -118,23 +102,20 @@ def test_col( def test_label( - columns: list[str], all_tests: dict[str, dict], limited_output: bool, verbose: bool = False + columns: list[str], formats: dict[str, Format], limited_output: bool, verbose: bool = False ): if verbose: start = time() logging.info("Testing labels to get types") return_table = pd.DataFrame(columns=columns) - for idx, (key, value) in enumerate(all_tests.items()): + for idx, (label, format) in enumerate(formats.items()): if verbose: start_type = time() - return_table.loc[key] = [ - test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output) - for col_name in columns - ] + return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns] if verbose: display_logs_depending_process_time( - f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})', + f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})', time() - start_type, ) if verbose: @@ -148,23 +129,28 @@ def test_col_chunks( table: pd.DataFrame, file_path: str, analysis: dict, - all_tests: list, + formats: dict[str, Format], limited_output: bool, skipna: bool = True, verbose: bool = False, ) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]: def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]: + # returns a dict with the table's columns as keys and the list of remaining format labels to apply return { - col: [test for test in return_table.index if return_table.loc[test, col] > 0] + col: [ + fmt_label + for fmt_label in return_table.index + if return_table.loc[fmt_label, col] > 0 + ] for col in return_table.columns } if verbose: start = time() - logging.info("Testing columns to get types on chunks") + logging.info("Testing columns to get formats on chunks") # analysing the sample to get a first guess - 
return_table = test_col(table, all_tests, limited_output, skipna=skipna, verbose=verbose) + return_table = test_col(table, formats, limited_output, skipna=skipna, verbose=verbose) remaining_tests_per_col = build_remaining_tests_per_col(return_table) # hashing rows to get nb_duplicates @@ -217,23 +203,22 @@ def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[ if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()): # no more potential tests to do on any column, early stop break - for col, tests in remaining_tests_per_col.items(): + for col, fmt_labels in remaining_tests_per_col.items(): # testing each column with the tests that are still competing # after previous batchs analyses - for test in tests: + for label in fmt_labels: batch_col_test = test_col_val( batch[col], - all_tests[test]["func"], - all_tests[test]["prop"], + formats[label], limited_output=limited_output, skipna=skipna, ) - return_table.loc[test, col] = ( + return_table.loc[label, col] = ( # if this batch's column tested 0 then test fails overall 0 if batch_col_test == 0 # otherwise updating the score with weighted average - else ((return_table.loc[test, col] * idx + batch_col_test) / (idx + 1)) + else ((return_table.loc[label, col] * idx + batch_col_test) / (idx + 1)) ) remaining_tests_per_col = build_remaining_tests_per_col(return_table) batch, batch_number = [], batch_number + 1 diff --git a/csv_detective/parsing/csv.py b/csv_detective/parsing/csv.py index c1e06995..501696f2 100755 --- a/csv_detective/parsing/csv.py +++ b/csv_detective/parsing/csv.py @@ -1,6 +1,6 @@ import logging from time import time -from typing import Optional, TextIO +from typing import TextIO import pandas as pd @@ -18,7 +18,7 @@ def parse_csv( skiprows: int, random_state: int = 42, verbose: bool = False, -) -> tuple[pd.DataFrame, Optional[int], Optional[int]]: +) -> tuple[pd.DataFrame, int | None, int | None]: if verbose: start = time() logging.info("Parsing table") 
diff --git a/csv_detective/parsing/load.py b/csv_detective/parsing/load.py index 4ad7d6d8..5c20567b 100755 --- a/csv_detective/parsing/load.py +++ b/csv_detective/parsing/load.py @@ -12,14 +12,13 @@ ) from csv_detective.detection.headers import detect_headers from csv_detective.detection.separator import detect_separator -from csv_detective.utils import is_url - -from .compression import unzip -from .csv import parse_csv -from .excel import ( +from csv_detective.parsing.compression import unzip +from csv_detective.parsing.csv import parse_csv +from csv_detective.parsing.excel import ( XLS_LIKE_EXT, parse_excel, ) +from csv_detective.utils import is_url def load_file( diff --git a/csv_detective/validate.py b/csv_detective/validate.py index 6dd1cc86..90a49e6e 100755 --- a/csv_detective/validate.py +++ b/csv_detective/validate.py @@ -2,13 +2,13 @@ import pandas as pd -from csv_detective.load_tests import return_all_tests +from csv_detective.format import FormatsManager from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val VALIDATION_CHUNK_SIZE = int(1e5) logging.basicConfig(level=logging.INFO) -tests = return_all_tests("ALL", "detect_fields") +formats = FormatsManager().formats def validate( @@ -19,6 +19,12 @@ def validate( ) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]: """ Verify is the given file has the same fields and types as in the given analysis. 
+ + Args: + file_path: the path of the file to validate + previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine) + verbose: whether the code displays the steps it's going through + skipna: whether to ignore NaN values in the checks """ try: if previous_analysis.get("separator"): @@ -101,8 +107,7 @@ def validate( continue test_result: float = test_col_val( serie=chunk[col_name], - test_func=tests[args["format"]]["func"], - proportion=tests[args["format"]]["prop"], + format=formats[args["format"]], skipna=skipna, ) if not bool(test_result): diff --git a/tests/data/a_test_file.json b/tests/data/a_test_file.json index a9b61062..6dc68571 100755 --- a/tests/data/a_test_file.json +++ b/tests/data/a_test_file.json @@ -81,7 +81,7 @@ }, "GEO_INFO": { "python_type": "json", - "format": "json_geojson", + "format": "geojson", "score": 1.0 } }, @@ -195,7 +195,7 @@ }, "GEO_INFO": { "python_type": "json", - "format": "json_geojson", + "format": "geojson", "score": 1.0 } }, @@ -226,7 +226,7 @@ "json": [ "STRUCTURED_INFO" ], - "json_geojson": [ + "geojson": [ "GEO_INFO" ] }, diff --git a/tests/test_fields.py b/tests/test_fields.py index 3bef2cfc..c4cc49cd 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -6,87 +6,22 @@ import pytest from numpy import random -from csv_detective.detect_fields.FR.geo import ( - adresse, - code_commune_insee, - code_departement, - code_fantoir, - code_postal, - code_region, - commune, - departement, - insee_canton, - latitude_l93, - latitude_wgs_fr_metropole, - longitude_l93, - longitude_wgs_fr_metropole, - pays, - region, -) -from csv_detective.detect_fields.FR.other import ( - code_csp_insee, - code_import, - code_rna, - code_waldec, - csp_insee, - date_fr, - insee_ape700, - sexe, - siren, - siret, - tel_fr, - uai, -) -from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee -from csv_detective.detect_fields.geo import ( - 
iso_country_code_alpha2, - iso_country_code_alpha3, - iso_country_code_numeric, - json_geojson, - latitude_wgs, - latlon_wgs, - longitude_wgs, - lonlat_wgs, -) -from csv_detective.detect_fields.other import ( - booleen, - email, - json, - money, - mongo_object_id, - percent, - twitter, - url, - uuid, -) -from csv_detective.detect_fields.other import ( - float as test_float, -) -from csv_detective.detect_fields.other import ( - int as test_int, -) -from csv_detective.detect_fields.temp import ( - date, - datetime_aware, - datetime_naive, - datetime_rfc822, - year, -) from csv_detective.detection.variables import ( detect_categorical_variable, detect_continuous_variable, ) -from csv_detective.load_tests import return_all_tests +from csv_detective.format import FormatsManager from csv_detective.output.dataframe import cast from csv_detective.output.utils import prepare_output_dict from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it +fmtm = FormatsManager() + -def test_all_tests_return_bool(): - all_tests = return_all_tests("ALL", "detect_fields") - for attr in all_tests.values(): +def test_all_format_funcs_return_bool(): + for format in fmtm.formats.values(): for tmp in ["a", "9", "3.14", "[]", float("nan"), "2021-06-22 10:20:10"]: - assert isinstance(attr["func"](tmp), bool) + assert isinstance(format.func(tmp), bool) # categorical @@ -124,292 +59,37 @@ def test_detect_continuous_variable(): assert res2.values and res2.values[0] == "cont" -fields = { - adresse: { - True: ["rue du martyr"], - False: ["un batiment"], - }, - code_commune_insee: { - True: ["91471", "01053"], - False: ["914712", "01000"], - }, - code_departement: { - True: ["75", "2A", "2b", "974", "01"], - False: ["00", "96", "101"], - }, - code_fantoir: { - True: ["7755A", "B150B", "ZA04C", "ZB03D"], - False: ["7755", "ZA99A"], - }, - code_postal: { - True: ["75020", "01000"], - False: ["77777", "018339"], - }, - code_region: { - True: ["32"], - False: 
["55"], - }, - commune: { - True: ["saint denis"], - False: ["new york", "lion"], - }, - departement: { - True: ["essonne"], - False: ["alabama", "auvergne"], - }, - insee_canton: { - True: ["nantua"], - False: ["california"], - }, - latitude_l93: { - True: ["6037008", "7123528.5", "7124528,5"], - False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"], - }, - longitude_l93: { - True: ["0", "-154", "1265783,45", "34723.4"], - False: ["1456669.8", "-776225", "346_3214"], - }, - latitude_wgs_fr_metropole: { - True: ["42.5"], - False: ["22.5", "62.5"], - }, - longitude_wgs_fr_metropole: { - True: ["-2.5"], - False: ["12.8"], - }, - pays: { - True: ["france", "italie"], - False: ["amerique", "paris"], - }, - region: { - True: ["bretagne", "ile-de-france"], - False: ["baviere", "overgne"], - }, - code_csp_insee: { - True: ["121f"], - False: ["121x"], - }, - code_rna: { - True: ["W751515517"], - False: [ - "W111111111111111111111111111111111111", - "w143788974", - "W12", - "678W23456", - "165789325", - "Wa1#89sf&h", - ], - }, - code_import: { - True: ["123S1871092288"], - False: ["AA751PEE00188854", "W123456789"], - }, - code_waldec: { - True: ["W123456789", "W2D1234567"], - False: ["AA751PEE00188854"], - }, - csp_insee: { - True: ["employes de la poste"], - False: ["super-heros"], - }, - sexe: { - True: ["homme"], - False: ["hermaphrodite"], - }, - siren: { - True: ["552 100 554", "552100554"], - False: ["42"], - }, - siret: { - True: ["13002526500013", "130 025 265 00013"], - False: ["13002526500012"], - }, - uai: { - True: ["0422170F"], - False: ["04292E"], - }, - date_fr: { - True: ["13 fevrier 1996"], - False: ["44 march 2025"], - }, - insee_ape700: {True: ["0116Z"], False: ["0116A"]}, - tel_fr: { - True: ["0134643467"], - False: ["6625388263", "01288398"], - }, - jour_de_la_semaine: { - True: ["lundi"], - False: ["jour de la biere"], - }, - mois_de_annee: { - True: ["juin", "décembre"], - False: ["november"], - }, - iso_country_code_alpha2: { - True: ["FR"], 
- False: ["XX", "A", "FRA"], - }, - iso_country_code_alpha3: { - True: ["FRA"], - False: ["XXX", "FR", "A"], - }, - iso_country_code_numeric: { - True: ["250"], - False: ["003"], - }, - json_geojson: { - True: [ - '{"coordinates": [45.783753, 3.049342], "type": "63870"}', - '{"geometry": {"coordinates": [45.783753, 3.049342]}}', - ], - False: ['{"pomme": "fruit", "reponse": 42}'], - }, - latitude_wgs: { - True: ["43.2", "-22"], - False: ["100"], - }, - latlon_wgs: { - True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"], - False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"], - }, - longitude_wgs: { - True: ["120", "-20.2"], - False: ["-200"], - }, - lonlat_wgs: { - True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"], - False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"], - }, - booleen: { - True: ["oui", "0", "1", "yes", "false", "True"], - False: ["nein", "ja", "2", "-0"], - }, - email: { - True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"], - False: ["cdo@@gouv.sfd"], - }, - json: { - True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"], - False: ["5", '{"zefib":', '{"a"}'], - }, - money: { - True: ["120€", "-20.2$"], - False: ["200", "100 euros"], - }, - mongo_object_id: { - True: ["62320e50f981bc2b57bcc044"], - False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"], - }, - percent: { - True: ["120%", "-20.2%"], - False: ["200", "100 pourcents"], - }, - twitter: { - True: ["@accueil1"], - False: ["adresse@mail"], - }, - url: { - True: [ - "www.data.gouv.fr", - "http://data.gouv.fr", - "https://www.youtube.com/@data-gouv-fr", - ( - "https://tabular-api.data.gouv.fr/api/resources/" - "aaaaaaaa-1111-bbbb-2222-cccccccccccc/data/" - "?score__greater=0.9&decompte__exact=13" - ), - ], - False: ["tmp@data.gouv.fr"], - }, - uuid: { - True: ["884762be-51f3-44c3-b811-1e14c5d89262"], - False: ["0610928327"], - }, - test_int: { - True: ["1", "0", "1764", "-24"], - False: ["01053", "1.2", 
"123_456", "+35"], - }, - test_float: { - True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"], - False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"], - }, - date: { - True: [ - "1960-08-07", - "12/02/2007", - "15 jan 1985", - "15 décembre 1985", - "02 05 2003", - "20030502", - "1993-12/02", - ], - False: [ - "1993-1993-1993", - "39-10-1993", - "19-15-1993", - "15 tambour 1985", - "12152003", - "20031512", - "02052003", - ], - }, - datetime_aware: { - True: [ - "2021-06-22 10:20:10-04:00", - "2030-06-22 00:00:00.0028+02:00", - "2000-12-21 10:20:10.1Z", - "2024-12-19T10:53:36.428000+00:00", - "1996/06/22 10:20:10 GMT", - ], - False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"], - }, - datetime_naive: { - True: [ - "2021-06-22 10:20:10", - "2030/06-22 00:00:00", - "2030/06/22 00:00:00.0028", - ], - False: [ - "2021-06-22T30:20:10", - "Sun, 06 Nov 1994 08:49:37 GMT", - "2021-06-44 10:20:10+02:00", - "1999-12-01T00:00:00Z", - "2021-06-44", - "15 décembre 1985", - ], - }, - datetime_rfc822: { - True: ["Sun, 06 Nov 1994 08:49:37 GMT"], - False: ["2021-06-22T10:20:10"], - }, - year: { - True: ["2015"], - False: ["20166"], - }, -} - # we could also have a function here to add all True values of (almost) -# each field to the False values of all others +# each field to the False values of all others (to do when parenthood is added) def test_all_fields_have_tests(): - all_tests = return_all_tests("ALL", "detect_fields") - for attr in all_tests.values(): - assert fields.get(attr["module"]) - - + for format in fmtm.formats.values(): + valid = format._test_values + # checking structure + assert all( + isinstance(key, bool) + and isinstance(vals, list) + and all(isinstance(val, str) for val in vals) + for key, vals in valid.items() + ) + # checking that we have valid and invalid cases for each + assert all(b in valid.keys() for b in [True, False]) + + +# this is based on the _test_values of each .py file 
@pytest.mark.parametrize( "args", ( - (field, value, valid) - for field in fields + (format.func, value, valid) for valid in [True, False] - for value in fields[field][valid] + for format in fmtm.formats.values() + for value in format._test_values[valid] ), ) def test_fields_with_values(args): - field, value, valid = args - assert field._is(value) is valid + func, value, valid = args + assert func(value) is valid @pytest.mark.parametrize( @@ -456,37 +136,32 @@ def test_priority(args): @pytest.mark.parametrize( "args", ( - ("1996-02-13", date), - ("28/01/2000", date), - ("2025-08-20T14:30:00+02:00", datetime_aware), - ("2025/08/20 14:30:00.2763-12:00", datetime_aware), - ("1925_12_20T14:30:00.2763", datetime_naive), - ("1925 12 20 14:30:00Z", datetime_aware), + ("1996-02-13", fmtm.formats["date"]), + ("28/01/2000", fmtm.formats["date"]), + ("2025-08-20T14:30:00+02:00", fmtm.formats["datetime_aware"]), + ("2025/08/20 14:30:00.2763-12:00", fmtm.formats["datetime_aware"]), + ("1925_12_20T14:30:00.2763", fmtm.formats["datetime_naive"]), + ("1925 12 20 14:30:00Z", fmtm.formats["datetime_aware"]), ), ) def test_early_detection(args): - value, module = args - with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func: - res = module._is(value) + value, format = args + with patch("csv_detective.formats.date.date_casting") as mock_func: + res = format.func(value) assert res mock_func.assert_not_called() def test_all_proportion_1(): - all_tests = return_all_tests("ALL", "detect_fields") - prop_1 = { - name: eval(name if name not in ["int", "float"] else "test_" + name) - for name, attr in all_tests.items() - if attr["prop"] == 1 - } # building a table that uses only correct values for these formats, except on one row table = pd.DataFrame( { - test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"] - for test_name, test_module in prop_1.items() + name: (format._test_values[True] * 100)[:100] + ["not_suitable"] + for name, format in 
fmtm.formats.items() + if format.proportion == 1 } ) # testing columns for all formats - returned_table = col_test(table, all_tests, limited_output=True) + returned_table = col_test(table, fmtm.formats, limited_output=True) # the analysis should have found no match on any format assert all(returned_table[col].sum() == 0 for col in table.columns) diff --git a/tests/test_file.py b/tests/test_file.py index ee670011..5e632fcd 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -49,7 +49,7 @@ def test_columns_output_on_file(chunk_size): assert output["columns"]["STRUCTURED_INFO"]["python_type"] == "json" assert output["columns"]["STRUCTURED_INFO"]["format"] == "json" assert output["columns"]["GEO_INFO"]["python_type"] == "json" - assert output["columns"]["GEO_INFO"]["format"] == "json_geojson" + assert output["columns"]["GEO_INFO"]["format"] == "geojson" def test_profile_output_on_file(): diff --git a/tests/test_labels.py b/tests/test_labels.py index 0b8a7b3b..a6370614 100644 --- a/tests/test_labels.py +++ b/tests/test_labels.py @@ -1,12 +1,14 @@ import pytest -from csv_detective.detect_labels import latitude_wgs, money +from csv_detective.format import FormatsManager + +fmtm = FormatsManager() # money labels def test_money_labels(): header = "Montant total" - assert money._is(header) == 0.5 + assert fmtm.formats["money"].is_valid_label(header) == 0.5 @pytest.mark.parametrize( @@ -21,4 +23,4 @@ def test_money_labels(): ) def test_latitude(params): header, expected = params - assert expected == latitude_wgs._is(header) + assert expected == fmtm.formats["latitude_wgs"].is_valid_label(header) diff --git a/tests/test_structure.py b/tests/test_structure.py index e4a02591..458dc11c 100755 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -1,41 +1,45 @@ import os -from csv_detective import detect_fields, detect_labels # noqa -from csv_detective.load_tests import return_all_tests - - -def tests_conformity(): - """ - Check that all tests are properly 
structured: - - an __init__.py file in the test folder - - an _is function in the __init__.py file - """ - for _type in ["fields", "labels"]: - _dir = f"csv_detective/detect_{_type}" - subfolders = [] - for dirpath, dirnames, _ in os.walk(_dir): - for dirname in dirnames: - if "__pycache__" not in dirname: - subfolders.append(os.path.join(dirpath, dirname)) - final_subfolders = [ - sf - for sf in subfolders - if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf) - ] - for f_sf in final_subfolders: - assert "__init__.py" in os.listdir(f_sf) - _package = eval( - f_sf.replace("csv_detective/", "") - # locally we have "\\", but in CI for instance there is "/" - .replace("\\", ".") - .replace("/", ".") - ) - assert "_is" in dir(_package) +import pytest + +from csv_detective.format import Format, FormatsManager + +fmtm = FormatsManager() def test_all_tests_have_unique_name(): - names = [ - attr["module"].__name__.split(".")[-1] - for attr in return_all_tests("ALL", "detect_fields").values() - ] - assert len(names) == len(set(names)) + formats: list[str] = os.listdir("csv_detective/formats") + assert "__init__.py" in formats + assert len(formats) == len(set(formats)) + + +def test_conformity(): + for name, format in fmtm.formats.items(): + assert isinstance(name, str) + assert isinstance(format, Format) + assert all( + getattr(format, attr) is not None + for attr in [ + "name", + "func", + "_test_values", + "labels", + "proportion", + "tags", + ] + ) + + +@pytest.mark.parametrize( + "tags", + ( + ["type"], + ["temp", "fr"], + ), +) +def test_get_from_tags(tags): + fmts = fmtm.get_formats_from_tags(tags) + assert len(fmts) + for fmt in fmts.values(): + for tag in tags: + assert tag in fmt.tags