From af0fa0eec60ba5a6ce04261b319018c103abf898 Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Tue, 30 Sep 2025 11:57:59 +0200 Subject: [PATCH 1/7] adding check on duplicate column names and giving warning --- mibiscreen/data/load_data.py | 71 ++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/mibiscreen/data/load_data.py b/mibiscreen/data/load_data.py index ee54e86..2bd9381 100644 --- a/mibiscreen/data/load_data.py +++ b/mibiscreen/data/load_data.py @@ -5,6 +5,7 @@ @author: Alraune Zech """ import os.path +import re import numpy as np import pandas as pd @@ -29,7 +30,7 @@ def load_excel( store_provenance: Boolean To add! **kwargs: optional keyword arguments to pass to pandas' routine - read_excel() + read_excel(), e.g. sep = ',' or sep = ';' Returns: ------- @@ -51,6 +52,11 @@ def load_excel( >>> load_excel(example_data.xlsx) """ + if verbose: + print('===================================') + print(" Running function 'load_excel()'") + print('===================================') + if file_path is None: raise ValueError('Specify file path and file name!') if not os.path.isfile(file_path): @@ -59,18 +65,16 @@ def load_excel( data = pd.read_excel(file_path, sheet_name = sheet_name, **kwargs) - if ";" in data.iloc[1].iloc[0]: - data = pd.read_excel(file_path, - sep=";", - sheet_name = sheet_name, - **kwargs) + + if verbose: + print("Reading data from file: {}".format(file_path)) + print('------------------------------------------------------------------') + + _check_duplicates(data) units = data.drop(labels = np.arange(1,data.shape[0])) if verbose: - print('==============================================================') - print(" Running function 'load_excel()' on data file ", file_path) - print('==============================================================') print("Unit of quantities:") print('-------------------') print(units) @@ -118,20 +122,29 @@ def load_csv( >>> load_excel(example_data.csv) """ + if verbose: + 
print('==================================') + print(" Running function 'load_csv()'") + print('==================================') + if file_path is None: raise ValueError('Specify file path and file name!') if not os.path.isfile(file_path): raise OSError('Cannot access file at : ',file_path) + if verbose: + print("Reading data from file: {}".format(file_path)) + print('------------------------------------------------------------------') + data = pd.read_csv(file_path, encoding="unicode_escape") if ";" in data.iloc[1].iloc[0]: data = pd.read_csv(file_path, sep=";", encoding="unicode_escape") + + _check_duplicates(data) + units = data.drop(labels = np.arange(1,data.shape[0])) if verbose: - print('================================================================') - print(" Running function 'load_csv()' on data file ", file_path) - print('================================================================') print("Units of quantities:") print('-------------------') print(units) @@ -142,3 +155,37 @@ def load_csv( print('================================================================') return data, units + +def _check_duplicates(data): + """Detects duplicate column names in a pandas DataFrame. + + When a DataFrame contains identical column names they are automatically + renamed by pandas (e.g., 'Column', 'Column.1', 'Column.2'). This function + identifies if such column names exists and prints a warning message. + + This function checks for column names that match the pandas auto-renaming pattern (`.1`, `.2`, etc.) + indicating that duplicate column names were present in the original data source (e.g., an Excel file). + + Args: + ----- + data (pd.DataFrame): The DataFrame to check for renamed duplicate columns. 
+ + Returns: + -------- + None + """ + # Check for duplicated column names + renamed_pattern = re.compile(r"^(.*)\.(\d+)$") # Pattern to match renamed columns + duplicate_columns = {} + for col in data.columns: + if (match := renamed_pattern.match(str(col))): # str(): column labels may be non-string (e.g. numeric headers) + base = match.group(1) + duplicate_columns.setdefault(base, []).append(col) + if duplicate_columns: + print("WARNING: Duplicate column names detected. \n They were automatically renamed by pandas into:") + for base, renamed_list in duplicate_columns.items(): + for renamed in renamed_list: + print(f" - '{renamed}'") + print("Duplicate column names will not be identified as standard names.") + print("Consider renaming them.") + print('------------------------------------------------------------------') From 5ed34d5496afac390ae2dd6ccc4c78142d31db4e Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Tue, 30 Sep 2025 13:11:22 +0200 Subject: [PATCH 2/7] adding unit test for routine _check_duplicates() --- tests/test_data.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index a0cee77..38af0bd 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -13,6 +13,7 @@ from mibiscreen.data.check_data import standard_names from mibiscreen.data.check_data import standardize from mibiscreen.data.example_data.example_data import example_data +from mibiscreen.data.load_data import _check_duplicates from mibiscreen.data.load_data import load_csv from mibiscreen.data.load_data import load_excel from mibiscreen.data.set_data import compare_lists @@ -96,6 +97,34 @@ def test_load_excel_04(self,capsys): assert len(out)>0 + def test_check_duplicates_01(self,capsys): + """Testing routine _check_duplicates(). + + Testing Warning in case of duplicates in dataframe. 
+ """ + # Create a DataFrame with duplicate column names (simulated by manually renaming) + data = pd.DataFrame({ + 'Name': [1, 2], + 'Age': [30, 40], + 'Name.1': [3, 4], # Simulate pandas auto-renamed column + 'Age.1': [50, 60] + }) + + # Call the function + _check_duplicates(data) + + # Capture printed output + captured = capsys.readouterr().out + + # Check the warning is in the output + assert "WARNING: Duplicate column names detected" in captured + + # Check that the renamed columns are listed + assert " - 'Name.1'" in captured + assert " - 'Age.1'" in captured + + # Check verbose messages appear + assert "Consider renaming them." in captured class TestExampleData: """Class for testing example data of data module of mibiscreen.""" From 43cad6346a0bd4c7f79b0eeac42efdb908777a72 Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Fri, 3 Oct 2025 09:26:54 +0200 Subject: [PATCH 3/7] add more PAHs and other measurement quantities --- mibiscreen/data/check_data.py | 3 +- mibiscreen/data/settings/contaminants.py | 465 ++++++++++++++++++-- mibiscreen/data/settings/environment.py | 55 +++ mibiscreen/data/settings/sample_settings.py | 6 + mibiscreen/data/settings/standard_names.py | 61 ++- mibiscreen/data/settings/unit_settings.py | 19 +- 6 files changed, 551 insertions(+), 58 deletions(-) diff --git a/mibiscreen/data/check_data.py b/mibiscreen/data/check_data.py index e3de185..38e6120 100644 --- a/mibiscreen/data/check_data.py +++ b/mibiscreen/data/check_data.py @@ -76,7 +76,7 @@ def standard_names(name_list, **contaminants_analysis, } dict_names=_generate_dict_other_names(properties_all) - +# print(dict_names) other_names_contaminants = _generate_dict_other_names(properties_contaminants) other_names_isotopes = _generate_dict_other_names(properties_isotopes) @@ -86,7 +86,6 @@ def standard_names(name_list, y = dict_names.get(x, False) x_isotope = x.split('-')[0] y_isotopes = other_names_isotopes.get(x_isotope.lower(), False) - if y_isotopes is not False: x_molecule = 
x.removeprefix(x_isotope+'-') y_molecule = other_names_contaminants.get(x_molecule.lower(), False) diff --git a/mibiscreen/data/settings/contaminants.py b/mibiscreen/data/settings/contaminants.py index b876996..8582a12 100644 --- a/mibiscreen/data/settings/contaminants.py +++ b/mibiscreen/data/settings/contaminants.py @@ -9,6 +9,10 @@ import mibiscreen.data.settings.standard_names as names properties_contaminants = dict() + +############################################################################### +### MAHs + properties_contaminants[names.name_benzene]=dict( chemical_formula = 'c6h6', molecular_mass = 78., @@ -49,7 +53,7 @@ hydrogen_atoms = 10, # factor_stoichiometry = 42., # thresholds_for_intervention_NL = 70., - other_names = ["xylene", + other_names = ["xylene","xyleen", "c6h4ch3ch3"], standard_unit = names.unit_microgperl, ) @@ -84,41 +88,8 @@ hydrogen_atoms = 10, # factor_stoichiometry = 42., # thresholds_for_intervention_NL = 70., - other_names = ["o-xylene","o xylene","o_xylene","oxylene"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_indene]=dict( - chemical_formula = "c9h8", - molecular_mass = 116., - carbon_atoms = 9, - hydrogen_atoms = 8, - # factor_stoichiometry = 44., - # thresholds_for_intervention_NL = 70., - other_names = ["indene","indeen","c9h8"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_indane]=dict( - chemical_formula = "c9h10", - molecular_mass = 118., - carbon_atoms = 9, - hydrogen_atoms = 10, - # factor_stoichiometry = 46., - # thresholds_for_intervention_NL = 70., - other_names = ["indane","c9h10"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_naphthalene]=dict( - chemical_formula = "c10h8", - molecular_mass = 128., - carbon_atoms = 10, - hydrogen_atoms = 8, - # factor_stoichiometry = 48., - # thresholds_for_intervention_NL = 70., - other_names = ["naphthalene","naphthaleen","naphthaline", - 
"naphtaline","naphtalene","naphtaleen","c10h8"], + other_names = ["o-xylene","o xylene","o_xylene","oxylene", + "o-xyleen","o xyleen","o_xyleen","oxyleen"], standard_unit = names.unit_microgperl, ) @@ -334,7 +305,231 @@ '1,2,3,5 tetramethylbenzene','1,2,3,5tetramethylbenzene','isodurene'], standard_unit = names.unit_microgperl, ) +############################################################################### +### PAHs + +properties_contaminants[names.name_indene]=dict( + chemical_formula = "c9h8", + molecular_mass = 116., + carbon_atoms = 9, + hydrogen_atoms = 8, + # factor_stoichiometry = 44., + # thresholds_for_intervention_NL = 70., + other_names = ["indene","indeen","c9h8"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_indane]=dict( + chemical_formula = "c9h10", + molecular_mass = 118., + carbon_atoms = 9, + hydrogen_atoms = 10, + # factor_stoichiometry = 46., + # thresholds_for_intervention_NL = 70., + other_names = ["indane","c9h10"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ["naphthalene","naphthaleen","naphthaline", + "naphtaline","naphtalene","naphtaleen","c10h8"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene_VOC]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ['naphthalene_voc',"naphthalene_voc","naphthalenevoc","naphthalene-voc","naphthalene voc", + "naphtalene_voc","naphtalenevoc","naphtalene-voc","naphtalenevoc", + "naphthaline_voc","naphthalinevoc","naphthaline-voc","naphthaline voc", + "naphtaline_voc","naphtalinevoc","naphtaline-voc","naphtaline voc", + 
"naphthaleen_voc","naphthaleenvoc","naphthaleen-voc","naphthaleen voc", + "naphtaleen_voc","naphtaleenvoc","naphtaleen-voc","naphtaleen voc", + "c10h8_voc"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene_PAH]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ["naphthalene_pah","naphthalenepah","naphthalene-pah","naphthalene pah", + "naphtalene_pah","naphtalenepah","naphtalene-pah","naphtalenepah", + "naphthaline_pah","naphthalinepah","naphthaline-pah","naphthaline pah", + "naphtaline_pah","naphtalinepah","naphtaline-pah","naphtaline pah", + "naphthaleen_pah","naphthaleenpah","naphthaleen-pah","naphthaleen pah", + "naphtaleen_pah","naphtaleenpah","naphtaleen-pah","naphtaleen pah", + "c10h8_pah"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene] = dict( + chemical_formula = "C10H8", + molecular_mass = 128.0, + carbon_atoms = 10, + hydrogen_atoms = 8, + other_names = ["naphthalene","naphthaleen","naphthaline", + "naphtaline","naphtalene","naphtaleen","C10H8"], + standard_unit = names.unit_microgperl, +) + +# New ones: + +properties_contaminants[names.name_acenaphthylene] = dict( + chemical_formula = "C12H8", + molecular_mass = 152.192, + carbon_atoms = 12, + hydrogen_atoms = 8, + other_names = ["acenaphthylene","cyclopenta[de]naphthalene","C12H8"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_acenaphthene] = dict( + chemical_formula = "C12H10", + molecular_mass = 154.212, # approximate + carbon_atoms = 12, + hydrogen_atoms = 10, + other_names = ["acenaphthene","acenaphthene (C12H10)","C12H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_fluorene] = dict( + chemical_formula = "C13H10", + molecular_mass = 166.22, + carbon_atoms = 13, + hydrogen_atoms = 10, + 
other_names = ["fluorene","C13H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_phenanthrene] = dict( + chemical_formula = "C14H10", + molecular_mass = 178.226, + carbon_atoms = 14, + hydrogen_atoms = 10, + other_names = ["phenanthrene","C14H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_anthracene] = dict( + chemical_formula = "C14H10", + molecular_mass = 178.226, + carbon_atoms = 14, + hydrogen_atoms = 10, + other_names = ["anthracene","C14H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_fluoranthene] = dict( + chemical_formula = "C16H10", + molecular_mass = 202.26, + carbon_atoms = 16, + hydrogen_atoms = 10, + other_names = ["fluoranthene","C16H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_pyrene] = dict( + chemical_formula = "C16H10", + molecular_mass = 202.26, + carbon_atoms = 16, + hydrogen_atoms = 10, + other_names = ["pyrene","C16H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_chrysene] = dict( + chemical_formula = "C18H12", + molecular_mass = 228.29, + carbon_atoms = 18, + hydrogen_atoms = 12, + other_names = ["chrysene","C18H12"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_a_anthracene] = dict( + chemical_formula = "C18H12", + molecular_mass = 228.29, + carbon_atoms = 18, + hydrogen_atoms = 12, + other_names = ["benzo[a]anthracene","benzo(a)anthracene","benzoaanthracene", + "benzo-a-anthracene","C18H12","BaA"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_b_fluoranthene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[b]fluoranthene","benzo(b)fluoranthene", + "benzobfluoranthene","benzo-b-fluoranthene","BbF","C20H12"], + standard_unit = names.unit_microgperl, +) + 
+properties_contaminants[names.name_benzo_k_fluoranthene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[k]fluoranthene","benzo(k)fluoranthene", + "benzokfluoranthene","benzo-k-fluoranthene","BkF","C20H12"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_a_pyrene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[a]pyrene","benzo(a)pyrene","benzoapyrene", + "benzo-a-pyrene","BaP","C20H12"], + standard_unit = names.unit_microgperl, +) +properties_contaminants[names.name_dibenz_ah_anthracene] = dict( + chemical_formula = "C22H14", + molecular_mass = 278.33, + carbon_atoms = 22, + hydrogen_atoms = 14, + other_names = ["dibenz[a,h]anthracene","dibenz(a,h)anthracene","dibenzahanthracene","dibenz-a,h-anthracene", + "dibenz-a-h-anthracene","DBahA","C22H14"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_ghi_perylene] = dict( + chemical_formula = "C22H12", + molecular_mass = 276.33, + carbon_atoms = 22, + hydrogen_atoms = 12, + other_names = ["benzo[ghi]perylene","benzo(ghi)perylene","benzoghiperylene","benzo-ghi-perylene", + "C22H12","BghiP"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_indeno_123cd_pyrene] = dict( + chemical_formula = "C22H12", + molecular_mass = 276.33, + carbon_atoms = 22, + hydrogen_atoms = 12, + other_names = ["indeno[1,2,3-cd]pyrene","indeno(1,2,3‑cd)pyrene", + "indeno1,2,3‑cdpyrene","indeno-1,2,3‑cd-pyrene", + "indeno(1,2,3-cd)pyrene","indeno1,2,3-cdpyrene","indeno-1,2,3-cd-pyrene","C22H12"], # ASCII-hyphen variants of the U+2011 aliases above + standard_unit = names.unit_microgperl, +) properties_contaminants[names.name_methylindene]=dict( chemical_formula = "c9h7ch3", @@ -598,7 +793,84 @@ ############################################################################### 
############################################################################### + contaminants_analysis = dict() + +contaminants_analysis[names.name_PAH_total_16] = dict( + other_names = ['pah_total_16','pah_total 16','pah_total16','pah_total-16', + 'pah total_16','pah total 16','pah total16','pah total-16', + 'pahtotal_16','pahtotal 16','pahtotal16','pahtotal-16', + 'pah-total_16','pah-total 16','pah-total16','pah-total-16', + 'pah_16','pah 16','pah16','pah-16', + 'total_pah_16','total_pah 16','total_pah16','total_pah-16', + 'total pah_16','total pah 16','total pah16','total pah-16', + 'totalpah_16','totalpah 16','totalpah16','totalpah-16', + 'total-pah_16','total-pah 16','total-pah16','total-pah-16', + ], + standard_unit = names.unit_microgperl, + ) + +contaminants_analysis[names.name_PAH_total_10] = dict( + other_names = ['pah_total_10','pah_total 10','pah_total10','pah_total-10', + 'pah total_10','pah total 10','pah total10','pah total-10', + 'pahtotal_10','pahtotal 10','pahtotal10','pahtotal-10', + 'pah-total_10','pah-total 10','pah-total10','pah-total-10', + 'pah_10','pah 10','pah10','pah-10', + 'total_pah_10','total_pah 10','total_pah10','total_pah-10', + 'total pah_10','total pah 10','total pah10','total pah-10', + 'totalpah_10','totalpah 10','totalpah10','totalpah-10', + 'total-pah_10','total-pah 10','total-pah10','total-pah-10', + ], + standard_unit = names.unit_microgperl, + ) + +contaminants_analysis[names.name_fraction_C10_C12] = dict( + other_names = [ + 'fraction_c10-c12','fraction c10-c12','fraction c10_c12', + 'c10-c12 fraction','c10_c12 fraction', + 'c10 to c12','c10_c12','c10–c12','c10–c12 fraction', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_fraction_C12_C22] = dict( + other_names = [ + 'fraction_c12-c22','fraction c12-c22','fraction c12_c22', + 'c12-c22 fraction','c12_c22 fraction', + 'c12 to c22','c12_c22','c12–c22','c12–c22 fraction', + ], + standard_unit = names.unit_microgperl, +) + 
+contaminants_analysis[names.name_fraction_C22_C30] = dict( + other_names = [ + 'fraction_c22-c30','fraction c22-c30','fraction c22_c30', + 'c22-c30 fraction','c22_c30 fraction', + 'c22 to c30','c22_c30','c22–c30','c22–c30 fraction', + ], + standard_unit = names.unit_microgperl, +) + + +contaminants_analysis[names.name_fraction_C30_C40] = dict( + other_names = [ + 'fraction_c30-c40','fraction c30-c40','fraction c30_c40', + 'c30-c40 fraction','c30_c40 fraction', + 'c30 to c40','c30_c40','c30–c40','c30–c40 fraction', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_C10_C40] = dict( + other_names = [ + 'total_c10-c40','total c10-c40','total c10_c40', + 'c10-c40 total','c10_c40 total', + 'c10 to c40','c10_c40','c10–c40','c10–c40 total', + 'c10-c40','c10_c40', + ], + standard_unit = names.unit_microgperl, +) + contaminants_analysis[names.name_total_contaminants] = dict( other_names = ["sum_contaminants","sum-contaminants","sum contaminants","sumcontaminants", "total_contaminants","total-contaminants","total contaminants","totalcontaminants", @@ -607,17 +879,18 @@ standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_BTEX] = dict( - other_names = ["sum_BTEX","sum-BTEX","sum BTEX","sumBTEX", - "total_BTEX","total-BTEX","total BTEX","totalBTEX", - "concentration -BTEX","concentration-BTEX","concentration BTEX", - "concentrationBTEX"], + other_names = ["sum_btex","sum-btex","sum btex","sumbtex", + "total_btex","total-btex","total btex","totalbtex", + "btex_total","btex-total","btex total","btextotal", + "concentration -btex","concentration-btex","concentration btex", + "concentrationbtex"], standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_BTEXIIN] = dict( - other_names = ["sum_BTEXIIN","sum-BTEXIIN","sum BTEXIIN","sumBTEXIIN", - "total_BTEXIIN","total-BTEXIIN","total BTEXIIN","totalBTEXIIN", - "concentration -BTEXIIN","concentration-BTEXIIN","concentration BTEXIIN", - 
"concentrationBTEXIIN"], + other_names = ["sum_btexiin","sum-btexiin","sum btexiin","sumbtexiin", + "total_btexiin","total-btexiin","total btexiin","totalbtexiin", + "concentration -btexiin","concentration-btexiin","concentration btexiin", + "concentrationbtexiin"], standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_oxidators] = dict( @@ -647,7 +920,7 @@ ) contaminants_analysis[names.name_NP_avail] = dict( - other_names = ["NP_avail"] + other_names = ["np_avail"] ) ### List with all quantities of particular data type in standard names: @@ -672,6 +945,106 @@ names.name_xylene, names.name_indane, names.name_indene, - names.name_naphthalene], + names.name_naphthalene, + # names.name_naphthalene_VOC, + # names.name_naphthalene_PAC, + ], + MAH = [names.name_benzene, + names.name_toluene, + names.name_ethylbenzene, + names.name_pm_xylene, + names.name_o_xylene, + names.name_xylene, + names.name_styrene, + names.name_isopropylbenzene, + names.name_n_propylbenzene, + names.name_ethyltoluene, + names.name_2_ethyltoluene, + names.name_3_ethyltoluene, + names.name_4_ethyltoluene, + names.name_trimethylbenzene, + names.name_123_trimethylbenzene, + names.name_124_trimethylbenzene, + names.name_135_trimethylbenzene, + names.name_4_isopropyltouene, + names.name_diethylbenzene, + names.name_12_diethylbenzene, + names.name_13_diethylbenzene, + names.name_14_diethylbenzene, + names.name_tetramethylbenzene, + names.name_1234_tetramethylbenzene, + names.name_1245_tetramethylbenzene, + names.name_1235_tetramethylbenzene, + ], + PAH = [names.name_indane, + names.name_indene, + names.name_naphthalene, + names.name_naphthalene_VOC, + names.name_naphthalene_PAH, + names.name_acenaphthylene, + names.name_acenaphthene, + names.name_fluorene, + names.name_phenanthrene, + names.name_anthracene, + names.name_fluoranthene, + names.name_pyrene, + names.name_chrysene, + names.name_benzo_a_anthracene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, 
+ names.name_benzo_a_pyrene, + names.name_dibenz_ah_anthracene, + names.name_benzo_ghi_perylene, + names.name_indeno_123cd_pyrene, + names.name_methylindene, + names.name_1_methylindene, + names.name_2_methylindene, + names.name_methylnaphthalene, + names.name_1_methylnaphthalene, + names.name_2_methylnaphthalene, + names.name_ethylnaphthalene, + names.name_1_ethylnaphthalene, + names.name_2_ethylnaphthalene, + names.name_dimethylnaphthalene, + names.name_12_dimethylnaphthalene, + names.name_13_dimethylnaphthalene, + names.name_14_dimethylnaphthalene, + names.name_15_dimethylnaphthalene, + names.name_16_dimethylnaphthalene, + names.name_17_dimethylnaphthalene, + names.name_18_dimethylnaphthalene, + names.name_23_dimethylnaphthalene, + names.name_26_dimethylnaphthalene, + names.name_27_dimethylnaphthalene, + ], + PAH_total_10 = [#Dutch RIVM defines environmental quality objectives for 10 PAHs + names.name_naphthalene, + names.name_anthracene, + names.name_benzo_a_anthracene, + names.name_benzo_a_pyrene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, + names.name_chrysene, + names.name_dibenz_ah_anthracene, + names.name_fluoranthene, + names.name_pyrene, + ], + PAH_total_16 = [names.name_naphthalene, #the 16 EPA priority PAHs + names.name_acenaphthylene, + names.name_acenaphthene, + names.name_fluorene, + names.name_phenanthrene, + names.name_anthracene, + names.name_fluoranthene, + names.name_pyrene, + names.name_benzo_a_anthracene, + names.name_chrysene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, + names.name_benzo_a_pyrene, + names.name_indeno_123cd_pyrene, + names.name_dibenz_ah_anthracene, + names.name_benzo_ghi_perylene, + ], all_cont = list(properties_contaminants.keys()) ) diff --git a/mibiscreen/data/settings/environment.py b/mibiscreen/data/settings/environment.py index b33ef89..c32f661 100644 --- a/mibiscreen/data/settings/environment.py +++ b/mibiscreen/data/settings/environment.py @@ -22,6 +22,11 @@ 
standard_unit = names.unit_less, ) +properties_geochemicals[names.name_temperature]=dict( + other_names = ["temperature","T","temp"], + standard_unit = names.unit_celsius, + ) + properties_geochemicals[names.name_EC]=dict( other_names = ["ec"], standard_unit = names.unit_microsimpercm, @@ -66,6 +71,19 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_nitrateN]=dict( + chemical_formula = 'N', + molecular_mass = 14.01, + other_names = ["nitraten","nitrate-n","nitrate n","nitrate_n", + "no3n","no3-n","no3 n","no3_n", + "no_3n","no_3-n","no_3 n","no_3_n", + "no 3n","no 3-n","no 3 n","no 3_n", + "no3-n","no3--n","no3- n","no3-_n", + "no_3-n","no_3--n","no_3- n","no_3-_n", + "no 3-n","no 3--n","no 3- n","no 3-_n"], + standard_unit = names.unit_mgNperl, + ) + properties_geochemicals[names.name_nitrite]=dict( chemical_formula = 'no2', molecular_mass = 46., @@ -74,6 +92,19 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_nitriteN]=dict( + chemical_formula = 'N', + molecular_mass = 14.01, + other_names = ["nitriten","nitrite n","nitrite-n","nitrite_n", + "no2n","no2 n","no2-n","no2_n", + "no_2n","no_2 n","no_2-n","no_2_n", + "no 2n","no 2 n","no 2-n","no 2_n", + "no2-n","no2- n","no2--n","no2-_n", + "no_2-n","no_2- n","no_2--n","no_2-_n", + "no 2-n","no 2- n","no 2--n","no 2-_n"], + standard_unit = names.unit_mgNperl, + ) + properties_geochemicals[names.name_sulfate]=dict( chemical_formula = "so42-", molecular_mass = 96.1, @@ -168,6 +199,15 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_phosphorus]=dict( + chemical_formula = "P", + other_names = ["phosphorus","P","TP","PT", + "phosphorustotal","phosphorus_total","phosphorus total","phosphorus-total", + "totalphosphorus","total_phosphorus","total phosphorus","total-phosphorus"], + standard_unit = names.unit_mgPperl, + ) + + properties_geochemicals[names.name_chloride]=dict( chemical_formula = "cl-", other_names = ['chloride','cl','cl-'], @@ 
-216,12 +256,23 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_cyanide]=dict( + chemical_formula = 'cn-', + other_names = ['cyanides','cn-','total cyanides', + 'total cyanides','total-cyanides','totalcyanides','total_cyanides', + 'cyanides total','cyanides-total', 'cyanidestotal','cyanides_total', + ], + standard_unit = names.unit_mgperl, + ) + + ### List with all quantities of particular data type in standard names: environment = list(properties_geochemicals.keys()) environment_groups = dict( environmental_conditions = [names.name_redox, names.name_pH, + names.name_temperature, names.name_EC, names.name_pE, ], @@ -234,9 +285,12 @@ names.name_manganese4, names.name_methane, names.name_nitrite, + names.name_nitriteN, + names.name_nitrateN, names.name_sulfide, names.name_ammonium, names.name_phosphate, + names.name_phosphorus, names.name_chloride, names.name_bromide, names.name_fluoride, @@ -245,6 +299,7 @@ names.name_potassium, names.name_calcium, names.name_acetate, + names.name_cyanide, names.name_DOC, names.name_NPOC, names.name_TOC, diff --git a/mibiscreen/data/settings/sample_settings.py b/mibiscreen/data/settings/sample_settings.py index c07318c..c83ef07 100644 --- a/mibiscreen/data/settings/sample_settings.py +++ b/mibiscreen/data/settings/sample_settings.py @@ -57,5 +57,11 @@ standard_unit = names.unit_less, ) +properties_sample_settings[names.name_date]=dict( + other_names = ["date",'time'], + standard_unit = names.unit_date, +) + + ### List with all quantities of particular data type in standard names: sample_settings = list(properties_sample_settings.keys()) diff --git a/mibiscreen/data/settings/standard_names.py b/mibiscreen/data/settings/standard_names.py index 0243561..661872b 100644 --- a/mibiscreen/data/settings/standard_names.py +++ b/mibiscreen/data/settings/standard_names.py @@ -14,7 +14,12 @@ unit_microsimpercm = 'uS/cm' unit_permil = 'permil' unit_count = 'nr' +unit_date = 'date' unit_less = '' +unit_mgNperl = "mg/l" 
+unit_mgPperl = "mg/l" +unit_celsius = 'C' +unit_date = 'date' ### Standard names for settings name_sample = "sample_nr" @@ -22,10 +27,12 @@ name_well_type = "well_type" name_sample_depth = "depth" name_aquifer = 'aquifer' +name_date = 'date' ### Standard names for environmental parameters name_redox = "redoxpot" name_pH = "pH" +name_temperature = 'temperature' name_EC = "EC" #name_Eh = "Eh" #Reduction potential name_pE = "pE" #Alternative mathematical formulation of redox potential/reduction potential @@ -37,17 +44,20 @@ name_TOC = "TOC" # Total Organic Carbon name_oxygen = 'oxygen' #o2 -name_nitrate = 'nitrate' #no3 +name_nitrate = 'nitrate' #no3- -- full nitrate ion +name_nitrateN = 'nitrate_N' #no3-N -- amount of nitrogen (N) within the nitrate ion. name_sulfate = 'sulfate' #"so4" name_iron2 = "iron2" #"fe_II" name_iron3 = "iron3" #"fe_II" name_manganese2 = 'manganese2' #"mn_II" name_manganese4 = 'manganese4' #"mn_II" name_methane = 'methane' #"ch4" -name_nitrite = 'nitrite' #no2 +name_nitrite = 'nitrite' #no2 -- full nitrite ion +name_nitriteN = 'nitrite_N' #no2- - amount of nitrogen (N) within the nitrite ion. 
name_sulfide = 'sulfide' #"s2min" name_ammonium = 'ammonium' #"nh4+" -name_phosphate = 'phosphate' # "po4" +name_phosphate = 'phosphate' # "po4" - orthosphosphate ion, bioavailable form of phosphorus +name_phosphorus = 'phosphorus_total' #P - all phosphorus containing compounds name_chloride = 'chloride' name_bromide = 'bromide' name_fluoride = 'fluoride' @@ -56,19 +66,18 @@ name_potassium = 'potassium' name_calcium = 'calcium' name_acetate = 'acetate' +name_cyanide = 'cyanide_total' # sum of all cyanide species that can potentially release free cyanide ### Standard names for main contaminants +### MAHs - MONOCYCLIC AROMATIC HYDROCARBONS name_benzene = 'benzene' name_toluene = 'toluene' name_ethylbenzene = 'ethylbenzene' name_pm_xylene = 'pm_xylene' name_o_xylene = 'o_xylene' name_xylene = 'xylene' -name_indane = 'indane' -name_indene = 'indene' -name_naphthalene = 'naphthalene' -### Standard names for additional contaminants +### Standard names for additional MAH contaminants name_styrene = 'styrene' name_isopropylbenzene = 'isopropylbenzene' name_n_propylbenzene = 'n_propylbenzene' @@ -89,6 +98,33 @@ name_1234_tetramethylbenzene = '1234_tetramethylbenzene' name_1245_tetramethylbenzene = '1245_tetramethylbenzene' name_1235_tetramethylbenzene = '1235_tetramethylbenzene' + + + +### PAHs - POLYCYCLIC AROMATIC HYDROCARBONS +name_indane = 'indane' +name_indene = 'indene' +name_naphthalene = 'naphthalene' + +name_naphthalene_VOC = 'naphthalene_VOC' #measured via volatile organic carbon +name_naphthalene_PAH = 'naphthalene_PAH' #measured via polyaromatic hydrocarbons +name_acenaphthylene = 'acenaphthylene' +name_acenaphthene = 'acenaphthene' +name_fluorene = 'fluorene' +name_phenanthrene = 'phenanthrene' +name_anthracene = 'anthracene' +name_fluoranthene = 'fluoranthene' +name_pyrene = 'pyrene' +name_chrysene = 'chrysene' +name_benzo_a_anthracene = 'benzo[a]anthracene' +name_benzo_b_fluoranthene = 'benzo[b]fluoranthene' +name_benzo_k_fluoranthene = 'benzo[k]fluoranthene' 
+name_benzo_a_pyrene = 'benzo[a]pyrene' +name_dibenz_ah_anthracene = 'dibenz[a,h]anthracene' +name_benzo_ghi_perylene = 'benzo[ghi]perylene' +name_indeno_123cd_pyrene = 'indeno[1,2,3-cd]pyrene' + +### Standard names for additional PAH contaminants name_methylindene = 'methylindene' name_1_methylindene = '1_methylindene' name_2_methylindene = '2_methylindene' @@ -110,6 +146,7 @@ name_26_dimethylnaphthalene = '26_dimethylnaphthalene' name_27_dimethylnaphthalene = '27_dimethylnaphthalene' + ### Standard names for a selection of metabolites name_phenol = "phenol" name_cinnamic_acid = "cinnamic_acid" @@ -124,7 +161,7 @@ name_benzylsuccinic_acid = "benzylsuccinic_acid" name_3o_toluoyl_propionic_acid = "3o_toluoyl_propionic_acid" -### Standard names for metabolite related quantities +### Standard names for summed up quantities #name_total_contaminants = "total_contaminants" name_total_contaminants = "concentration_contaminants" name_total_BTEX = "concentration_BTEX" @@ -133,7 +170,13 @@ name_total_BTEX_count = "count_BTEX" name_total_BTEXIIN_count = "count_BTEXIIN" - +name_PAH_total_10 = 'PAH_total_10' +name_PAH_total_16 = 'PAH_total_16' +name_fraction_C10_C12 = 'fraction_C10-C12' +name_fraction_C12_C22 = 'fraction_C12-C22' +name_fraction_C22_C30 = 'fraction_C22-C30' +name_fraction_C30_C40 = 'fraction_C30-C40' +name_total_C10_C40 = 'total_C10-C40' ### Standard names for metabolite related quantities name_metabolites_conc = "metabolites_concentration" # name_metabolites_variety = 'metabolites_count' diff --git a/mibiscreen/data/settings/unit_settings.py b/mibiscreen/data/settings/unit_settings.py index ac49eed..b66fc40 100644 --- a/mibiscreen/data/settings/unit_settings.py +++ b/mibiscreen/data/settings/unit_settings.py @@ -14,6 +14,14 @@ other_names = ["mg/l",'ppm'], ) +properties_units[names.unit_mgNperl]=dict( + other_names = ["mg/l", 'ppm', "mgN/l",'ppm N','ppmN','ppm-N','ppm_N',"mg/l as N","ppm as N"], + ) + +properties_units[names.unit_mgPperl]=dict( + other_names = 
["mg/l", 'ppm', "mgP/l",'ppm P','ppmP','ppm-P','ppm_P',"mg/l as P","ppm as P"], + ) + properties_units[names.unit_microgperl]=dict( other_names = ["ug/l","µg/l","\u00B5g/l","\u03BCg/l","micro g/l",r"$\mu$ g/l",], ) @@ -22,6 +30,12 @@ other_names = ["mV","mv"], ) +properties_units[names.unit_celsius]=dict( + other_names = ["C","c","Celsius","celsius", + "\u00B0C","\u00B0c","\u00B0Celsius","\u00B0celsius", + "\u00B0 C","\u00B0 c","\u00B0 Celsius","\u00B0 celsius"], + ) + properties_units[names.unit_meter]=dict( other_names = ['m',"meter"], ) @@ -39,11 +53,14 @@ other_names =['nr','number','count'], ) +properties_units[names.unit_date]=dict( + other_names = ['date','time','hr'], + ) + properties_units[names.unit_less]=dict( other_names = ['',' ',' ','-',np.nan], ) - all_units = [] for key in properties_units.keys(): if key != names.unit_less: From 53f2387d8f3c6d5cc2754841534993206f6c899f Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Fri, 3 Oct 2025 09:45:27 +0200 Subject: [PATCH 4/7] correct unit handling for new quantities --- mibiscreen/data/settings/environment.py | 2 +- mibiscreen/data/settings/sample_settings.py | 2 +- mibiscreen/data/settings/standard_names.py | 4 ++-- mibiscreen/data/settings/unit_settings.py | 11 ++++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mibiscreen/data/settings/environment.py b/mibiscreen/data/settings/environment.py index c32f661..dbd85db 100644 --- a/mibiscreen/data/settings/environment.py +++ b/mibiscreen/data/settings/environment.py @@ -262,7 +262,7 @@ 'total cyanides','total-cyanides','totalcyanides','total_cyanides', 'cyanides total','cyanides-total', 'cyanidestotal','cyanides_total', ], - standard_unit = names.unit_mgperl, + standard_unit = names.unit_microgperl, ) diff --git a/mibiscreen/data/settings/sample_settings.py b/mibiscreen/data/settings/sample_settings.py index c83ef07..11e422c 100644 --- a/mibiscreen/data/settings/sample_settings.py +++ b/mibiscreen/data/settings/sample_settings.py @@ 
-59,7 +59,7 @@ properties_sample_settings[names.name_date]=dict( other_names = ["date",'time'], - standard_unit = names.unit_date, + standard_unit = names.unit_less, ) diff --git a/mibiscreen/data/settings/standard_names.py b/mibiscreen/data/settings/standard_names.py index 661872b..fe15674 100644 --- a/mibiscreen/data/settings/standard_names.py +++ b/mibiscreen/data/settings/standard_names.py @@ -16,8 +16,8 @@ unit_count = 'nr' unit_date = 'date' unit_less = '' -unit_mgNperl = "mg/l" -unit_mgPperl = "mg/l" +unit_mgNperl = "mg/l-N" +unit_mgPperl = "mg/l-P" unit_celsius = 'C' unit_date = 'date' diff --git a/mibiscreen/data/settings/unit_settings.py b/mibiscreen/data/settings/unit_settings.py index b66fc40..326d995 100644 --- a/mibiscreen/data/settings/unit_settings.py +++ b/mibiscreen/data/settings/unit_settings.py @@ -15,11 +15,11 @@ ) properties_units[names.unit_mgNperl]=dict( - other_names = ["mg/l", 'ppm', "mgN/l",'ppm N','ppmN','ppm-N','ppm_N',"mg/l as N","ppm as N"], + other_names = ["mg/l", 'ppm', "mgn/l",'ppm n','ppmn','ppm-n','ppm_n',"mg/l as n","ppm as n"], ) properties_units[names.unit_mgPperl]=dict( - other_names = ["mg/l", 'ppm', "mgP/l",'ppm P','ppmP','ppm-P','ppm_P',"mg/l as P","ppm as P"], + other_names = ["mg/l", 'ppm', "mgp/l",'ppm p','ppmp','ppm-p','ppm_p',"mg/l as p","ppm as p"], ) properties_units[names.unit_microgperl]=dict( @@ -53,9 +53,9 @@ other_names =['nr','number','count'], ) -properties_units[names.unit_date]=dict( - other_names = ['date','time','hr'], - ) +# properties_units[names.unit_date]=dict( +# other_names = ['date','time','hr'], +# ) properties_units[names.unit_less]=dict( other_names = ['',' ',' ','-',np.nan], @@ -66,3 +66,4 @@ if key != names.unit_less: all_units = all_units + properties_units[key]['other_names'] +properties_units \ No newline at end of file From 456796ef9fb7847ec51ea3cf5d44c944270a746b Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Fri, 3 Oct 2025 09:46:52 +0200 Subject: [PATCH 5/7] correct unit handling 
for new quantities --- mibiscreen/data/check_data.py | 4 ++++ mibiscreen/data/settings/unit_settings.py | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mibiscreen/data/check_data.py b/mibiscreen/data/check_data.py index 38e6120..1a81616 100644 --- a/mibiscreen/data/check_data.py +++ b/mibiscreen/data/check_data.py @@ -341,6 +341,7 @@ def check_units(data, **properties_contaminants, **properties_metabolites, **properties_isotopes, + **contaminants_analysis, } ### run through all quantity columns and check their units @@ -353,6 +354,8 @@ def check_units(data, else: col_not_checked.append(quantity) continue + + ### check on given unit (also considering alternative unit names) if standard_unit != names.unit_less: other_names_unit = properties_units[standard_unit]['other_names'] if str(units[quantity][0]).lower() not in other_names_unit: @@ -369,6 +372,7 @@ def check_units(data, print(" Quantities not in requested units:") print(*col_check_list, sep='\n') if len(col_not_checked) != 0: + print('________________________________________________________________') print(" Quantities not identified (and thus not checked on units):") print(*col_not_checked, sep='\n') print('================================================================') diff --git a/mibiscreen/data/settings/unit_settings.py b/mibiscreen/data/settings/unit_settings.py index 326d995..22d8ddc 100644 --- a/mibiscreen/data/settings/unit_settings.py +++ b/mibiscreen/data/settings/unit_settings.py @@ -65,5 +65,3 @@ for key in properties_units.keys(): if key != names.unit_less: all_units = all_units + properties_units[key]['other_names'] - -properties_units \ No newline at end of file From fc1cc9afeaeb09c3451b236b51cdbabecaa79d6a Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Fri, 3 Oct 2025 10:36:33 +0200 Subject: [PATCH 6/7] add alkylphenols to list of contaminants that are identified by default --- mibiscreen/data/settings/contaminants.py | 262 +++++++++++++++++++++ 
mibiscreen/data/settings/standard_names.py | 26 +- tests/test_concentrations.py | 2 +- 3 files changed, 288 insertions(+), 2 deletions(-) diff --git a/mibiscreen/data/settings/contaminants.py b/mibiscreen/data/settings/contaminants.py index 8582a12..bd9b1f5 100644 --- a/mibiscreen/data/settings/contaminants.py +++ b/mibiscreen/data/settings/contaminants.py @@ -788,6 +788,224 @@ standard_unit = names.unit_microgperl, ) +### Alkylphenols +properties_contaminants[names.name_phenol] = dict( + chemical_formula = "C6H6O", + molecular_mass = 94.11, + carbon_atoms = 6, + hydrogen_atoms = 6, + other_names = ['phenol', 'hydroxybenzene', 'benzenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_thymol] = dict( + chemical_formula = "C10H14O", + molecular_mass = 150.22, + carbon_atoms = 10, + hydrogen_atoms = 14, + other_names = ['thymol', '2_isopropyl_5_methylphenol', '5_methyl_2_isopropylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_cresol] = dict( + chemical_formula = None, + molecular_mass = None, + carbon_atoms = None, + hydrogen_atoms = None, + other_names = ['cresol', 'methylphenol', 'hydroxytoluene'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_m_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms = 8, + other_names = ['m-cresol','m cresol','mcresol','m_cresol', 'methylphenol', '3_methylphenol', '3-methylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_o_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms = 8, + other_names = ['o-cresol','o cresol','ocresol''o_cresol', '2_methylphenol', '2-methylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_p_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms 
= 8, + other_names = ['p-cresol','p cresol','pcresol','p_cresol', '4_methylphenol', '4-methylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_2_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = ['2_ethylphenol', '2-ethylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_3_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = ['3_ethylphenol', '3-ethylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_23_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '23_dimethylphenol','23-dimethylphenol','23 dimethylphenol','23dimethylphenol', + '2,3_dimethylphenol','2,3-dimethylphenol','2,3 dimethylphenol','2,3dimethylphenol', + '23_dimethylphenol','23-dimethylphenol','23 dimethylphenol','23dimethylphenol', + '2,3_dimethylphenol','2,3-dimethylphenol','2,3 dimethylphenol','2,3dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_24_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '24_dimethylphenol','24-dimethylphenol','24 dimethylphenol','24dimethylphenol', + '2,4_dimethylphenol','2,4-dimethylphenol','2,4 dimethylphenol','2,4dimethylphenol', + '24_dimethylphenol','24-dimethylphenol','24 dimethylphenol','24dimethylphenol', + '2,4_dimethylphenol','2,4-dimethylphenol','2,4 dimethylphenol','2,4dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_25_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + 
'25_dimethylphenol','25-dimethylphenol','25 dimethylphenol','25dimethylphenol', + '2,5_dimethylphenol','2,5-dimethylphenol','2,5 dimethylphenol','2,5dimethylphenol', + '25_dimethylphenol','25-dimethylphenol','25 dimethylphenol','25dimethylphenol', + '2,5_dimethylphenol','2,5-dimethylphenol','2,5 dimethylphenol','2,5dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_26_dimethylphenol26] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '26_dimethylphenol','26-dimethylphenol','26 dimethylphenol','26dimethylphenol', + '2,6_dimethylphenol','2,6-dimethylphenol','2,6 dimethylphenol','2,6dimethylphenol', + '26_dimethylphenol','26-dimethylphenol','26 dimethylphenol','26dimethylphenol', + '2,6_dimethylphenol','2,6-dimethylphenol','2,6 dimethylphenol','2,6dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_34_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '34_dimethylphenol','34-dimethylphenol','34 dimethylphenol','34dimethylphenol', + '3,4_dimethylphenol','3,4-dimethylphenol','3,4 dimethylphenol','3,4dimethylphenol', + '34_dimethylphenol','34-dimethylphenol','34 dimethylphenol','34dimethylphenol', + '3,4_dimethylphenol','3,4-dimethylphenol','3,4 dimethylphenol','3,4dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_35_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '35_dimethylphenol','35-dimethylphenol','35 dimethylphenol','35dimethylphenol', + '3,5_dimethylphenol','3,5-dimethylphenol','3,5 dimethylphenol','3,5dimethylphenol', + '35_dimethylphenol','35-dimethylphenol','35 dimethylphenol','35dimethylphenol', + 
'3,5_dimethylphenol','3,5-dimethylphenol','3,5 dimethylphenol','3,5dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + + +properties_contaminants[names.name_235_trimethylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 138.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '235_trimethylphenol', '235-trimethylphenol', '235 trimethylphenol', '235trimethylphenol', + '2,3,5_trimethylphenol', '2,3,5-trimethylphenol', '2,3,5 trimethylphenol', '2,3,5trimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_345_trimethylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 138.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '345_trimethylphenol', '345-trimethylphenol', '345 trimethylphenol', '345trimethylphenol', + '3,4,5_trimethylphenol', '3,4,5-trimethylphenol', '3,4,5 trimethylphenol', '3,4,5trimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_4_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '4_ethylphenol', '4-ethylphenol', '4 ethylphenol', '4ethylphenol', + 'p_ethylphenol', 'p-ethylphenol', 'p ethylphenol', 'pethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_2_isopropylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 136.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '2_isopropylphenol', '2-isopropylphenol', '2 isopropylphenol', '2isopropylphenol', + 'o_isopropylphenol', 'o-isopropylphenol', 'o isopropylphenol', 'oisopropylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_ptertbutylphenol] = dict( + chemical_formula = "C10H14O", + molecular_mass = 150.22, + carbon_atoms = 10, + hydrogen_atoms = 14, + other_names = [ + 'p_[tert]butylphenol', 'p-(tert)butylphenol', 'p tert 
butyl phenol', 'ptertbutylphenol', + 'para_tert_butyl_phenol', 'para-(tert)-butyl-phenol', 'para tert butyl phenol' + ], + standard_unit = names.unit_microgperl, +) + ############################################################################### ############################################################################### @@ -871,6 +1089,50 @@ standard_unit = names.unit_microgperl, ) +contaminants_analysis[names.name_total_c2_alkylphenols] = dict( + other_names = [ + 'c2-alkylphenols_total',"c2_alkylphenols_total","c2alkylphenols_total", "c2 alkylphenols_total", + "c2-alkylphenols-total","c2_alkylphenols-total","c2alkylphenols-total", "c2 alkylphenols-total", + "c2 alkylphenols total","c2_alkylphenols total","c2alkylphenols total", "c2 alkylphenols total", + "c2alkylphenolstotal","c2_alkylphenolstotal","c2alkylphenolstotal", "c2 alkylphenolstotal", + "total_c2_alkylphenols", "total-c2-alkylphenols", "total c2 alkylphenols", "totalc2alkylphenols", + "alkylphenols_c2_total", "alkylphenols-c2-total", "alkylphenols c2 total", "alkylphenolsc2total", + "c2_alkylphenols", "c2-alkylphenols", "c2 alkylphenols", "c2alkylphenols", + "alkylphenols_c2", "alkylphenols-c2", "alkylphenols c2", "alkylphenolsc2" + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_c3_alkylphenols] = dict( + other_names = [ + 'c3-alkylphenols_total',"c3_alkylphenols_total","c3alkylphenols_total", "c3 alkylphenols_total", + "c3-alkylphenols-total","c3_alkylphenols-total","c3alkylphenols-total", "c3 alkylphenols-total", + "c3 alkylphenols total","c3_alkylphenols total","c3alkylphenols total", "c3 alkylphenols total", + "c3alkylphenolstotal","c3_alkylphenolstotal","c3alkylphenolstotal", "c3 alkylphenolstotal", + "c3_alkylphenols_total", "c3-alkylphenols-total", "c3 alkylphenols total", "c3alkylphenolstotal", + "total_c3_alkylphenols", "total-c3-alkylphenols", "total c3 alkylphenols", "totalc3alkylphenols", + "alkylphenols_c3_total", "alkylphenols-c3-total", 
"alkylphenols c3 total", "alkylphenolsc3total", + "c3_alkylphenols", "c3-alkylphenols", "c3 alkylphenols", "c3alkylphenols", + "alkylphenols_c3", "alkylphenols-c3", "alkylphenols c3", "alkylphenolsc3" + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_c4_alkylphenols] = dict( + other_names = [ + 'c4-alkylphenols_total',"c4_alkylphenols_total","c4alkylphenols_total", "c4 alkylphenols_total", + "c4-alkylphenols-total","c4_alkylphenols-total","c4alkylphenols-total", "c4 alkylphenols-total", + "c4 alkylphenols total","c4_alkylphenols total","c4alkylphenols total", "c4 alkylphenols total", + "c4alkylphenolstotal","c4_alkylphenolstotal","c4alkylphenolstotal", "c4 alkylphenolstotal", + "c4_alkylphenols_total", "c4-alkylphenols-total", "c4 alkylphenols total", "c4alkylphenolstotal", + "total_c4_alkylphenols", "total-c4-alkylphenols", "total c4 alkylphenols", "totalc4alkylphenols", + "alkylphenols_c4_total", "alkylphenols-c4-total", "alkylphenols c4 total", "alkylphenolsc4total", + "c4_alkylphenols", "c4-alkylphenols", "c4 alkylphenols", "c4alkylphenols", + "alkylphenols_c4", "alkylphenols-c4", "alkylphenols c4", "alkylphenolsc4" + ], + standard_unit = names.unit_microgperl, +) + contaminants_analysis[names.name_total_contaminants] = dict( other_names = ["sum_contaminants","sum-contaminants","sum contaminants","sumcontaminants", "total_contaminants","total-contaminants","total contaminants","totalcontaminants", diff --git a/mibiscreen/data/settings/standard_names.py b/mibiscreen/data/settings/standard_names.py index fe15674..cee848d 100644 --- a/mibiscreen/data/settings/standard_names.py +++ b/mibiscreen/data/settings/standard_names.py @@ -146,9 +146,28 @@ name_26_dimethylnaphthalene = '26_dimethylnaphthalene' name_27_dimethylnaphthalene = '27_dimethylnaphthalene' +### Standard names for alkylphenols +name_phenol = "phenol" +name_thymol = "thymol" +name_cresol = "cresol" +name_m_cresol = "m_cresol" +name_o_cresol = "o_cresol" 
+name_p_cresol = "p_cresol" +name_2_ethylphenol = "2_ethylphenol" +name_3_ethylphenol = "3_ethylphenol" +name_23_dimethylphenol = "23_dimethylphenol" +name_24_dimethylphenol = "24_dimethylphenol" +name_25_dimethylphenol = "25_dimethylphenol" +name_26_dimethylphenol26 = "26_dimethylphenol" +name_34_dimethylphenol = "34_dimethylphenol" +name_35_dimethylphenol = "35_dimethylphenol" +name_235_trimethylphenol = "235_trimethylphenol" +name_345_trimethylphenol = "345_trimethylphenol" +name_4_ethylphenol = "4_ethylphenol" +name_2_isopropylphenol = "2_isopropylphenol" +name_ptertbutylphenol = "p_[tert]butylphenol" ### Standard names for a selection of metabolites -name_phenol = "phenol" name_cinnamic_acid = "cinnamic_acid" name_benzoic_acid = "benzoic_acid" name_dimethyl_benzoic_acid = 'dimethyl_benzoic_acid' @@ -177,6 +196,11 @@ name_fraction_C22_C30 = 'fraction_C22-C30' name_fraction_C30_C40 = 'fraction_C30-C40' name_total_C10_C40 = 'total_C10-C40' + +name_total_c2_alkylphenols = "c2_alkylphenols_total" +name_total_c3_alkylphenols = "c3_alkylphenols_total" +name_total_c4_alkylphenols = "c4_alkylphenols_total" + ### Standard names for metabolite related quantities name_metabolites_conc = "metabolites_concentration" # name_metabolites_variety = 'metabolites_count' diff --git a/tests/test_concentrations.py b/tests/test_concentrations.py index 92564fe..085bdfb 100644 --- a/tests/test_concentrations.py +++ b/tests/test_concentrations.py @@ -97,7 +97,7 @@ def test_total_contaminant_concentration_01(self): Correct calculation of total amount of contaminants (total concentration). 
""" - tot_conc_test = 27046.0 + tot_conc_test = 27046.5 tot_conc = np.sum(total_contaminant_concentration(self.data)) assert (tot_conc - tot_conc_test)<1e-5 From a5fd29c50139b1b89ff39d8dabf810a503434c3b Mon Sep 17 00:00:00 2001 From: Alraune Zech Date: Fri, 3 Oct 2025 13:50:49 +0200 Subject: [PATCH 7/7] include check on duplicate column names after standard name identification, extend testing --- mibiscreen/data/check_data.py | 57 +++++++++++-- .../data/example_data/example_duplicate.xlsx | Bin 0 -> 11013 bytes mibiscreen/data/load_data.py | 11 +-- tests/test_data.py | 78 ++++++++++++++++-- 4 files changed, 128 insertions(+), 18 deletions(-) create mode 100644 mibiscreen/data/example_data/example_duplicate.xlsx diff --git a/mibiscreen/data/check_data.py b/mibiscreen/data/check_data.py index 1a81616..32279b0 100644 --- a/mibiscreen/data/check_data.py +++ b/mibiscreen/data/check_data.py @@ -40,10 +40,12 @@ def standard_names(name_list, Returns: ------- - tuple: three list containing names of + tuple: three lists containing names of list with identitied quantities in data (but not standardized names) list with unknown quantities in data (not in list of standardized names) list with standard names of identified quantities + + one dictionary mapping identified quantities to their + standard values for fast name transformation Raises: ------- @@ -60,7 +62,6 @@ def standard_names(name_list, names_unknown = [] names_transform = {} - if isinstance(name_list, str): name_list = [name_list] elif isinstance(name_list, list): @@ -76,12 +77,9 @@ def standard_names(name_list, **contaminants_analysis, } dict_names=_generate_dict_other_names(properties_all) -# print(dict_names) other_names_contaminants = _generate_dict_other_names(properties_contaminants) other_names_isotopes = _generate_dict_other_names(properties_isotopes) - # dict_names= other_names_all.copy() - for x in name_list: y = dict_names.get(x, False) x_isotope = x.split('-')[0] @@ -245,6 +243,21 @@ def 
check_columns(data_frame, column_names_unknown = results[2] column_names_transform = results[3] + ### check on duplicates in column names after name standardization + ### (i.e. in case the same quantity has been provided with different non-standard names) + duplicates_indices = _check_duplicates_in_list(column_names_standard) + + if duplicates_indices: + # duplicates found: warn and list the original column names involved + print("WARNING: Duplicates found in list of standard names:") + for name, indices in duplicates_indices.items(): + print(f"'{name}' occur has been provided as:") + for i in indices: + # Print the original (non-standardized) column name at index i + print(f" - '{column_names_known[i]}' (index {i})") + print('Remove or rename duplicate quantities in data.') + print('________________________________________________________________') + if standardize: data.columns = [column_names_transform.get(x, x) for x in data.columns] @@ -257,7 +270,7 @@ def check_columns(data_frame, print('----------------------------------') for i,name in enumerate(column_names_known): print(name," --> ",column_names_standard[i]) - print('----------------------------------') + print('---------------------------------------------------------') if standardize: print("Identified column names have been standardized") else: @@ -648,3 +661,35 @@ def _generate_dict_other_names(name_dict, other_names_dict[other_name] = key return other_names_dict + +def _check_duplicates_in_list(name_list): + """Finds duplicate strings in a list and returns their indices. + + Parameters: + ----------- + name_list : list of str + The list of strings to check for duplicates. + + Returns: + -------- + dict + A dictionary where keys are duplicated strings and values are lists + of indices where these duplicates occur in the input list. 
+ + Example: + -------- + >>> find_duplicates_with_indices(['a', 'b', 'a', 'c', 'b']) + {'a': [0, 2], 'b': [1, 4]} + """ + duplicates_indices = {} + + for index, name in enumerate(name_list): + if name not in duplicates_indices: + duplicates_indices[name] = [index] + else: + duplicates_indices[name].append(index) + + # Filter to keep only duplicates (more than one occurrence) + duplicates_indices = {name: indices for name, indices in duplicates_indices.items() if len(indices) > 1} + + return duplicates_indices diff --git a/mibiscreen/data/example_data/example_duplicate.xlsx b/mibiscreen/data/example_data/example_duplicate.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ec9f1d013a2be874dc8a2318d2023bc0944e89e1 GIT binary patch literal 11013 zcmbVy1yEe;(k<>T!Ce#FonQkDguy*Pu;2_ZSa5<%2yVd%4#8c5y95XhA-KEygPi;R zlaqJvt@>Y2jaK!Vz583=>fXCsNgfss4+;qh35rQqQWxqsBYynsYRzWgU}_Cva{)V8 z8G*r8tgbfJtWUo*$BJ8auwwx(1OgI0S>}M_D$HiMC^I!%(EcJdudIQxF*Ws`6S>?N zaH9c7a)Sx$4?%>v(;gTi?lK!f+3+Sn6f`H(ZBQ-DkMj|ik7dsZx=_!f+i^<;ceFQ0 z^?j%-6jN^GmMA@qn78f53HfAEa;)m(Cd6xcPq_6UeCIyL{V{u%(6*Xpt>+A34@bNT zXeAN5_`e=b;)v_>N{gsJyZD^M^fgW-Qih1OB@XN1zmJ?LFy$M|oQKDGk zyj6T|M6ARfe|}n+0ljy&7teSb z!AWZQz|MKW-9?!JV1kom8KN$MN>EVH>gl=?5RBN3QeyPX8Ft;b-;wpZ;M+y=gjDPk zW`{UP0eTcAKa?8~dH9`v^{C-~kSQzG zE{VhYT<)gmNF~jt<}A9&Si{c37l-BSUjCYl^K*Dz?BIZa1V6^LS;9+7?r4U9%z^b~ zVC#?j8a|mI*Srs|qbT*SO{L|DxVYz3xv4{`ZxT!=kj{El-CKl(QR8+z2lJ(R!tQ?9 zqyr-8@a?8a&PH5ZC@1N!1TD697w~%F$W5rXAs1(EOQs3p6VPqpv za>{P$2WLSxF=~dkMsX~EbXL;CY(Q;Vw}j>F0}rvvXM|ZpJ4jno2{y;gkryF)%qaV| z)JfSCa>tzbhGgl>O=fU{{Pug3sPKfB7Sg2K{w<%^z@y(Vu|ZbZZ= z+U_kXpN!mz+`vC^?eYaVI~xoXl<jf>BW_GZ@+4KmJ74ZOshBr4nWYWHq zHU<>n5-+@B6dxK;+Z4`RY_VNrV=TvTM5*|m_>D%#?W9 zKr_j%f_CP)yS4=$KR7uh2SLB7INDk;X)p;U(*hUl#n3Xy*=DbL%n&d;L5)DipWeVC zW%_(H!z01~Ms{k0_9ajlFdgG3U%5hclI)4)?pNY-ik0eB+;>5dLp4H)D~OC~x`W>c zHHKc#EQ~${wPkpn{G)1;#6hGP3TmSzc4=l5W*InN#*(iev5GZvkTFnux&n=9A1-ot z*}sT!8Ib3GU90KE`}1mn?CU%bwJ_lX+lv@5y#%W!n~9mt7h!h&-X!N%ICSJKHuE{U 
zSB{pI$F}R9V8{1l8u*VnGj5dJQzQBG`&xzw^AL(~C?JrIl&vPiU-%mD`4T)~*f8IT zBR4a)4hM;c|AC?j;%p0f-(EVPLVsCqlTW9M$9~dQ&HR%ki9~;@keo+ZrmU`9lx;H? zA#`InK>d;qJv1S8b{p;v0?*4Vzx@TEk0bceEU(C~|Eq-u_)bOLLZW2&Idsy^2($}5 z8pO1C&WB|iuN~+V6l-Umd=caIslV7;AknmUflXL*{9uAQyV0BCt#Q)c$=-v&*SmJD z4e$U0#d)RK9qtpo&dZM8lRwgH6zRW78Tw!Jf;hgjHibM9YfpRB_7gkaqmXHN86aCE zSK(%E3!2UjsFf6OTg>{wA`}yo5alUcpEY*-Ljk9Fm20c0*?z9L?~9OH_xQN9$O<%M zzc5n?FL(dm5gaNO_^Q*`gDcfTxQlU0%HLmJyvG#?@!0o1bZA~Z70gtl>7_j8096f( z3V_IIx#_N=2J?N=WD8@>%?qddxB$RJ5|#c`uTnlom3Fvu5e`L$f?;LK4GF(+6p8ag z7C1Zp*52I4SSr~oIe2w{z0W>U5bYOaOxgsL0$1Md7+~oo zkyd2vY?*xf%qtF9XJ9q>Y{l? zsB5{Cw0e*QZf>wqU977e&V|Z1z*PMUy2Ku@NkG-8O<_MSJ6%SK0ZIFa z`Q9r`f99+qCwV+{E&UJKYXw7Uyt45}Yf{OyU04;Z8`Phn_3WO7#n!--#(wAi*2Ssc z+9IRj%NP|ZB4)`|yzDT;UDWc{*LSze`16*;LuB$abH(kc5b#L2iQWV;AUa3}ux+oI+ zugD;8Y96=%hOU6ax>bCTgbwwRaF4E_4YNXE9le%usu*c&q? zE34xExUse7fvSWv%JGyKwblo#L>-K=m~`bU)pH^Y;qVK*WTkSN#ll|1@|%sx)|RSxi@YhOTc*Jdh}aOX^@YO z!z>GRGO{fISE8frBvsHfhB3@m3{JUoj!+z+IZH5a{irN*e4{+!ux$?`G+6~^MVAN> zNrTQd$T5WjfoVUYSw1i!QvJx390VY-lbPn+Az^>qQ~S(?kidC_6y%G`n6Xa zLw-L>Sc>HI8_yt}*o%$baq1xwk4}znoGA_H^8`f@;>vWtB&rB{8fnF6Y7(ngk&t9O zB_4o)yHqi?VYw}mgRbDyA~gnMX0+c$ zfy1H6Ykd*coLQHtr~+tZd_yrc;A@#@Q&DS9CQfAaUn7c3X)vdvhQLvQ?1Jep!XwBN zD#Q=af-5BRoM_{EKjfmN9=R2AqlIUo#SoaLvpPxZjmyb+0Cq1B*wRWbwRe;>k51kM zPg^mS=6%3$Lb%J+rd&|Om(?t7IX-XZqU^uU)${EO?oNAa#elMI@EA`X-m_y3yU%C)Vy6{Yx8D7d9u^l0G>Zx9tsS z8HB=!Nx_w(hy1|cozh?lYJ``1doL#ns@aKXo_i0jA_w4$6UdL<_X4)NQ~TJaC<7)Y-^JYy&CpW0}PC^YB)fHGz@oVa63S>`m`l4ix>C+@+D=!)=px6r#dZ7`Sif ziWbxu;SQs{c^~|TIbj|?P*-ur5Xg&hy*E{aDPpRqh(s=vsF)M4QEj1Q{`yKiiK4^K zt7-SXFF4T9{yp?^zFvpvQaQWk)LGxp8bO;+w=hqFn)U(Uhzbt{RsLA${;vg<^QmHK zPSl0W^5gX^UZ}d>e#j~rOX&0 zdY?wbHk$~A*VCcVYA^n9>TX0AcIpB-i5({k^bNa;#< zFs-xgQs7+#dc#a4)JOBOkM`{l8Z?>Uu)|6+U}a_X)M4#z02&#faKlxOD&jb`T|YZ# zCJfkWF3mTOZ0>=gFmgQG{AKm3|MuZ+!RWIB^5ETAZ{BB!Ut>(erw2sun62}&R7Pm2 zZx!gQG2Qy_leI~3><7us8IzmDl>@uMK z8X)mDR<1gMp_xk!w)5p{V(%M0p^30-wb!z*oIR0sediIDZlR`ybcC=+!)lCm&nQ$% 
zA=T?C6m=HvuaGQ9AxR40RGdSh6~e_3WSMiz*fe&c(oFWH-;MPf$wE;> ziu2m2Tka`<71)@z8!aa}$p<(lOv@@`q1C{=#2H%CZc%48dB~3l2Y1I$vnA7iNH9ph z;Zm-^z7tiKR9j|1zWM?-5ZH_NUZ~%$2~!CfepO!Fc=+A1Q{jkJ&_Q2d;?sJy{!iEl zIJWVEVr0a3p|)KQ84TOW1&bB!*m9LBxPG4979J?f2V)WbRG@I%xDmqU{_E>3AV-vD z>eOa`(VoJ2;q$^Yl167@7WJHYPcMAR2m}%7kIz);hMG5`o?);7prTe!*$T7T#~ttr zEXrHxVKoN4V{8Kqrj=Lum{%fe3K@RX_T*WRk_+;KIVdIh%?3wC0}m7L(VP%B-LDW|6$p1PBzjipqKcNmwesoQ9FJX~7PIwf@Lg3xkJ z`PD}mxGjhylPW=C${N)XlV7P{$k}+8smwK?wP`o=P-d)j)QA^|y{Adt64av%FUPo= z)l~Q6`ih?HoF|zEJIgyk^waWf#WM7!evnUgy)TsXv&SPX3LJCw(Vh&fC_Jm=UGp$~ zX0_qhH9H!++3}E%hICso%Mpx^O=*{k)P_h($fO;V+>#7! z*iM-0j)UKE6lR1sjr(Vt2jfO)Ta?+ihR294+MO8A!y4&Qlge23uL^bPC0Z$GBt0$5kDDawbe5RvKz;p0>TY=cXxC?SMM$M{%cH8X;Ti8XQF-dNFhX zLkf^29#FQUXK)1O)Mdv85BGl?R&OqbK(2 z;QGHg;nQC^fnE62!Wa!YCMP9O7mF3B@SoKOKu8CVnJaV-+fPSBQ{v|xK2 znV9`@<5&o0hxYa`(QJ~O3uMkwxiC6iV~0JOh&PvXcjo2k@K*PA2DLO1%i*9``}{ro z1D&wad8=x806YeYXr(GuKWfjCaww!zrm7RZm^OAzwv}#9u*rn5YdcpM(sbmyy&8;H zb8gqBzv4%a9C8>f-AGCxA!Nue2{X3w{Q1@Bobl}oZw^UB#F z$+d?-ZR9?~i86s4$dp?LM0EnivOOw_QCj9NkZFxNFOf|LaxT&QiZBr+_BBm@)tMgD z8dtO!Z5Jr_!_uf~<^8(4QG@*R5aZWqf$AKPQ~pYZ!vwtKFxO~3m;v!d0TTF@)e{KK zwYroyr4jov!^@XON&qwj>V}ok2c?y&P=2ytSJC7`VzLF$oh1NfNKzW}nQF;5d#8;Z zrJ-C-hi}^i&Pj5`nq+t^9erm0Mb525q{TN^{Ep|GZE{EDrFM;>0D~nbeEAY2ozOmQ z%?jek6c@UkmQHZA6Vw}pQFr6ewEjUWE>23%+j0{QN)B<$zRw?84=b+AP42Qj!#W&y zQ!;qMwLm+F;Sf^JMN0rdZnO+62s&KkM9CA8-Lhs2Q*|r~h`A5l?02FC1<(*N2g}w| z#5V{uuvh&DHqRmMrFWu3hM+bQ2+B29cTKvFIc(`O^gQbRMktqWB zhpLTS@Swvi>{^lmBP`B|0zcUsi$i@7BKdHLI7HTXcm6DL{kzXrRZG6SG>W}mA@+dW z5+R1MGQ@h3SH^i9CR@@oA|qUQTq4678bb26L=_P&{k_y3yvztiI`$k6Qc8iJh>4ps zv0C2UI!;7~r7L>V_%)(nb0&J@yLgg@#f@^ z!jouSy-7r)xoFleNujvolJ@D#%fy8N3tdQfMS+t^dMfmU)OPWUCX8-%%*){C*k_5R zZNam|7S#MRZ04+a2CPBRS+FkJIGe?<_GGKerK)Hvm@K!AOuTH ztncva?|ou;OFqUwxiWoA)<1B8dUmsLL56>DV2HIkNkDjPYY`D@06uz_E1XLZH+=Rz z{Uy#>wx~fMXhp?sD&)@Pz@eogc1b!Q|27cR=v7zz@QLGrDzE#{U0M zMebj!n4k;!O%(@*#8|ubX=^I?tY=N49fc~voJiTovTD(LwCP?SX~Q|#7Y1z^-q{U~ 
zrbDjCtkYvWJPPHk!s;o*K{ks=)~MsLrv6{@&TE5;F7j&VXtRi35Ot~E5TV|z?ADs} zTxK(}$3NrGFtQ|2ncVooGzOz7%U|4$2Xg3UqFstE)l-Iqb9$<(H~4OJ*#;n0e+Yuy z6H_pc07D6w7t|8vR{U3RfU*X7pP#vd6=#ZWO-c7MzI%$M;t{NWKHCU_^~MJZ;4&wG z`U~l+4dcxAXOvSFg)hNrw@7jeI1>eL4vOUrR80deS8}~DTdXLnsp+Cd8s50an?Kaf z_XD4EJhXF~J4DF&E)`2#d3;=})kz;fbVd!lD;JWVUtiab#@-C-A6({7pC!1z;?zK4 zxeWW6y4h4K)#j`jxtJ?ubO0iG_(|JHkLRa;tYF+^+$g~G5*pR@RU@Leu}=-3T?H#; zhgHhJWR6sk`|D-3F*>{$zU6pX@#Tp#z zbhBA|2fFwpR;Po!$?}jwQ5~jk@;KntnOcG) zQ7HJs_nxx8_8Kg$+G&%@!{#sI3#eBi6b*C(CpkHAHgmBz`!}9JUrzmC$-u5qVF$D^ zUEf6^x9s8cURc#-8GPww)R>v{7#^BUVv6VSf&5%mrAN;aGBn}IW+;80kBNl7l=mtA zRvKy_VmWGI+<+aeu3(g_2;ax~N~U8w5#KF&7WYyyADSjzJ-xMp%NCT%sOI5i(^(cm z7$F7sc4?~=lZb~ZyFAhtvb>Ym`jf}NSvE_%eP(!>_ye-F;ApoXcYT7t^26sH!AkLW zO4TTNqLLCTzMtT=YPs0#YhScU+gsq-%PdhatxE#zinRy<-jtw^%17Mvq-l;iASsZuA(sx|D^%+FSW|8N7@we=|fzAa6 ziDx4l<@LA=WF0rVrl?LFu2sL7se!PZkKs2UZ+Na^jALS}KZ=l;a)TiEjeg^cPK)glvqbf!GY zdSe|C1rKtXHm0;AzQS>u{$Y~&lO8dq1A9xb9P(Q6NoU6A80u$1^0 zXZZ2SLK;-pH)Q%98XZHKlUKg?7iM1`PH)0GMe5XkC>T8~HqJ6I+tf`^5q}MtE1_VW z+w(lfeA2zV$ds9iN8N+Q`fs|2_pe5VxuJupiK?T6g{|4Ienwn;8)%*#OYGn-wD;nt zYeK;L3UQ-}P}mgndTDwhBl+`bSJH;L(w5betI<_sn6z zQ7DjMP2FzS2*u)DX;x;j{Uj^pZWprbJeq-9)_kne=Vo?+9;@dToIiO|1CX>@n*3Lt zSU*}s;o)v3p^2S?Y#a6^+-#J&Rkb)0s^mi60&W`GmrjPRKxiT}nGZ2*$ysS~$5~Xe zU`m8BQ9(kEGfvFrTgnH?ATwfbUkI~nU?O^$2f{5n5;HGit%G*&`l{vX=*OOzm9b2O znzhv{#;H)jWNU7BeDttmW=p+Q5maWU*=oH!twWm|yNCQ)n<8Ew2l89Z7OROb(k&%X z236Y?(q8&BU#%zbOQR38rfXqQ<=KO14nU-esFbD7HXuQ+ou;z*zOilnI{iIQvBqPR zdr51vwCRK&9W4jXDNDVYyK}TcW`= zB#G!sD2Q2D&0nookWZX{n+#7J$sf{FSMp!ods!5{R%+;q__V&<35MEf&`?m2|JtZR zeO%wa#ua{zD3nCEfjXZ0pYEYAE_fEikTP={U&%I7ag(>w+LBa2cTJ!~#DX03)Ll65O@i0`wRgjD?9DW?roG{s{1t4hcx-_b zq;8^I%bL_NRL;F}ud+Sz_?vLhO#@Nh{V?^cA6Pty#UMgsmsP+{pjzMWeKuYjL3I~9 zu$7pI+fcvmU}toSFw}(O&u-@Eu0Z1{`)WTOpNz{uE=(uwc3}NiKQADeJ9+;x2*Zy( zz5jes^jI4}nShOz9Kd!EHe;}Z=~LeKRMAsxf9&a9Xfk^)Hn1UC)I{Yk0da6Ci5D$h zh$c#`X4`)*&SCH>hk!5AAa47bl~?5j&9IC^x>4DTNKuIItLX^TBY1ZLUP`9%s>N2> 
z!mn<^5f6nmh6CGm;71DMGSCqBv}rPtV+3Z2Xpp;;_cAgC%TNsaY>J32tqWf;Kfn2m z|C1jruMS#L-jdNdu!WRtw5!4-J7>PAfsc_WkeC=tdWA9guy)eN@^lDM1d?N2CjhK_ zPS&gJ`P~E0S&zUG$X0zPbLn(dwJmd2Z&zgG6S_NwWQdaQJ64IX`r7u5esoaJ84SKX zVV)QN*@`3CuxxT9x7)yB>BaCNX>Fk$YFwW2o*K<29G!V?9rt-WK=B^RT3=OpZ*LhD z>;CP8ymW7BRqibF-cc=S|4qzUoiBPS55exkD7_cnwP8gB4vwO1%Igojgr{-%q;)rb zUBzsIxnHcBy&s_74%2)H`GC&Hld?p0d%_mR_QYzlwfs+ZkE~w#ugmo)X@4)5p`G2+ zYNbXhfZP9S?W=cI1N-EHl!&V=W|1v~9L;NA$iMFW?vW2vzPdoofZn(=`C3Ch$EK^8 zJA%Ouw`cufy-tisYb&;7>BzWaI0luD-w^5W(J^p6oCBo>Y?#l&IW*$CB} z>f+hL!F~GK(@xA@Svj3Sed&dw6i$baLglptwO1a#VR#$9FF!>dPO=zl2HUcpjp;cQ3pxe^3-~UFAK2FS^9{Lw#u@^1qw~~j8u;Ko z=rCqVD}g*b%be-qpXdn!#;h~|uu7V}3U|bCd(!SQbc~?dgz`)P?iIC%z}3+<&z(Gm z$>PD~4=|OxO}3ZW+^_RmnuPL6cs%N($LD`PFI`Ent)r=}qk-BxJ5z|>(|HZ4_dV(z zR`Uf@)@$YE8cKhqF>>L^CTUb&&ve z#2tSPi|MdNL#R*-*6+0F={*;t>indR(MMxGsO*~c+=I=0Q~vz%exF$<5%8orZRpM} ztr$9$cyn+m?`$uk*-L~ODXyYvNNH;$N41yy8sTlH8jvvkJ_(k_%1K(jq9Qyaq&JPV z?$TzQC;$2S2;F5guaYoIJz^lh-mDQnJlPrvNn1l>A1u>EG>uUJ6a22zS+|XfPSJewER)jz#LT*>&ztU^>z5^67`Or*^wfs1s*SJOZJXP%MxeB?gyJVnX&3kqk zQ_Ju5(MxQq{^BYhant~f!cnkdmfwS#nVaoGzEnyEA*9u<`~l+&UQ~mVcP8mIsjI3D zFGPP}@|b5x@dzYQxZ!pP{lrWa&uk{&M^-K%KCK`$3?9_q({;a3jy+A+{nP&QtlghD zzfT1{&6)jeqL2Uli1Yi@*`Ln8&wM;>kiV_)v1k40{O1_ap8&tNx1UBs{x)QUUjTm( ziu~#Ndn4zm|NXZaApg&S|3BUEKT&?a_4iMdGK_zV@~Z{(C(7@a%Kt>^e5@1y6~y0< z<=0KaKT&=!JpYL@OZ0D1eqCh$iSm1S^Hik%ZDZuWP#}MntbZc>p8223pueqx@*hRf zpN_xhpr^v%Z=0d|T|NI*Cj9CCyAVGms=qCm<~R3$rL8}`f0w&|a0 - def test_check_duplicates_01(self,capsys): - """Testing routine _check_duplicates(). + def test_load_excel_05(self,capsys): + """Testing routine load_excel(). + + Testing handling of duplicate column names from real excel file. + """ + data_t3= load_excel("{}/example_duplicate.xlsx".format(path_data),sheet_name= 'contaminants')[0] + captured = capsys.readouterr().out + + assert data_t3.shape == (5,12) + assert "WARNING: Looks like duplicate column names detected." 
in captured + assert " - 'naphthalene.1'" in captured + + def test_check_duplicates_in_df_01(self,capsys): + """Testing routine _check_duplicates_in_df(). Testing Warning in case of duplicates in dataframe. """ @@ -111,18 +124,16 @@ def test_check_duplicates_01(self,capsys): }) # Call the function - _check_duplicates(data) + _check_duplicates_in_df(data) # Capture printed output captured = capsys.readouterr().out # Check the warning is in the output - assert "WARNING: Duplicate column names detected" in captured - + assert "WARNING: Looks like duplicate column names detected." in captured # Check that the renamed columns are listed assert " - 'Name.1'" in captured assert " - 'Age.1'" in captured - # Check verbose messages appear assert "Consider renaming them." in captured @@ -397,6 +408,9 @@ class TestCheckDataColumns: data4check = pd.DataFrame([units,s01],columns = columns_mod) + columns_dup = ["sample","well","Depth",'pH', 'redox' , 'Sulfate', 'CH4','ironII','c6h6', 'benzene'] + data_dup = pd.DataFrame([units,s01],columns = columns_dup) + def test_check_columns_01(self): """Testing check_column() on complete example data. @@ -440,6 +454,54 @@ def test_check_columns_04(self,capsys): assert len(out)>0 + def test_check_columns_05(self,capsys): + """Testing routine check_column() on check and standardization of column names. + + Testing Warning if duplicate column names in identified standard names. + """ + check_columns(self.data_dup) + captured = capsys.readouterr().out + + # Check the warning is in the output + assert "WARNING: Duplicates found in list of standard names:" in captured + assert " - 'c6h6' (index 8)" in captured + assert " - 'benzene' (index 9)" in captured + assert "Remove or rename duplicate quantities in data." in captured + + def test_no_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are no identical strings in a list. 
+ """ + data = ['a', 'b', 'c', 'd'] + assert _check_duplicates_in_list(data) == {} + + def test_with_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are identical strings in a list. + """ + data = ['a', 'b', 'a', 'c', 'b'] + expected = {'a': [0, 2], 'b': [1, 4]} + assert _check_duplicates_in_list(data) == expected + + def test_all_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are only identical strings in a list. + """ + data = ['x', 'x', 'x', 'x'] + expected = {'x': [0, 1, 2, 3]} + assert _check_duplicates_in_list(data) == expected + + def test_empty_list(self): + """Testing routine _check_duplicates_in_list(). + + Checking if it works for an empty list. + """ + data = [] + assert _check_duplicates_in_list(data) == {} + class TestCheckDataUnits: """Class for testing data module of mibiscreen.""" @@ -757,6 +819,8 @@ def test_generate_dict_other_names_02(self): assert set(other_names.keys()) == set(self.other_names_1+self.other_names_2) + + class TestDataCompareLists: """Class for testing data module of mibiscreen."""