diff --git a/mibiscreen/data/check_data.py b/mibiscreen/data/check_data.py index e3de185..32279b0 100644 --- a/mibiscreen/data/check_data.py +++ b/mibiscreen/data/check_data.py @@ -40,10 +40,12 @@ def standard_names(name_list, Returns: ------- - tuple: three list containing names of + tuple: three lists containing names of list with identitied quantities in data (but not standardized names) list with unknown quantities in data (not in list of standardized names) list with standard names of identified quantities + + one dictionary mapping identified quantities to their + standard values for fast name transformation Raises: ------- @@ -60,7 +62,6 @@ def standard_names(name_list, names_unknown = [] names_transform = {} - if isinstance(name_list, str): name_list = [name_list] elif isinstance(name_list, list): @@ -76,17 +77,13 @@ def standard_names(name_list, **contaminants_analysis, } dict_names=_generate_dict_other_names(properties_all) - other_names_contaminants = _generate_dict_other_names(properties_contaminants) other_names_isotopes = _generate_dict_other_names(properties_isotopes) - # dict_names= other_names_all.copy() - for x in name_list: y = dict_names.get(x, False) x_isotope = x.split('-')[0] y_isotopes = other_names_isotopes.get(x_isotope.lower(), False) - if y_isotopes is not False: x_molecule = x.removeprefix(x_isotope+'-') y_molecule = other_names_contaminants.get(x_molecule.lower(), False) @@ -246,6 +243,21 @@ def check_columns(data_frame, column_names_unknown = results[2] column_names_transform = results[3] + ### check on duplicates in column names after name standardization + ### (i.e. 
in case the same quantity has been provided with different non-standard names + duplicates_indices = _check_duplicates_in_list(column_names_standard) + + if duplicates_indices: + # duplicates_indices is NOT empty — do something here + print("WARNING: Duplicates found in list of standard names:") + for name, indices in duplicates_indices.items(): + print(f"'{name}' occur has been provided as:") + for i in indices: + # Print the entry from other_list at index i + print(f" - '{column_names_known[i]}' (index {i})") + print('Remove or rename duplicate quantities in data.') + print('________________________________________________________________') + if standardize: data.columns = [column_names_transform.get(x, x) for x in data.columns] @@ -258,7 +270,7 @@ def check_columns(data_frame, print('----------------------------------') for i,name in enumerate(column_names_known): print(name," --> ",column_names_standard[i]) - print('----------------------------------') + print('---------------------------------------------------------') if standardize: print("Identified column names have been standardized") else: @@ -342,6 +354,7 @@ def check_units(data, **properties_contaminants, **properties_metabolites, **properties_isotopes, + **contaminants_analysis, } ### run through all quantity columns and check their units @@ -354,6 +367,8 @@ def check_units(data, else: col_not_checked.append(quantity) continue + + ### check on given unit (also considering alternative unit names) if standard_unit != names.unit_less: other_names_unit = properties_units[standard_unit]['other_names'] if str(units[quantity][0]).lower() not in other_names_unit: @@ -370,6 +385,7 @@ def check_units(data, print(" Quantities not in requested units:") print(*col_check_list, sep='\n') if len(col_not_checked) != 0: + print('________________________________________________________________') print(" Quantities not identified (and thus not checked on units):") print(*col_not_checked, sep='\n') 
print('================================================================') @@ -645,3 +661,35 @@ def _generate_dict_other_names(name_dict, other_names_dict[other_name] = key return other_names_dict + +def _check_duplicates_in_list(name_list): + """Finds duplicate strings in a list and returns their indices. + + Parameters: + ----------- + name_list : list of str + The list of strings to check for duplicates. + + Returns: + -------- + dict + A dictionary where keys are duplicated strings and values are lists + of indices where these duplicates occur in the input list. + + Example: + -------- + >>> find_duplicates_with_indices(['a', 'b', 'a', 'c', 'b']) + {'a': [0, 2], 'b': [1, 4]} + """ + duplicates_indices = {} + + for index, name in enumerate(name_list): + if name not in duplicates_indices: + duplicates_indices[name] = [index] + else: + duplicates_indices[name].append(index) + + # Filter to keep only duplicates (more than one occurrence) + duplicates_indices = {name: indices for name, indices in duplicates_indices.items() if len(indices) > 1} + + return duplicates_indices diff --git a/mibiscreen/data/example_data/example_duplicate.xlsx b/mibiscreen/data/example_data/example_duplicate.xlsx new file mode 100644 index 0000000..ec9f1d0 Binary files /dev/null and b/mibiscreen/data/example_data/example_duplicate.xlsx differ diff --git a/mibiscreen/data/load_data.py b/mibiscreen/data/load_data.py index ee54e86..6c98048 100644 --- a/mibiscreen/data/load_data.py +++ b/mibiscreen/data/load_data.py @@ -5,6 +5,7 @@ @author: Alraune Zech """ import os.path +import re import numpy as np import pandas as pd @@ -29,7 +30,7 @@ def load_excel( store_provenance: Boolean To add! **kwargs: optional keyword arguments to pass to pandas' routine - read_excel() + read_excel(), e.g. 
sep = ',' or sep = ';' Returns: ------- @@ -51,6 +52,11 @@ def load_excel( >>> load_excel(example_data.xlsx) """ + if verbose: + print('===================================') + print(" Running function 'load_excel()'") + print('===================================') + if file_path is None: raise ValueError('Specify file path and file name!') if not os.path.isfile(file_path): @@ -59,18 +65,16 @@ def load_excel( data = pd.read_excel(file_path, sheet_name = sheet_name, **kwargs) - if ";" in data.iloc[1].iloc[0]: - data = pd.read_excel(file_path, - sep=";", - sheet_name = sheet_name, - **kwargs) + + if verbose: + print("Reading data from file: {}".format(file_path)) + print('------------------------------------------------------------------') + + _check_duplicates_in_df(data) units = data.drop(labels = np.arange(1,data.shape[0])) if verbose: - print('==============================================================') - print(" Running function 'load_excel()' on data file ", file_path) - print('==============================================================') print("Unit of quantities:") print('-------------------') print(units) @@ -118,20 +122,29 @@ def load_csv( >>> load_excel(example_data.csv) """ + if verbose: + print('==================================') + print(" Running function 'load_csv()'") + print('==================================') + if file_path is None: raise ValueError('Specify file path and file name!') if not os.path.isfile(file_path): raise OSError('Cannot access file at : ',file_path) + if verbose: + print("Reading data from file: {}".format(file_path)) + print('------------------------------------------------------------------') + data = pd.read_csv(file_path, encoding="unicode_escape") if ";" in data.iloc[1].iloc[0]: data = pd.read_csv(file_path, sep=";", encoding="unicode_escape") + + _check_duplicates_in_df(data) + units = data.drop(labels = np.arange(1,data.shape[0])) if verbose: - 
print('================================================================') - print(" Running function 'load_csv()' on data file ", file_path) - print('================================================================') print("Units of quantities:") print('-------------------') print(units) @@ -142,3 +155,38 @@ def load_csv( print('================================================================') return data, units + +def _check_duplicates_in_df(data): + """Detects duplicate column names in a pandas DataFrame. + + When a DataFrame contains identical column names they are automatically + renamed by pandas (e.g., 'Column', 'Column.1', 'Column.2'). This function + identifies if such column names exist and prints a warning message. + + This function checks for column names that match the pandas auto-renaming pattern (`.1`, `.2`, etc.) + indicating that duplicate column names were present in the original data source (e.g., an Excel file). + + Args: + ----- + data (pd.DataFrame): The DataFrame to check for renamed duplicate columns. 
+ + Returns: + -------- + None + """ + # Check for duplicated column names + renamed_pattern = re.compile(r"^(.*)\.(\d+)$") # Pattern to match renamed columns + duplicate_columns = {} + for col in data.columns: + if (match := renamed_pattern.match(col)): + base = match.group(1) + duplicate_columns.setdefault(base, []).append(col) + if duplicate_columns: + print("WARNING: Looks like duplicate column names detected.") + print(" They were automatically renamed by pandas into:") + for base, renamed_list in duplicate_columns.items(): + for renamed in renamed_list: + print(f" - '{renamed}'") + print("Duplicate column names will not be identified as standard names.") + print("Consider renaming them.") + print('------------------------------------------------------------------') diff --git a/mibiscreen/data/settings/contaminants.py b/mibiscreen/data/settings/contaminants.py index b876996..bd9b1f5 100644 --- a/mibiscreen/data/settings/contaminants.py +++ b/mibiscreen/data/settings/contaminants.py @@ -9,6 +9,10 @@ import mibiscreen.data.settings.standard_names as names properties_contaminants = dict() + +############################################################################### +### MAHs + properties_contaminants[names.name_benzene]=dict( chemical_formula = 'c6h6', molecular_mass = 78., @@ -49,7 +53,7 @@ hydrogen_atoms = 10, # factor_stoichiometry = 42., # thresholds_for_intervention_NL = 70., - other_names = ["xylene", + other_names = ["xylene","xyleen", "c6h4ch3ch3"], standard_unit = names.unit_microgperl, ) @@ -84,41 +88,8 @@ hydrogen_atoms = 10, # factor_stoichiometry = 42., # thresholds_for_intervention_NL = 70., - other_names = ["o-xylene","o xylene","o_xylene","oxylene"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_indene]=dict( - chemical_formula = "c9h8", - molecular_mass = 116., - carbon_atoms = 9, - hydrogen_atoms = 8, - # factor_stoichiometry = 44., - # thresholds_for_intervention_NL = 70., - other_names = 
["indene","indeen","c9h8"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_indane]=dict( - chemical_formula = "c9h10", - molecular_mass = 118., - carbon_atoms = 9, - hydrogen_atoms = 10, - # factor_stoichiometry = 46., - # thresholds_for_intervention_NL = 70., - other_names = ["indane","c9h10"], - standard_unit = names.unit_microgperl, - ) - -properties_contaminants[names.name_naphthalene]=dict( - chemical_formula = "c10h8", - molecular_mass = 128., - carbon_atoms = 10, - hydrogen_atoms = 8, - # factor_stoichiometry = 48., - # thresholds_for_intervention_NL = 70., - other_names = ["naphthalene","naphthaleen","naphthaline", - "naphtaline","naphtalene","naphtaleen","c10h8"], + other_names = ["o-xylene","o xylene","o_xylene","oxylene", + "o-xyleen","o xyleen","o_xyleen","oxyleen"], standard_unit = names.unit_microgperl, ) @@ -334,7 +305,231 @@ '1,2,3,5 tetramethylbenzene','1,2,3,5tetramethylbenzene','isodurene'], standard_unit = names.unit_microgperl, ) +############################################################################### +### PAHs + +properties_contaminants[names.name_indene]=dict( + chemical_formula = "c9h8", + molecular_mass = 116., + carbon_atoms = 9, + hydrogen_atoms = 8, + # factor_stoichiometry = 44., + # thresholds_for_intervention_NL = 70., + other_names = ["indene","indeen","c9h8"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_indane]=dict( + chemical_formula = "c9h10", + molecular_mass = 118., + carbon_atoms = 9, + hydrogen_atoms = 10, + # factor_stoichiometry = 46., + # thresholds_for_intervention_NL = 70., + other_names = ["indane","c9h10"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ["naphthalene","naphthaleen","naphthaline", + 
"naphtaline","naphtalene","naphtaleen","c10h8"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene_VOC]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ['naphthalene_voc',"naphthalene_voc","naphthalenevoc","naphthalene-voc","naphthalene voc", + "naphtalene_voc","naphtalenevoc","naphtalene-voc","naphtalenevoc", + "naphthaline_voc","naphthalinevoc","naphthaline-voc","naphthaline voc", + "naphtaline_voc","naphtalinevoc","naphtaline-voc","naphtaline voc", + "naphthaleen_voc","naphthaleenvoc","naphthaleen-voc","naphthaleen voc", + "naphtaleen_voc","naphtaleenvoc","naphtaleen-voc","naphtaleen voc", + "c10h8_voc"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene_PAH]=dict( + chemical_formula = "c10h8", + molecular_mass = 128., + carbon_atoms = 10, + hydrogen_atoms = 8, + # factor_stoichiometry = 48., + # thresholds_for_intervention_NL = 70., + other_names = ["naphthalene_pah","naphthalenepah","naphthalene-pah","naphthalene pah", + "naphtalene_pah","naphtalenepah","naphtalene-pah","naphtalenepah", + "naphthaline_pah","naphthalinepah","naphthaline-pah","naphthaline pah", + "naphtaline_pah","naphtalinepah","naphtaline-pah","naphtaline pah", + "naphthaleen_pah","naphthaleenpah","naphthaleen-pah","naphthaleen pah", + "naphtaleen_pah","naphtaleenpah","naphtaleen-pah","naphtaleen pah", + "c10h8_pah"], + standard_unit = names.unit_microgperl, + ) + +properties_contaminants[names.name_naphthalene] = dict( + chemical_formula = "C10H8", + molecular_mass = 128.0, + carbon_atoms = 10, + hydrogen_atoms = 8, + other_names = ["naphthalene","naphthaleen","naphthaline", + "naphtaline","naphtalene","naphtaleen","C10H8"], + standard_unit = names.unit_microgperl, +) + +# New ones: + +properties_contaminants[names.name_acenaphthylene] = dict( + 
chemical_formula = "C12H8", + molecular_mass = 152.192, + carbon_atoms = 12, + hydrogen_atoms = 8, + other_names = ["acenaphthylene","cyclopenta[de]naphthalene","C12H8"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_acenaphthene] = dict( + chemical_formula = "C12H10", + molecular_mass = 154.212, # approximate + carbon_atoms = 12, + hydrogen_atoms = 10, + other_names = ["acenaphthene","acenaphthene (C12H10)","C12H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_fluorene] = dict( + chemical_formula = "C13H10", + molecular_mass = 166.22, + carbon_atoms = 13, + hydrogen_atoms = 10, + other_names = ["fluorene","C13H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_phenanthrene] = dict( + chemical_formula = "C14H10", + molecular_mass = 178.226, + carbon_atoms = 14, + hydrogen_atoms = 10, + other_names = ["phenanthrene","C14H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_anthracene] = dict( + chemical_formula = "C14H10", + molecular_mass = 178.226, + carbon_atoms = 14, + hydrogen_atoms = 10, + other_names = ["anthracene","C14H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_fluoranthene] = dict( + chemical_formula = "C16H10", + molecular_mass = 202.26, + carbon_atoms = 16, + hydrogen_atoms = 10, + other_names = ["fluoranthene","C16H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_pyrene] = dict( + chemical_formula = "C16H10", + molecular_mass = 202.26, + carbon_atoms = 16, + hydrogen_atoms = 10, + other_names = ["pyrene","C16H10"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_chrysene] = dict( + chemical_formula = "C18H12", + molecular_mass = 228.29, + carbon_atoms = 18, + hydrogen_atoms = 12, + other_names = ["chrysene","C18H12"], + standard_unit = names.unit_microgperl, +) + 
+properties_contaminants[names.name_benzo_a_anthracene] = dict( + chemical_formula = "C18H12", + molecular_mass = 228.29, + carbon_atoms = 18, + hydrogen_atoms = 12, + other_names = ["benzo[a]anthracene","benzo(a)anthracene","benzoaanthracene", + "benzo-a-anthracene","C18H12","BaA"], + standard_unit = names.unit_microgperl, +) +properties_contaminants[names.name_benzo_b_fluoranthene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[b]fluoranthene","benzo(b)fluoranthene", + "benzobfluoranthene","benzo-b-fluoranthene","BbF","C20H12"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_k_fluoranthene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[k]fluoranthene","benzo(k)fluoranthene" + "benzokfluoranthene","benzo-k-fluoranthene","BkF","C20H12"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_a_pyrene] = dict( + chemical_formula = "C20H12", + molecular_mass = 252.31, + carbon_atoms = 20, + hydrogen_atoms = 12, + other_names = ["benzo[a]pyrene","benzo(a)pyrene","benzoapyrene", + "benzo-a-pyrene","BaP","C20H12"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_dibenz_ah_anthracene] = dict( + chemical_formula = "C22H14", + molecular_mass = 278.33, + carbon_atoms = 22, + hydrogen_atoms = 14, + other_names = ["dibenz[a,h]anthracene","dibenz(a,h)anthracene","dibenzahanthracene","dibenz-a,h-anthracene", + "dibenz-a-h-anthracene","DBahA","C22H14"], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_benzo_ghi_perylene] = dict( + chemical_formula = "C22H12", + molecular_mass = 276.33, + carbon_atoms = 22, + hydrogen_atoms = 12, + other_names = ["benzo[ghi]perylene","benzo(ghi)perylene","benzoghiperylene","benzo-ghi-perylene", + "C22H12","BghiP"], + standard_unit = 
names.unit_microgperl, +) + +properties_contaminants[names.name_indeno_123cd_pyrene] = dict( + chemical_formula = "C22H12", + molecular_mass = 276.33, # from data: 276.33 :contentReference[oaicite:0]{index=0} + carbon_atoms = 22, + hydrogen_atoms = 12, + other_names = ["indeno[1,2,3-cd]pyrene","indeno(1,2,3‑cd)pyrene", + "indeno1,2,3‑cdpyrene","indeno-1,2,3‑cd-pyrene", + "C22H12"], + standard_unit = names.unit_microgperl, +) properties_contaminants[names.name_methylindene]=dict( chemical_formula = "c9h7ch3", @@ -593,12 +788,351 @@ standard_unit = names.unit_microgperl, ) +### Alkylphenols +properties_contaminants[names.name_phenol] = dict( + chemical_formula = "C6H6O", + molecular_mass = 94.11, + carbon_atoms = 6, + hydrogen_atoms = 6, + other_names = ['phenol', 'hydroxybenzene', 'benzenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_thymol] = dict( + chemical_formula = "C10H14O", + molecular_mass = 150.22, + carbon_atoms = 10, + hydrogen_atoms = 14, + other_names = ['thymol', '2_isopropyl_5_methylphenol', '5_methyl_2_isopropylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_cresol] = dict( + chemical_formula = None, + molecular_mass = None, + carbon_atoms = None, + hydrogen_atoms = None, + other_names = ['cresol', 'methylphenol', 'hydroxytoluene'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_m_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms = 8, + other_names = ['m-cresol','m cresol','mcresol','m_cresol', 'methylphenol', '3_methylphenol', '3-methylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_o_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms = 8, + other_names = ['o-cresol','o cresol','ocresol''o_cresol', '2_methylphenol', '2-methylphenol'], + standard_unit = 
names.unit_microgperl, +) + +properties_contaminants[names.name_p_cresol] = dict( + chemical_formula = "C7H8O", + molecular_mass = 108.14, + carbon_atoms = 7, + hydrogen_atoms = 8, + other_names = ['p-cresol','p cresol','pcresol','p_cresol', '4_methylphenol', '4-methylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_2_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = ['2_ethylphenol', '2-ethylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_3_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = ['3_ethylphenol', '3-ethylphenol'], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_23_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '23_dimethylphenol','23-dimethylphenol','23 dimethylphenol','23dimethylphenol', + '2,3_dimethylphenol','2,3-dimethylphenol','2,3 dimethylphenol','2,3dimethylphenol', + '23_dimethylphenol','23-dimethylphenol','23 dimethylphenol','23dimethylphenol', + '2,3_dimethylphenol','2,3-dimethylphenol','2,3 dimethylphenol','2,3dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_24_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '24_dimethylphenol','24-dimethylphenol','24 dimethylphenol','24dimethylphenol', + '2,4_dimethylphenol','2,4-dimethylphenol','2,4 dimethylphenol','2,4dimethylphenol', + '24_dimethylphenol','24-dimethylphenol','24 dimethylphenol','24dimethylphenol', + '2,4_dimethylphenol','2,4-dimethylphenol','2,4 dimethylphenol','2,4dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + 
+properties_contaminants[names.name_25_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '25_dimethylphenol','25-dimethylphenol','25 dimethylphenol','25dimethylphenol', + '2,5_dimethylphenol','2,5-dimethylphenol','2,5 dimethylphenol','2,5dimethylphenol', + '25_dimethylphenol','25-dimethylphenol','25 dimethylphenol','25dimethylphenol', + '2,5_dimethylphenol','2,5-dimethylphenol','2,5 dimethylphenol','2,5dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_26_dimethylphenol26] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '26_dimethylphenol','26-dimethylphenol','26 dimethylphenol','26dimethylphenol', + '2,6_dimethylphenol','2,6-dimethylphenol','2,6 dimethylphenol','2,6dimethylphenol', + '26_dimethylphenol','26-dimethylphenol','26 dimethylphenol','26dimethylphenol', + '2,6_dimethylphenol','2,6-dimethylphenol','2,6 dimethylphenol','2,6dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_34_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '34_dimethylphenol','34-dimethylphenol','34 dimethylphenol','34dimethylphenol', + '3,4_dimethylphenol','3,4-dimethylphenol','3,4 dimethylphenol','3,4dimethylphenol', + '34_dimethylphenol','34-dimethylphenol','34 dimethylphenol','34dimethylphenol', + '3,4_dimethylphenol','3,4-dimethylphenol','3,4 dimethylphenol','3,4dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_35_dimethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '35_dimethylphenol','35-dimethylphenol','35 dimethylphenol','35dimethylphenol', + 
'3,5_dimethylphenol','3,5-dimethylphenol','3,5 dimethylphenol','3,5dimethylphenol', + '35_dimethylphenol','35-dimethylphenol','35 dimethylphenol','35dimethylphenol', + '3,5_dimethylphenol','3,5-dimethylphenol','3,5 dimethylphenol','3,5dimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + + +properties_contaminants[names.name_235_trimethylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 138.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '235_trimethylphenol', '235-trimethylphenol', '235 trimethylphenol', '235trimethylphenol', + '2,3,5_trimethylphenol', '2,3,5-trimethylphenol', '2,3,5 trimethylphenol', '2,3,5trimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_345_trimethylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 138.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '345_trimethylphenol', '345-trimethylphenol', '345 trimethylphenol', '345trimethylphenol', + '3,4,5_trimethylphenol', '3,4,5-trimethylphenol', '3,4,5 trimethylphenol', '3,4,5trimethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_4_ethylphenol] = dict( + chemical_formula = "C8H10O", + molecular_mass = 122.16, + carbon_atoms = 8, + hydrogen_atoms = 10, + other_names = [ + '4_ethylphenol', '4-ethylphenol', '4 ethylphenol', '4ethylphenol', + 'p_ethylphenol', 'p-ethylphenol', 'p ethylphenol', 'pethylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_2_isopropylphenol] = dict( + chemical_formula = "C9H12O", + molecular_mass = 136.19, + carbon_atoms = 9, + hydrogen_atoms = 12, + other_names = [ + '2_isopropylphenol', '2-isopropylphenol', '2 isopropylphenol', '2isopropylphenol', + 'o_isopropylphenol', 'o-isopropylphenol', 'o isopropylphenol', 'oisopropylphenol' + ], + standard_unit = names.unit_microgperl, +) + +properties_contaminants[names.name_ptertbutylphenol] = dict( + 
chemical_formula = "C10H14O", + molecular_mass = 150.22, + carbon_atoms = 10, + hydrogen_atoms = 14, + other_names = [ + 'p_[tert]butylphenol', 'p-(tert)butylphenol', 'p tert butyl phenol', 'ptertbutylphenol', + 'para_tert_butyl_phenol', 'para-(tert)-butyl-phenol', 'para tert butyl phenol' + ], + standard_unit = names.unit_microgperl, +) + ############################################################################### ############################################################################### ############################################################################### + contaminants_analysis = dict() + +contaminants_analysis[names.name_PAH_total_16] = dict( + other_names = ['pah_total_16','pah_total 16','pah_total16','pah_total-16', + 'pah total_16','pah total 16','pah total16','pah total-16', + 'pahtotal_16','pahtotal 16','pahtotal16','pahtotal-16' + 'pah-total_16','pah-total 16','pah-total16','pah-total-16', + 'pah_16','pah 16','pah16','pah-16', + 'total_pah_16','total_pah 16','total_pah16','total_pah-16', + 'total pah_16','total pah 16','total pah16','total pah-16', + 'totalpah_16','totalpah 16','totalpah16','totalpah-16', + 'total-pah_16','total-pah 16','total-pah16','total-pah-16', + ], + standard_unit = names.unit_microgperl, + ) + +contaminants_analysis[names.name_PAH_total_10] = dict( + other_names = ['pah_total_10','pah_total 10','pah_total10','pah_total-10', + 'pah total_10','pah total 10','pah total10','pah total-10', + 'pahtotal_10','pahtotal 10','pahtotal10','pahtotal-10' + 'pah-total_10','pah-total 10','pah-total10','pah-total-10', + 'pah_10','pah 10','pah10','pah-10', + 'total_pah_10','total_pah 10','total_pah10','total_pah-10', + 'total pah_10','total pah 10','total pah10','total pah-10', + 'totalpah_10','totalpah 10','totalpah10','totalpah-10', + 'total-pah_10','total-pah 10','total-pah10','total-pah-10', + ], + standard_unit = names.unit_microgperl, + ) + +contaminants_analysis[names.name_fraction_C10_C12] = dict( + other_names = [ + 
'fraction_c10-c12','fraction c10-c12','fraction c10_c12', + 'c10-c12 fraction','c10_c12 fraction', + 'c10 to c12','c10_c12','c10–c12','c10–c12 fraction', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_fraction_C12_C22] = dict( + other_names = [ + 'fraction_c12-c22','fraction c12-c22','fraction c12_c22', + 'c12-c22 fraction','c12_c22 fraction', + 'c12 to c22','c12_c22','c12–c22','c12–c22 fraction', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_fraction_C22_C30] = dict( + other_names = [ + 'fraction_c22-c30','fraction c22-c30','fraction c22_c30', + 'c22-c30 fraction','c22_c30 fraction', + 'c22 to c30','c22_c30','c22–c30','c22–c30 fraction', + ], + standard_unit = names.unit_microgperl, +) + + +contaminants_analysis[names.name_fraction_C30_C40] = dict( + other_names = [ + 'fraction_c30-c40','fraction c30-c40','fraction c30_c40', + 'c30-c40 fraction','c30_c40 fraction', + 'c30 to c40','c30_c40','c30–c40','c30–c40 fraction', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_C10_C40] = dict( + other_names = [ + 'total_c10-c40','total c10-c40','total c10_c40', + 'c10-c40 total','c10_c40 total', + 'c10 to c40','c10_c40','c10–c40','c10–c40 total', + 'c10-c40','c10_c40', + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_c2_alkylphenols] = dict( + other_names = [ + 'c2-alkylphenols_total',"c2_alkylphenols_total","c2alkylphenols_total", "c2 alkylphenols_total", + "c2-alkylphenols-total","c2_alkylphenols-total","c2alkylphenols-total", "c2 alkylphenols-total", + "c2 alkylphenols total","c2_alkylphenols total","c2alkylphenols total", "c2 alkylphenols total", + "c2alkylphenolstotal","c2_alkylphenolstotal","c2alkylphenolstotal", "c2 alkylphenolstotal", + "total_c2_alkylphenols", "total-c2-alkylphenols", "total c2 alkylphenols", "totalc2alkylphenols", + "alkylphenols_c2_total", "alkylphenols-c2-total", "alkylphenols c2 total", 
"alkylphenolsc2total", + "c2_alkylphenols", "c2-alkylphenols", "c2 alkylphenols", "c2alkylphenols", + "alkylphenols_c2", "alkylphenols-c2", "alkylphenols c2", "alkylphenolsc2" + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_c3_alkylphenols] = dict( + other_names = [ + 'c3-alkylphenols_total',"c3_alkylphenols_total","c3alkylphenols_total", "c3 alkylphenols_total", + "c3-alkylphenols-total","c3_alkylphenols-total","c3alkylphenols-total", "c3 alkylphenols-total", + "c3 alkylphenols total","c3_alkylphenols total","c3alkylphenols total", "c3 alkylphenols total", + "c3alkylphenolstotal","c3_alkylphenolstotal","c3alkylphenolstotal", "c3 alkylphenolstotal", + "c3_alkylphenols_total", "c3-alkylphenols-total", "c3 alkylphenols total", "c3alkylphenolstotal", + "total_c3_alkylphenols", "total-c3-alkylphenols", "total c3 alkylphenols", "totalc3alkylphenols", + "alkylphenols_c3_total", "alkylphenols-c3-total", "alkylphenols c3 total", "alkylphenolsc3total", + "c3_alkylphenols", "c3-alkylphenols", "c3 alkylphenols", "c3alkylphenols", + "alkylphenols_c3", "alkylphenols-c3", "alkylphenols c3", "alkylphenolsc3" + ], + standard_unit = names.unit_microgperl, +) + +contaminants_analysis[names.name_total_c4_alkylphenols] = dict( + other_names = [ + 'c4-alkylphenols_total',"c4_alkylphenols_total","c4alkylphenols_total", "c4 alkylphenols_total", + "c4-alkylphenols-total","c4_alkylphenols-total","c4alkylphenols-total", "c4 alkylphenols-total", + "c4 alkylphenols total","c4_alkylphenols total","c4alkylphenols total", "c4 alkylphenols total", + "c4alkylphenolstotal","c4_alkylphenolstotal","c4alkylphenolstotal", "c4 alkylphenolstotal", + "c4_alkylphenols_total", "c4-alkylphenols-total", "c4 alkylphenols total", "c4alkylphenolstotal", + "total_c4_alkylphenols", "total-c4-alkylphenols", "total c4 alkylphenols", "totalc4alkylphenols", + "alkylphenols_c4_total", "alkylphenols-c4-total", "alkylphenols c4 total", "alkylphenolsc4total", + "c4_alkylphenols", 
"c4-alkylphenols", "c4 alkylphenols", "c4alkylphenols", + "alkylphenols_c4", "alkylphenols-c4", "alkylphenols c4", "alkylphenolsc4" + ], + standard_unit = names.unit_microgperl, +) + contaminants_analysis[names.name_total_contaminants] = dict( other_names = ["sum_contaminants","sum-contaminants","sum contaminants","sumcontaminants", "total_contaminants","total-contaminants","total contaminants","totalcontaminants", @@ -607,17 +1141,18 @@ standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_BTEX] = dict( - other_names = ["sum_BTEX","sum-BTEX","sum BTEX","sumBTEX", - "total_BTEX","total-BTEX","total BTEX","totalBTEX", - "concentration -BTEX","concentration-BTEX","concentration BTEX", - "concentrationBTEX"], + other_names = ["sum_btex","sum-btex","sum btex","sumbtex", + "total_btex","total-btex","total btex","totalbtex", + "btex_total","btex-total","btex total","btextotal", + "concentration -btex","concentration-btex","concentration btex", + "concentrationbtex"], standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_BTEXIIN] = dict( - other_names = ["sum_BTEXIIN","sum-BTEXIIN","sum BTEXIIN","sumBTEXIIN", - "total_BTEXIIN","total-BTEXIIN","total BTEXIIN","totalBTEXIIN", - "concentration -BTEXIIN","concentration-BTEXIIN","concentration BTEXIIN", - "concentrationBTEXIIN"], + other_names = ["sum_btexiin","sum-btexiin","sum btexiin","sumbtexiin", + "total_btexiin","total-btexiin","total btexiin","totalbtexiin", + "concentration -btexiin","concentration-btexiin","concentration btexiin", + "concentrationbtexiin"], standard_unit = names.unit_microgperl, ) contaminants_analysis[names.name_total_oxidators] = dict( @@ -647,7 +1182,7 @@ ) contaminants_analysis[names.name_NP_avail] = dict( - other_names = ["NP_avail"] + other_names = ["np_avail"] ) ### List with all quantities of particular data type in standard names: @@ -672,6 +1207,106 @@ names.name_xylene, names.name_indane, names.name_indene, - names.name_naphthalene], + 
names.name_naphthalene, + # names.name_naphthalene_VOC, + # names.name_naphthalene_PAC, + ], + MAH = [names.name_benzene, + names.name_toluene, + names.name_ethylbenzene, + names.name_pm_xylene, + names.name_o_xylene, + names.name_xylene, + names.name_styrene, + names.name_isopropylbenzene, + names.name_n_propylbenzene, + names.name_ethyltoluene, + names.name_2_ethyltoluene, + names.name_3_ethyltoluene, + names.name_4_ethyltoluene, + names.name_trimethylbenzene, + names.name_123_trimethylbenzene, + names.name_124_trimethylbenzene, + names.name_135_trimethylbenzene, + names.name_4_isopropyltouene, + names.name_diethylbenzene, + names.name_12_diethylbenzene, + names.name_13_diethylbenzene, + names.name_14_diethylbenzene, + names.name_tetramethylbenzene, + names.name_1234_tetramethylbenzene, + names.name_1245_tetramethylbenzene, + names.name_1235_tetramethylbenzene, + ], + PAH = [names.name_indane, + names.name_indene, + names.name_naphthalene, + names.name_naphthalene_VOC, + names.name_naphthalene_PAH, + names.name_acenaphthylene, + names.name_acenaphthene, + names.name_fluorene, + names.name_phenanthrene, + names.name_anthracene, + names.name_fluoranthene, + names.name_pyrene, + names.name_chrysene, + names.name_benzo_a_anthracene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, + names.name_benzo_a_pyrene, + names.name_dibenz_ah_anthracene, + names.name_benzo_ghi_perylene, + names.name_indeno_123cd_pyrene, + names.name_methylindene, + names.name_1_methylindene, + names.name_2_methylindene, + names.name_methylnaphthalene, + names.name_1_methylnaphthalene, + names.name_2_methylnaphthalene, + names.name_ethylnaphthalene, + names.name_1_ethylnaphthalene, + names.name_2_ethylnaphthalene, + names.name_dimethylnaphthalene, + names.name_12_dimethylnaphthalene, + names.name_13_dimethylnaphthalene, + names.name_14_dimethylnaphthalene, + names.name_15_dimethylnaphthalene, + names.name_16_dimethylnaphthalene, + names.name_17_dimethylnaphthalene, + 
names.name_18_dimethylnaphthalene, + names.name_23_dimethylnaphthalene, + names.name_26_dimethylnaphthalene, + names.name_27_dimethylnaphthalene, + ], + PAH_total_10 = [#Dutch RIVM defines environmental quality objectives for 10 PAHs + names.name_naphthalene, + names.name_anthracene, + names.name_benzo_a_anthracene, + names.name_benzo_a_pyrene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, + names.name_chrysene, + names.name_dibenz_ah_anthracene, + names.name_fluoranthene, + names.name_pyrene, + ], + PAH_total_16 = [names.name_naphthalene, #the 16 EPA priority PAHs + names.name_acenaphthylene, + names.name_acenaphthene, + names.name_fluorene, + names.name_phenanthrene, + names.name_anthracene, + names.name_fluoranthene, + names.name_pyrene, + names.name_benzo_a_anthracene, + names.name_chrysene, + names.name_benzo_b_fluoranthene, + names.name_benzo_k_fluoranthene, + names.name_benzo_a_pyrene, + names.name_indeno_123cd_pyrene, + names.name_dibenz_ah_anthracene, + names.name_benzo_ghi_perylene, + ], all_cont = list(properties_contaminants.keys()) ) diff --git a/mibiscreen/data/settings/environment.py b/mibiscreen/data/settings/environment.py index b33ef89..dbd85db 100644 --- a/mibiscreen/data/settings/environment.py +++ b/mibiscreen/data/settings/environment.py @@ -22,6 +22,11 @@ standard_unit = names.unit_less, ) +properties_geochemicals[names.name_temperature]=dict( + other_names = ["temperature","T","temp"], + standard_unit = names.unit_celsius, + ) + properties_geochemicals[names.name_EC]=dict( other_names = ["ec"], standard_unit = names.unit_microsimpercm, @@ -66,6 +71,19 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_nitrateN]=dict( + chemical_formula = 'N', + molecular_mass = 14.01, + other_names = ["nitraten","nitrate-n","nitrate n","nitrate_n", + "no3n","no3-n","no3 n","no3_n", + "no_3n","no_3-n","no_3 n","no_3_n", + "no 3n","no 3-n","no 3 n","no 3_n", + "no3-n","no3--n","no3- n","no3-_n", + 
"no_3-n","no_3--n","no_3- n","no_3-_n", + "no 3-n","no 3--n","no 3- n","no 3-_n"], + standard_unit = names.unit_mgNperl, + ) + properties_geochemicals[names.name_nitrite]=dict( chemical_formula = 'no2', molecular_mass = 46., @@ -74,6 +92,19 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_nitriteN]=dict( + chemical_formula = 'N', + molecular_mass = 14.01, + other_names = ["nitriten","nitrite n","nitrite-n","nitrite_n", + "no2n","no2 n","no2-n","no2_n", + "no_2n","no_2 n","no_2-n","no_2_n", + "no 2n","no 2 n","no 2-n","no 2_n", + "no2-n","no2- n","no2--n","no2-_n", + "no_2-n","no_2- n","no_2--n","no_2-_n" + "no 2-n","no 2- n","no 2--n","no 2-_n"], + standard_unit = names.unit_mgNperl, + ) + properties_geochemicals[names.name_sulfate]=dict( chemical_formula = "so42-", molecular_mass = 96.1, @@ -168,6 +199,15 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_phosphorus]=dict( + chemical_formula = "P", + other_names = ["phosphorus","P","TP","PT", + "phosphorustotal","phosphorus_total","phosphorus total","phosphorus-total", + "totalphosphorus","total_phosphorus","total phosphorus","total-phosphorus"], + standard_unit = names.unit_mgPperl, + ) + + properties_geochemicals[names.name_chloride]=dict( chemical_formula = "cl-", other_names = ['chloride','cl','cl-'], @@ -216,12 +256,23 @@ standard_unit = names.unit_mgperl, ) +properties_geochemicals[names.name_cyanide]=dict( + chemical_formula = 'cn-', + other_names = ['cyanides','cn-','total cyanides', + 'total cyanides','total-cyanides','totalcyanides','total_cyanides', + 'cyanides total','cyanides-total', 'cyanidestotal','cyanides_total', + ], + standard_unit = names.unit_microgperl, + ) + + ### List with all quantities of particular data type in standard names: environment = list(properties_geochemicals.keys()) environment_groups = dict( environmental_conditions = [names.name_redox, names.name_pH, + names.name_temperature, names.name_EC, names.name_pE, ], @@ -234,9 
+285,12 @@ names.name_manganese4, names.name_methane, names.name_nitrite, + names.name_nitriteN, + names.name_nitrateN, names.name_sulfide, names.name_ammonium, names.name_phosphate, + names.name_phosphorus, names.name_chloride, names.name_bromide, names.name_fluoride, @@ -245,6 +299,7 @@ names.name_potassium, names.name_calcium, names.name_acetate, + names.name_cyanide, names.name_DOC, names.name_NPOC, names.name_TOC, diff --git a/mibiscreen/data/settings/sample_settings.py b/mibiscreen/data/settings/sample_settings.py index c07318c..11e422c 100644 --- a/mibiscreen/data/settings/sample_settings.py +++ b/mibiscreen/data/settings/sample_settings.py @@ -57,5 +57,11 @@ standard_unit = names.unit_less, ) +properties_sample_settings[names.name_date]=dict( + other_names = ["date",'time'], + standard_unit = names.unit_less, +) + + ### List with all quantities of particular data type in standard names: sample_settings = list(properties_sample_settings.keys()) diff --git a/mibiscreen/data/settings/standard_names.py b/mibiscreen/data/settings/standard_names.py index 0243561..cee848d 100644 --- a/mibiscreen/data/settings/standard_names.py +++ b/mibiscreen/data/settings/standard_names.py @@ -14,7 +14,12 @@ unit_microsimpercm = 'uS/cm' unit_permil = 'permil' unit_count = 'nr' +unit_date = 'date' unit_less = '' +unit_mgNperl = "mg/l-N" +unit_mgPperl = "mg/l-P" +unit_celsius = 'C' +unit_date = 'date' ### Standard names for settings name_sample = "sample_nr" @@ -22,10 +27,12 @@ name_well_type = "well_type" name_sample_depth = "depth" name_aquifer = 'aquifer' +name_date = 'date' ### Standard names for environmental parameters name_redox = "redoxpot" name_pH = "pH" +name_temperature = 'temperature' name_EC = "EC" #name_Eh = "Eh" #Reduction potential name_pE = "pE" #Alternative mathematical formulation of redox potential/reduction potential @@ -37,17 +44,20 @@ name_TOC = "TOC" # Total Organic Carbon name_oxygen = 'oxygen' #o2 -name_nitrate = 'nitrate' #no3 +name_nitrate = 'nitrate' 
#no3- -- full nitrate ion +name_nitrateN = 'nitrate_N' #no3-N -- amount of nitrogen (N) within the nitrate ion. name_sulfate = 'sulfate' #"so4" name_iron2 = "iron2" #"fe_II" name_iron3 = "iron3" #"fe_II" name_manganese2 = 'manganese2' #"mn_II" name_manganese4 = 'manganese4' #"mn_II" name_methane = 'methane' #"ch4" -name_nitrite = 'nitrite' #no2 +name_nitrite = 'nitrite' #no2 -- full nitrite ion +name_nitriteN = 'nitrite_N' #no2- - amount of nitrogen (N) within the nitrite ion. name_sulfide = 'sulfide' #"s2min" name_ammonium = 'ammonium' #"nh4+" -name_phosphate = 'phosphate' # "po4" +name_phosphate = 'phosphate' # "po4" - orthophosphate ion, bioavailable form of phosphorus +name_phosphorus = 'phosphorus_total' #P - all phosphorus containing compounds name_chloride = 'chloride' name_bromide = 'bromide' name_fluoride = 'fluoride' @@ -56,19 +66,18 @@ name_potassium = 'potassium' name_calcium = 'calcium' name_acetate = 'acetate' +name_cyanide = 'cyanide_total' # sum of all cyanide species that can potentially release free cyanide ### Standard names for main contaminants +### MAHs - MONOCYCLIC AROMATIC HYDROCARBONS name_benzene = 'benzene' name_toluene = 'toluene' name_ethylbenzene = 'ethylbenzene' name_pm_xylene = 'pm_xylene' name_o_xylene = 'o_xylene' name_xylene = 'xylene' -name_indane = 'indane' -name_indene = 'indene' -name_naphthalene = 'naphthalene' -### Standard names for additional contaminants +### Standard names for additional MAH contaminants name_styrene = 'styrene' name_isopropylbenzene = 'isopropylbenzene' name_n_propylbenzene = 'n_propylbenzene' @@ -89,6 +98,33 @@ name_1234_tetramethylbenzene = '1234_tetramethylbenzene' name_1245_tetramethylbenzene = '1245_tetramethylbenzene' name_1235_tetramethylbenzene = '1235_tetramethylbenzene' + + + +### PAHs - POLYCYCLIC AROMATIC HYDROCARBONS +name_indane = 'indane' +name_indene = 'indene' +name_naphthalene = 'naphthalene' + +name_naphthalene_VOC = 'naphthalene_VOC' #measured via volatile organic carbon
+name_naphthalene_PAH = 'naphthalene_PAH' #measured via polyaromatic hydrocarbons +name_acenaphthylene = 'acenaphthylene' +name_acenaphthene = 'acenaphthene' +name_fluorene = 'fluorene' +name_phenanthrene = 'phenanthrene' +name_anthracene = 'anthracene' +name_fluoranthene = 'fluoranthene' +name_pyrene = 'pyrene' +name_chrysene = 'chrysene' +name_benzo_a_anthracene = 'benzo[a]anthracene' +name_benzo_b_fluoranthene = 'benzo[b]fluoranthene' +name_benzo_k_fluoranthene = 'benzo[k]fluoranthene' +name_benzo_a_pyrene = 'benzo[a]pyrene' +name_dibenz_ah_anthracene = 'dibenz[a,h]anthracene' +name_benzo_ghi_perylene = 'benzo[ghi]perylene' +name_indeno_123cd_pyrene = 'indeno[1,2,3-cd]pyrene' + +### Standard names for additional PAH contaminants name_methylindene = 'methylindene' name_1_methylindene = '1_methylindene' name_2_methylindene = '2_methylindene' @@ -110,8 +146,28 @@ name_26_dimethylnaphthalene = '26_dimethylnaphthalene' name_27_dimethylnaphthalene = '27_dimethylnaphthalene' -### Standard names for a selection of metabolites +### Standard names for alkylphenols name_phenol = "phenol" +name_thymol = "thymol" +name_cresol = "cresol" +name_m_cresol = "m_cresol" +name_o_cresol = "o_cresol" +name_p_cresol = "p_cresol" +name_2_ethylphenol = "2_ethylphenol" +name_3_ethylphenol = "3_ethylphenol" +name_23_dimethylphenol = "23_dimethylphenol" +name_24_dimethylphenol = "24_dimethylphenol" +name_25_dimethylphenol = "25_dimethylphenol" +name_26_dimethylphenol26 = "26_dimethylphenol" +name_34_dimethylphenol = "34_dimethylphenol" +name_35_dimethylphenol = "35_dimethylphenol" +name_235_trimethylphenol = "235_trimethylphenol" +name_345_trimethylphenol = "345_trimethylphenol" +name_4_ethylphenol = "4_ethylphenol" +name_2_isopropylphenol = "2_isopropylphenol" +name_ptertbutylphenol = "p_[tert]butylphenol" + +### Standard names for a selection of metabolites name_cinnamic_acid = "cinnamic_acid" name_benzoic_acid = "benzoic_acid" name_dimethyl_benzoic_acid = 'dimethyl_benzoic_acid' @@ 
-124,7 +180,7 @@ name_benzylsuccinic_acid = "benzylsuccinic_acid" name_3o_toluoyl_propionic_acid = "3o_toluoyl_propionic_acid" -### Standard names for metabolite related quantities +### Standard names for summed up quantities #name_total_contaminants = "total_contaminants" name_total_contaminants = "concentration_contaminants" name_total_BTEX = "concentration_BTEX" @@ -133,6 +189,17 @@ name_total_BTEX_count = "count_BTEX" name_total_BTEXIIN_count = "count_BTEXIIN" +name_PAH_total_10 = 'PAH_total_10' +name_PAH_total_16 = 'PAH_total_16' +name_fraction_C10_C12 = 'fraction_C10-C12' +name_fraction_C12_C22 = 'fraction_C12-C22' +name_fraction_C22_C30 = 'fraction_C22-C30' +name_fraction_C30_C40 = 'fraction_C30-C40' +name_total_C10_C40 = 'total_C10-C40' + +name_total_c2_alkylphenols = "c2_alkylphenols_total" +name_total_c3_alkylphenols = "c3_alkylphenols_total" +name_total_c4_alkylphenols = "c4_alkylphenols_total" ### Standard names for metabolite related quantities name_metabolites_conc = "metabolites_concentration" diff --git a/mibiscreen/data/settings/unit_settings.py b/mibiscreen/data/settings/unit_settings.py index ac49eed..22d8ddc 100644 --- a/mibiscreen/data/settings/unit_settings.py +++ b/mibiscreen/data/settings/unit_settings.py @@ -14,6 +14,14 @@ other_names = ["mg/l",'ppm'], ) +properties_units[names.unit_mgNperl]=dict( + other_names = ["mg/l", 'ppm', "mgn/l",'ppm n','ppmn','ppm-n','ppm_n',"mg/l as n","ppm as n"], + ) + +properties_units[names.unit_mgPperl]=dict( + other_names = ["mg/l", 'ppm', "mgp/l",'ppm p','ppmp','ppm-p','ppm_p',"mg/l as p","ppm as p"], + ) + properties_units[names.unit_microgperl]=dict( other_names = ["ug/l","µg/l","\u00B5g/l","\u03BCg/l","micro g/l",r"$\mu$ g/l",], ) @@ -22,6 +30,12 @@ other_names = ["mV","mv"], ) +properties_units[names.unit_celsius]=dict( + other_names = ["C","c","Celsius","celsius", + "\u00B0C","\u00B0c","\u00B0Celsius","\u00B0celsius", + "\u00B0 C","\u00B0 c","\u00B0 Celsius","\u00B0 celsius"], + ) + 
properties_units[names.unit_meter]=dict( other_names = ['m',"meter"], ) @@ -39,13 +53,15 @@ other_names =['nr','number','count'], ) +# properties_units[names.unit_date]=dict( +# other_names = ['date','time','hr'], +# ) + properties_units[names.unit_less]=dict( other_names = ['',' ',' ','-',np.nan], ) - all_units = [] for key in properties_units.keys(): if key != names.unit_less: all_units = all_units + properties_units[key]['other_names'] - diff --git a/tests/test_concentrations.py b/tests/test_concentrations.py index 92564fe..085bdfb 100644 --- a/tests/test_concentrations.py +++ b/tests/test_concentrations.py @@ -97,7 +97,7 @@ def test_total_contaminant_concentration_01(self): Correct calculation of total amount of contaminants (total concentration). """ - tot_conc_test = 27046.0 + tot_conc_test = 27046.5 tot_conc = np.sum(total_contaminant_concentration(self.data)) assert (tot_conc - tot_conc_test)<1e-5 diff --git a/tests/test_data.py b/tests/test_data.py index a0cee77..b735441 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from mibiscreen.data.check_data import _check_duplicates_in_list # Replace with the actual module name from mibiscreen.data.check_data import _generate_dict_other_names from mibiscreen.data.check_data import check_columns from mibiscreen.data.check_data import check_data_frame @@ -13,6 +14,7 @@ from mibiscreen.data.check_data import standard_names from mibiscreen.data.check_data import standardize from mibiscreen.data.example_data.example_data import example_data +from mibiscreen.data.load_data import _check_duplicates_in_df from mibiscreen.data.load_data import load_csv from mibiscreen.data.load_data import load_excel from mibiscreen.data.set_data import compare_lists @@ -96,6 +98,44 @@ def test_load_excel_04(self,capsys): assert len(out)>0 + def test_load_excel_05(self,capsys): + """Testing routine load_excel(). 
+ + Testing handling of duplicate column names from real excel file. + """ + data_t3= load_excel("{}/example_duplicate.xlsx".format(path_data),sheet_name= 'contaminants')[0] + captured = capsys.readouterr().out + + assert data_t3.shape == (5,12) + assert "WARNING: Looks like duplicate column names detected." in captured + assert " - 'naphthalene.1'" in captured + + def test_check_duplicates_in_df_01(self,capsys): + """Testing routine _check_duplicates_in_df(). + + Testing Warning in case of duplicates in dataframe. + """ + # Create a DataFrame with duplicate column names (simulated by manually renaming) + data = pd.DataFrame({ + 'Name': [1, 2], + 'Age': [30, 40], + 'Name.1': [3, 4], # Simulate pandas auto-renamed column + 'Age.1': [50, 60] + }) + + # Call the function + _check_duplicates_in_df(data) + + # Capture printed output + captured = capsys.readouterr().out + + # Check the warning is in the output + assert "WARNING: Looks like duplicate column names detected." in captured + # Check that the renamed columns are listed + assert " - 'Name.1'" in captured + assert " - 'Age.1'" in captured + # Check verbose messages appear + assert "Consider renaming them." in captured class TestExampleData: """Class for testing example data of data module of mibiscreen.""" @@ -368,6 +408,9 @@ class TestCheckDataColumns: data4check = pd.DataFrame([units,s01],columns = columns_mod) + columns_dup = ["sample","well","Depth",'pH', 'redox' , 'Sulfate', 'CH4','ironII','c6h6', 'benzene'] + data_dup = pd.DataFrame([units,s01],columns = columns_dup) + def test_check_columns_01(self): """Testing check_column() on complete example data. @@ -411,6 +454,54 @@ def test_check_columns_04(self,capsys): assert len(out)>0 + def test_check_columns_05(self,capsys): + """Testing routine check_column() on check and standardization of column names. + + Testing Warning if duplicate column names in identified standard names. 
+ """ + check_columns(self.data_dup) + captured = capsys.readouterr().out + + # Check the warning is in the output + assert "WARNING: Duplicates found in list of standard names:" in captured + assert " - 'c6h6' (index 8)" in captured + assert " - 'benzene' (index 9)" in captured + assert "Remove or rename duplicate quantities in data." in captured + + def test_no_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are no identical strings in a list. + """ + data = ['a', 'b', 'c', 'd'] + assert _check_duplicates_in_list(data) == {} + + def test_with_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are identical strings in a list. + """ + data = ['a', 'b', 'a', 'c', 'b'] + expected = {'a': [0, 2], 'b': [1, 4]} + assert _check_duplicates_in_list(data) == expected + + def test_all_duplicates(self): + """Testing routine _check_duplicates_in_list(). + + Checking if there are only identical strings in a list. + """ + data = ['x', 'x', 'x', 'x'] + expected = {'x': [0, 1, 2, 3]} + assert _check_duplicates_in_list(data) == expected + + def test_empty_list(self): + """Testing routine _check_duplicates_in_list(). + + Checking if it works for an empty list. + """ + data = [] + assert _check_duplicates_in_list(data) == {} + class TestCheckDataUnits: """Class for testing data module of mibiscreen.""" @@ -728,6 +819,8 @@ def test_generate_dict_other_names_02(self): assert set(other_names.keys()) == set(self.other_names_1+self.other_names_2) + + class TestDataCompareLists: """Class for testing data module of mibiscreen."""