From 3bc12feee6191e23efc15c9fb2a929da36e033af Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 2 Apr 2024 17:29:09 -0400 Subject: [PATCH 01/21] draft npi conversion scripts --- code/adni_npi_to_mbi.py | 83 ++++++++++++++++++++++++++++++++++++++++ code/cimaq_npi_to_mbi.py | 43 +++++++++++++++++++++ code/util.py | 12 ++++++ 3 files changed, 138 insertions(+) create mode 100644 code/adni_npi_to_mbi.py create mode 100644 code/cimaq_npi_to_mbi.py create mode 100644 code/util.py diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py new file mode 100644 index 0000000..4825a27 --- /dev/null +++ b/code/adni_npi_to_mbi.py @@ -0,0 +1,83 @@ +import pandas as pd +import numpy as np +import util + +from pathlib import Path + + +def adni_npi_to_mbi(df): + df["decreased_motivation"] = df["NPIG"] + df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] + df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] + df["social_inappropriateness"] = df["NPIH"] + df["abnormal_perception"] = df["NPIA"] + df["NPIB"] + return df + + +def select_columns(df): + columns = [ + "RID", + "EXAMDATE", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +# set paths +adnimerge_p = Path( + "/home/neuromod/wrangling-phenotype/data/adni/ADNIMERGE_22Aug2023.csv" +) +adni_npi_p = Path("/home/neuromod/wrangling-phenotype/data/adni/NPI_22Aug2023.csv") +qc_pheno_p = Path("/home/neuromod/wrangling-phenotype/outputs/passed_qc_master.tsv") +output_p = Path("/home/neuromod/wrangling-phenotype/mbi.tsv") + +# load data +adnimerge_df = pd.read_csv(adnimerge_p, low_memory=False) +npi_df = pd.read_csv(adni_npi_p) +qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + +# convert NPI to MBI and calculate total score +mbi_df = adni_npi_to_mbi(npi_df) +mbi_df = util.calculate_mbi_score(mbi_df) +mbi_df = select_columns(mbi_df) + +# 
Do some formatting +mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) + +# In qc_pheno_df grab just the ID part from participant_id, so it matches mbi_df +qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "adni"].copy() +qc_df_filtered["RID"] = ( + qc_df_filtered["participant_id"].str.split("S").str[-1].astype(int) +) + +# Replace some rougue dates first +mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") +mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") +mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") + +qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) # format="%m/%Y" +mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) + +# Ensure ordered by ses +qc_df_filtered = qc_df_filtered.sort_values(by=["ses"]) +mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing +mbi_df = mbi_df.sort_values(by=["ses"]) + +merged_df = pd.merge_asof( + qc_df_filtered, + mbi_df, + by="RID", + on="ses", + direction="nearest", # Finds the closest match, whether before or after + tolerance=pd.Timedelta(days=183), # Approximately 6 months +) + +# merged_df.to_csv(output_p, sep="\t", index=False) +mbi_df.to_csv(output_p, sep="\t", index=False) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py new file mode 100644 index 0000000..6b10816 --- /dev/null +++ b/code/cimaq_npi_to_mbi.py @@ -0,0 +1,43 @@ +import pandas as pd +import numpy as np +from pathlib import Path + + +def map_values(df): + # Map scores to numerical values + mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} + + columns_to_map = [ + "22901_apathie", + "22901_depression_dysphorie", + "22901_anxiete", + "22901_euphorie", + "22901_agitation_aggressivite", + "22901_irritabilite", + "22901_comp_moteur_aberrant", + "22901_impulsivite", + "22901_idees_delirantes", + "22901_hallucinations", + ] + + for column in columns_to_map: + df[column] = df[column].map(mapping) + return df + + +def cimaq_npi_to_mbi(df): + # 
Calculate MBI domains + df["decreased_motivation"] = df["22901_apathie"] + df["emotional_dysregulation"] = ( + df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] + ) + df["impulse_dyscontrol"] = ( + df["22901_agitation_aggressivite"] + + df["22901_irritabilite"] + + df["22901_comp_moteur_aberrant"] + ) + df["social_inappropriateness"] = df["22901_impulsivite"] + df["abnormal_perception"] = ( + df["22901_idees_delirantes"] + df["22901_hallucinations"] + ) + return df diff --git a/code/util.py b/code/util.py new file mode 100644 index 0000000..f124183 --- /dev/null +++ b/code/util.py @@ -0,0 +1,12 @@ +def calculate_mbi_score(df): + mbi_domains = [ + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + ] + + df["mbi_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) + return df From 2f54a54ca219495fbdaab51f153bcdd060001318 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 2 Apr 2024 18:24:11 -0400 Subject: [PATCH 02/21] next draft --- code/adni_npi_to_mbi.py | 20 ++++++----- code/cimaq_npi_to_mbi.py | 72 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 10 deletions(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 4825a27..819fe38 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -34,13 +34,13 @@ def select_columns(df): adnimerge_p = Path( "/home/neuromod/wrangling-phenotype/data/adni/ADNIMERGE_22Aug2023.csv" ) -adni_npi_p = Path("/home/neuromod/wrangling-phenotype/data/adni/NPI_22Aug2023.csv") +npi_p = Path("/home/neuromod/wrangling-phenotype/data/adni/NPI_22Aug2023.csv") qc_pheno_p = Path("/home/neuromod/wrangling-phenotype/outputs/passed_qc_master.tsv") -output_p = Path("/home/neuromod/wrangling-phenotype/mbi.tsv") +output_p = Path("/home/neuromod/wrangling-phenotype/test.tsv") # load data adnimerge_df = pd.read_csv(adnimerge_p, 
low_memory=False) -npi_df = pd.read_csv(adni_npi_p) +npi_df = pd.read_csv(npi_p) qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") # convert NPI to MBI and calculate total score @@ -48,7 +48,7 @@ def select_columns(df): mbi_df = util.calculate_mbi_score(mbi_df) mbi_df = select_columns(mbi_df) -# Do some formatting +# Rename date field mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) # In qc_pheno_df grab just the ID part from participant_id, so it matches mbi_df @@ -57,12 +57,13 @@ def select_columns(df): qc_df_filtered["participant_id"].str.split("S").str[-1].astype(int) ) -# Replace some rougue dates first +# Replace some rougue dates mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") -qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) # format="%m/%Y" +# Convert sessions to datetime +qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) # Ensure ordered by ses @@ -70,14 +71,15 @@ def select_columns(df): mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing mbi_df = mbi_df.sort_values(by=["ses"]) +# Merge to get nearest mbi result within 6 months merged_df = pd.merge_asof( qc_df_filtered, mbi_df, by="RID", on="ses", - direction="nearest", # Finds the closest match, whether before or after + direction="nearest", tolerance=pd.Timedelta(days=183), # Approximately 6 months ) -# merged_df.to_csv(output_p, sep="\t", index=False) -mbi_df.to_csv(output_p, sep="\t", index=False) +# TO DO: how best to output? 
Now have adni filtered from qc_pheno +merged_df.to_csv(output_p, sep="\t", index=False) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index 6b10816..1305600 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import util from pathlib import Path @@ -26,7 +27,6 @@ def map_values(df): def cimaq_npi_to_mbi(df): - # Calculate MBI domains df["decreased_motivation"] = df["22901_apathie"] df["emotional_dysregulation"] = ( df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] @@ -41,3 +41,73 @@ def cimaq_npi_to_mbi(df): df["22901_idees_delirantes"] + df["22901_hallucinations"] ) return df + + +def select_columns(df): + columns = [ + "pscid", + "no_visite", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +# set paths +npi_p = Path( + "/home/neuromod/wrangling-phenotype/data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" +) +qc_pheno_p = Path("/home/neuromod/wrangling-phenotype/outputs/passed_qc_master.tsv") +output_p = Path("/home/neuromod/wrangling-phenotype/test.tsv") + +# load data +npi_df = pd.read_csv(npi_p, sep="\t") +qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + +# convert NPI to MBI and calculate total score +npi_df = map_values(npi_df) +mbi_df = cimaq_npi_to_mbi(npi_df) +mbi_df = util.calculate_mbi_score(mbi_df) +mbi_df = select_columns(mbi_df) + +# Rename columns in mbi_df so they match +mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) +mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) + +# Filter to only cimaq rows +qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "cimaq"].copy() + +# Format id (must be numeric for merging) +mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) +qc_df_filtered["participant_id"] = 
qc_df_filtered["participant_id"].astype(int) + +# Strip the 'V' from ses and convert to integer +mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) +qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("V").astype(int) + +# Ensure ordered by ses +qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) +mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + +# Merge to get nearest mbi result within 6 months +merged_df = pd.merge_asof( + qc_df_filtered, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(6), +) + +# Handle session columns +merged_df.drop(columns=["ses_y"], inplace=True) +merged_df.rename(columns={"ses_x": "ses"}, inplace=True) +merged_df.drop(columns=["ses_numeric"], inplace=True) + +merged_df.to_csv(output_p, sep="\t", index=False) From d064215e873e7a623dda6fe1a88b3f34092b2b7d Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 3 Apr 2024 17:01:59 -0400 Subject: [PATCH 03/21] next iteration --- code/adni_npi_to_mbi.py | 126 ++++++++++++++++++++---------------- code/cimaq_npi_to_mbi.py | 129 ++++++++++++++++++++++--------------- code/oasis_npi_to_mbi.py | 134 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 284 insertions(+), 105 deletions(-) create mode 100644 code/oasis_npi_to_mbi.py diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 819fe38..4a4379e 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import argparse import util from pathlib import Path @@ -30,56 +31,75 @@ def select_columns(df): return df -# set paths -adnimerge_p = Path( - "/home/neuromod/wrangling-phenotype/data/adni/ADNIMERGE_22Aug2023.csv" -) -npi_p = Path("/home/neuromod/wrangling-phenotype/data/adni/NPI_22Aug2023.csv") -qc_pheno_p = Path("/home/neuromod/wrangling-phenotype/outputs/passed_qc_master.tsv") -output_p = Path("/home/neuromod/wrangling-phenotype/test.tsv") - -# load data -adnimerge_df = 
pd.read_csv(adnimerge_p, low_memory=False) -npi_df = pd.read_csv(npi_p) -qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") - -# convert NPI to MBI and calculate total score -mbi_df = adni_npi_to_mbi(npi_df) -mbi_df = util.calculate_mbi_score(mbi_df) -mbi_df = select_columns(mbi_df) - -# Rename date field -mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) - -# In qc_pheno_df grab just the ID part from participant_id, so it matches mbi_df -qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "adni"].copy() -qc_df_filtered["RID"] = ( - qc_df_filtered["participant_id"].str.split("S").str[-1].astype(int) -) - -# Replace some rougue dates -mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") -mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") -mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") - -# Convert sessions to datetime -qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) -mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) - -# Ensure ordered by ses -qc_df_filtered = qc_df_filtered.sort_values(by=["ses"]) -mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing -mbi_df = mbi_df.sort_values(by=["ses"]) - -# Merge to get nearest mbi result within 6 months -merged_df = pd.merge_asof( - qc_df_filtered, - mbi_df, - by="RID", - on="ses", - direction="nearest", - tolerance=pd.Timedelta(days=183), # Approximately 6 months -) - -# TO DO: how best to output? 
Now have adni filtered from qc_pheno -merged_df.to_csv(output_p, sep="\t", index=False) +def adni_merge_mbi_qc(qc_pheno_df, mbi_df): + # Filter qc to only adni rows + # Grab just the ID part from participant_id, so it matches mbi_df + qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "adni"].copy() + qc_df_filtered["RID"] = ( + qc_df_filtered["participant_id"].str.split("S").str[-1].astype(int) + ) + + # Rename date field so it matches + mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) + + # Replace some rougue dates + mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") + mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") + mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") + + # Convert sessions to datetime + qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) + mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) + + # Ensure ordered by session + qc_df_filtered = qc_df_filtered.sort_values(by=["ses"]) + mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing + mbi_df = mbi_df.sort_values(by=["ses"]) + + # Merge to get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_df_filtered, + mbi_df, + by="RID", + on="ses", + direction="nearest", + tolerance=pd.Timedelta(days=183), # Approximately 6 months + ) + + return merged_df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert NPI to MBI, merge with QC and pheno data for ADNI" + ) + parser.add_argument("rootpath", type=Path, help="Root path") + + args = parser.parse_args() + root_p = args.rootpath + + # Set paths + npi_p = root_p / "data/adni/NPI_22Aug2023.csv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + output_p = root_p / "outputs/final_adni.tsv" + + # load data + npi_df = pd.read_csv(npi_p) + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + + # convert NPI to MBI and calculate total score + mbi_df = adni_npi_to_mbi(npi_df) + mbi_df = util.calculate_mbi_score(mbi_df) + 
mbi_df = select_columns(mbi_df) + + merged_df = adni_merge_mbi_qc(qc_pheno_df, mbi_df) + + # Select scans. For controls, we take the first available scan. For MCI and ADD, take the first with an MBI score + final_adni = ( + merged_df.groupby(["participant_id"], as_index=False) + .apply(util.select_row) + .reset_index(drop=True) + ) + + # Save the final DataFrame + final_adni.to_csv(output_p, sep="\t", index=False) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index 1305600..6c73869 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -1,10 +1,14 @@ import pandas as pd import numpy as np +import argparse import util from pathlib import Path def map_values(df): + # Drop rows with some data unavailable + df = df[df["22901_score"] != "donnée_non_disponible"].copy() + # Map scores to numerical values mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} @@ -59,55 +63,76 @@ def select_columns(df): return df -# set paths -npi_p = Path( - "/home/neuromod/wrangling-phenotype/data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" -) -qc_pheno_p = Path("/home/neuromod/wrangling-phenotype/outputs/passed_qc_master.tsv") -output_p = Path("/home/neuromod/wrangling-phenotype/test.tsv") - -# load data -npi_df = pd.read_csv(npi_p, sep="\t") -qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") - -# convert NPI to MBI and calculate total score -npi_df = map_values(npi_df) -mbi_df = cimaq_npi_to_mbi(npi_df) -mbi_df = util.calculate_mbi_score(mbi_df) -mbi_df = select_columns(mbi_df) - -# Rename columns in mbi_df so they match -mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) -mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) - -# Filter to only cimaq rows -qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "cimaq"].copy() - -# Format id (must be numeric for merging) -mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) -qc_df_filtered["participant_id"] = 
qc_df_filtered["participant_id"].astype(int) - -# Strip the 'V' from ses and convert to integer -mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) -qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("V").astype(int) - -# Ensure ordered by ses -qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) -mbi_df = mbi_df.sort_values(by=["ses_numeric"]) - -# Merge to get nearest mbi result within 6 months -merged_df = pd.merge_asof( - qc_df_filtered, - mbi_df, - by="participant_id", - on="ses_numeric", - direction="nearest", - tolerance=(6), -) - -# Handle session columns -merged_df.drop(columns=["ses_y"], inplace=True) -merged_df.rename(columns={"ses_x": "ses"}, inplace=True) -merged_df.drop(columns=["ses_numeric"], inplace=True) - -merged_df.to_csv(output_p, sep="\t", index=False) +def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): + # Filter to only cimaq rows + qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "cimaq"].copy() + + # Rename columns in mbi_df so they match + mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) + mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) + + # Format id + mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) + qc_df_filtered["participant_id"] = qc_df_filtered["participant_id"].astype(int) + + # Strip the 'V' from ses and convert to integer + mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) + qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("V").astype(int) + + # Ensure ordered by session + qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) + mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + + # Merge to get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_df_filtered, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(6), + ) + + # Handle session columns + merged_df.drop(columns=["ses_y"], inplace=True) + merged_df.rename(columns={"ses_x": "ses"}, 
inplace=True) + merged_df.drop(columns=["ses_numeric"], inplace=True) + + return merged_df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" + ) + parser.add_argument("rootpath", type=Path, help="Root path") + + args = parser.parse_args() + root_p = args.rootpath + + # Set paths + npi_p = root_p / "data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + output_p = root_p / "outputs/final_cimaq.tsv" + + # Load CSVs + npi_df = pd.read_csv(npi_p, sep="\t") + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + + # Convert NPI to MBI and calculate total score + npi_df = map_values(npi_df) + mbi_df = cimaq_npi_to_mbi(npi_df) + mbi_df = util.calculate_mbi_score(mbi_df) + mbi_df = select_columns(mbi_df) + + merged_df = cimaq_merge_mbi_qc(qc_pheno_df, mbi_df) + + # Select scans. For controls, we take the first available scan. For MCI and ADD, take the first with an MBI score + final_cimaq = ( + merged_df.groupby(["participant_id"], as_index=False) + .apply(util.select_row) + .reset_index(drop=True) + ) + + # Save the final DataFrame + final_cimaq.to_csv(output_p, sep="\t", index=False) diff --git a/code/oasis_npi_to_mbi.py b/code/oasis_npi_to_mbi.py new file mode 100644 index 0000000..324c7c4 --- /dev/null +++ b/code/oasis_npi_to_mbi.py @@ -0,0 +1,134 @@ +import pandas as pd +import numpy as np +import argparse +import util + +from pathlib import Path + + +def oasis_npi_to_mbi(df): + df["decreased_motivation"] = df["APA"] + df["emotional_dysregulation"] = df["DEPD"] + df["ANX"] + df["ELAT"] + df["impulse_dyscontrol"] = df["AGIT"] + df["IRR"] + df["MOT"] + df["social_inappropriateness"] = df["DISN"] + df["abnormal_perception"] = df["DEL"] + df["HALL"] + return df + + +def select_columns(df): + columns = [ + "OASISID", + "days_to_visit", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + 
"social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +def oasis_merge_mbi_qc(qc_pheno_df, mbi_df): + # Filter qc to only cimaq rows + qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "oasis3"].copy() + + # Rename columns in mbi_df so they match + mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) + mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) + + # Convert ses to integer and strip the d where necessary + mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) + qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("d").astype(int) + + # Ensure ordered by session + qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) + mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + + # Merge to get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_df_filtered, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(183), + ) + + # Handle session columns + merged_df.drop(columns=["ses_y"], inplace=True) + merged_df.rename(columns={"ses_x": "ses"}, inplace=True) + merged_df.drop(columns=["ses_numeric"], inplace=True) + + return merged_df + + +def first_session_controls(merged_df): + # Filter for controls + controls_df = merged_df[merged_df["diagnosis"] == "CON"] + + # Identify the first session for each participant + first_sessions = controls_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original controls_df + # This will filter controls_df to only include rows that match the first session for each participant + first_session_controls = pd.merge( + controls_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_controls + + +def first_session_mci_add(merged_df): + # Filter for participants with a diagnosis of "MCI" or "ADD" and a non-empty mbi_status + mci_add_df = merged_df[ + 
(merged_df["diagnosis"].isin(["MCI", "ADD"])) & merged_df["mbi_status"].notna() + ] + + # Identify the first session for each MCI/ADD participant + first_sessions = mci_add_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original mci_add_df + # This will filter mci_add_df to only include rows that match the first session for each participant + first_session_mci_add = pd.merge( + mci_add_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_mci_add + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" + ) + parser.add_argument("rootpath", type=Path, help="Root path") + + args = parser.parse_args() + root_p = args.rootpath + + # Set paths + npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + output_p = root_p / "outputs/final_oasis.tsv" + + # Load CSVs + npi_df = pd.read_csv(npi_p) + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + + # convert NPI to MBI and calculate total score + mbi_df = oasis_npi_to_mbi(npi_df) + mbi_df = util.calculate_mbi_score(mbi_df) + mbi_df = select_columns(mbi_df) + + # Merge mbi data with qc_pheno data + merged_df = oasis_merge_mbi_qc(qc_pheno_df, mbi_df) + + # Select scans. Find the first available session for each participant. 
For MCI and ADD they must have an mbi score, for controls it does not matter + # This approach retains multiple runs from the same session + control_df = first_session_controls(merged_df) + mci_add_df = first_session_mci_add(merged_df) + + final_oasis = pd.concat([control_df, mci_add_df], ignore_index=True) + + # Output df + final_oasis.to_csv(output_p, sep="\t", index=False) From 8b17fe61e900132512b4b71b5d700cfd0b917833 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 3 Apr 2024 17:02:10 -0400 Subject: [PATCH 04/21] added util.py --- code/util.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/code/util.py b/code/util.py index f124183..896c492 100644 --- a/code/util.py +++ b/code/util.py @@ -1,3 +1,6 @@ +import numpy as np + + def calculate_mbi_score(df): mbi_domains = [ "decreased_motivation", @@ -7,6 +10,25 @@ def calculate_mbi_score(df): "abnormal_perception", ] - df["mbi_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) + # Calculate mbi_total_score across domains + df["mbi_total_score"] = df[mbi_domains].sum(axis=1, min_count=1) + + # Set mbi_total_score to NaN where all mbi_domain columns are NaN + df.loc[df[mbi_domains].isna().all(axis=1), "mbi_total_score"] = np.nan + + # Calculate mbi_status based on mbi_total_score + # Set mbi_status to NaN where mbi_total_score is NaN + df["mbi_status"] = np.where( + df["mbi_total_score"].isna(), np.nan, (df["mbi_total_score"] >= 1).astype(int) + ) + return df + + +def select_row(group): + if group["diagnosis"].iloc[0] == "CON": + # For 'CON', the first row + return group.head(1) + else: + # For 'ADD' and 'MCI', select the first row where 'mbi_status' has a value + return group[group["mbi_status"].notna()].head(1) From ba692db3d40e8b265898512a60ce8a3ddafab9de Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 3 Apr 2024 17:04:29 -0400 Subject: [PATCH 05/21] updated comments --- code/adni_npi_to_mbi.py | 2 +- 
code/cimaq_npi_to_mbi.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 4a4379e..cf11a24 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -101,5 +101,5 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): .reset_index(drop=True) ) - # Save the final DataFrame + # Output df final_adni.to_csv(output_p, sep="\t", index=False) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index 6c73869..571156e 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -134,5 +134,5 @@ def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): .reset_index(drop=True) ) - # Save the final DataFrame + # Output df final_cimaq.to_csv(output_p, sep="\t", index=False) From ab0ecd41a0c2e9d8f354e048c488600a36243921 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 8 Apr 2024 12:50:48 -0400 Subject: [PATCH 06/21] updated comment --- code/adni_npi_to_mbi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index cf11a24..8d26a25 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -63,7 +63,7 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): by="RID", on="ses", direction="nearest", - tolerance=pd.Timedelta(days=183), # Approximately 6 months + tolerance=pd.Timedelta(days=183), ) return merged_df From 17fd48995c4f20687fd357925ca0321573d98f16 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 8 Apr 2024 12:53:00 -0400 Subject: [PATCH 07/21] fixed typo --- code/oasis_npi_to_mbi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/oasis_npi_to_mbi.py b/code/oasis_npi_to_mbi.py index 324c7c4..4012078 100644 --- a/code/oasis_npi_to_mbi.py +++ b/code/oasis_npi_to_mbi.py @@ -32,7 +32,7 @@ def select_columns(df): def oasis_merge_mbi_qc(qc_pheno_df, mbi_df): - # Filter qc to only cimaq rows + # Filter qc to only oasis rows qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == 
"oasis3"].copy() # Rename columns in mbi_df so they match From f27c321d270c56284b26a209ba3d9d1362aa53ea Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 10:19:06 -0400 Subject: [PATCH 08/21] now includes pheno info --- code/oasis_npi_to_mbi.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/code/oasis_npi_to_mbi.py b/code/oasis_npi_to_mbi.py index 4012078..b1bdcc8 100644 --- a/code/oasis_npi_to_mbi.py +++ b/code/oasis_npi_to_mbi.py @@ -32,24 +32,21 @@ def select_columns(df): def oasis_merge_mbi_qc(qc_pheno_df, mbi_df): - # Filter qc to only oasis rows - qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "oasis3"].copy() - # Rename columns in mbi_df so they match mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) # Convert ses to integer and strip the d where necessary mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) - qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("d").astype(int) + qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("d").astype(int) # Ensure ordered by session - qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) + qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) mbi_df = mbi_df.sort_values(by=["ses_numeric"]) # Merge to get nearest mbi result within 6 months merged_df = pd.merge_asof( - qc_df_filtered, + qc_pheno_df, mbi_df, by="participant_id", on="ses_numeric", @@ -108,7 +105,7 @@ def first_session_mci_add(merged_df): # Set paths npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + qc_pheno_p = root_p / "outputs/oasis3_qc_pheno.tsv" output_p = root_p / "outputs/final_oasis.tsv" # Load CSVs From 2d10eb89a4d31b43e1058d1f1a88ac558d8756b7 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 10:31:27 -0400 Subject: [PATCH 09/21] now includes pheno info --- code/adni_npi_to_mbi.py | 14 ++++++-------- 
code/cimaq_npi_to_mbi.py | 13 +++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 8d26a25..8ce38ec 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -32,11 +32,9 @@ def select_columns(df): def adni_merge_mbi_qc(qc_pheno_df, mbi_df): - # Filter qc to only adni rows # Grab just the ID part from participant_id, so it matches mbi_df - qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "adni"].copy() - qc_df_filtered["RID"] = ( - qc_df_filtered["participant_id"].str.split("S").str[-1].astype(int) + qc_pheno_df["RID"] = ( + qc_pheno_df["participant_id"].str.split("S").str[-1].astype(int) ) # Rename date field so it matches @@ -48,17 +46,17 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") # Convert sessions to datetime - qc_df_filtered["ses"] = pd.to_datetime(qc_df_filtered["ses"]) + qc_pheno_df["ses"] = pd.to_datetime(qc_pheno_df["ses"]) mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) # Ensure ordered by session - qc_df_filtered = qc_df_filtered.sort_values(by=["ses"]) + qc_pheno_df = qc_pheno_df.sort_values(by=["ses"]) mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing mbi_df = mbi_df.sort_values(by=["ses"]) # Merge to get nearest mbi result within 6 months merged_df = pd.merge_asof( - qc_df_filtered, + qc_pheno_df, mbi_df, by="RID", on="ses", @@ -80,7 +78,7 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): # Set paths npi_p = root_p / "data/adni/NPI_22Aug2023.csv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + qc_pheno_p = root_p / "outputs/adni_qc_pheno.tsv" output_p = root_p / "outputs/final_adni.tsv" # load data diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index 571156e..6c0df16 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -64,28 +64,25 @@ def select_columns(df): def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): - # Filter to only 
cimaq rows - qc_df_filtered = qc_pheno_df.loc[qc_pheno_df["dataset"] == "cimaq"].copy() - # Rename columns in mbi_df so they match mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) # Format id mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) - qc_df_filtered["participant_id"] = qc_df_filtered["participant_id"].astype(int) + qc_pheno_df["participant_id"] = qc_pheno_df["participant_id"].astype(int) # Strip the 'V' from ses and convert to integer mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) - qc_df_filtered["ses_numeric"] = qc_df_filtered["ses"].str.lstrip("V").astype(int) + qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("V").astype(int) # Ensure ordered by session - qc_df_filtered = qc_df_filtered.sort_values(by=["ses_numeric"]) + qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) mbi_df = mbi_df.sort_values(by=["ses_numeric"]) # Merge to get nearest mbi result within 6 months merged_df = pd.merge_asof( - qc_df_filtered, + qc_pheno_df, mbi_df, by="participant_id", on="ses_numeric", @@ -112,7 +109,7 @@ def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): # Set paths npi_p = root_p / "data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + qc_pheno_p = root_p / "outputs/cimaq_qc_pheno.tsv" output_p = root_p / "outputs/final_cimaq.tsv" # Load CSVs From 0b50768efb0089dd2095ed21e7351ed5ed42d1dd Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 16 Apr 2024 10:48:38 -0400 Subject: [PATCH 10/21] update which qc df --- code/adni_npi_to_mbi.py | 9 ++++++--- code/cimaq_npi_to_mbi.py | 5 ++++- code/oasis_npi_to_mbi.py | 9 ++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 8ce38ec..8c8502d 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -78,14 +78,17 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): # Set paths 
npi_p = root_p / "data/adni/NPI_22Aug2023.csv" - qc_pheno_p = root_p / "outputs/adni_qc_pheno.tsv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" output_p = root_p / "outputs/final_adni.tsv" - # load data + # Load data npi_df = pd.read_csv(npi_p) qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") - # convert NPI to MBI and calculate total score + # Filter for dataset + qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "adni"] + + # Convert NPI to MBI and calculate total score mbi_df = adni_npi_to_mbi(npi_df) mbi_df = util.calculate_mbi_score(mbi_df) mbi_df = select_columns(mbi_df) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index 6c0df16..f309ef0 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -109,13 +109,16 @@ def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): # Set paths npi_p = root_p / "data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" - qc_pheno_p = root_p / "outputs/cimaq_qc_pheno.tsv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" output_p = root_p / "outputs/final_cimaq.tsv" # Load CSVs npi_df = pd.read_csv(npi_p, sep="\t") qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + # Filter for dataset + qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "cimaq"] + # Convert NPI to MBI and calculate total score npi_df = map_values(npi_df) mbi_df = cimaq_npi_to_mbi(npi_df) diff --git a/code/oasis_npi_to_mbi.py b/code/oasis_npi_to_mbi.py index b1bdcc8..5d2d2d7 100644 --- a/code/oasis_npi_to_mbi.py +++ b/code/oasis_npi_to_mbi.py @@ -105,14 +105,17 @@ def first_session_mci_add(merged_df): # Set paths npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv" - qc_pheno_p = root_p / "outputs/oasis3_qc_pheno.tsv" - output_p = root_p / "outputs/final_oasis.tsv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + output_p = root_p / "outputs/final_oasis3.tsv" # Load CSVs npi_df = pd.read_csv(npi_p) qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") - # convert NPI to MBI and calculate total score + # Filter for dataset 
+ qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "oasis3"] + + # Convert NPI to MBI and calculate total score mbi_df = oasis_npi_to_mbi(npi_df) mbi_df = util.calculate_mbi_score(mbi_df) mbi_df = select_columns(mbi_df) From b11c0065d18e9ec90f2ac30a4d95579045cce268 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 23 Apr 2024 14:01:48 -0400 Subject: [PATCH 11/21] update oasis file name --- code/oasis3_npi_to_mbi.py | 134 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 code/oasis3_npi_to_mbi.py diff --git a/code/oasis3_npi_to_mbi.py b/code/oasis3_npi_to_mbi.py new file mode 100644 index 0000000..5d2d2d7 --- /dev/null +++ b/code/oasis3_npi_to_mbi.py @@ -0,0 +1,134 @@ +import pandas as pd +import numpy as np +import argparse +import util + +from pathlib import Path + + +def oasis_npi_to_mbi(df): + df["decreased_motivation"] = df["APA"] + df["emotional_dysregulation"] = df["DEPD"] + df["ANX"] + df["ELAT"] + df["impulse_dyscontrol"] = df["AGIT"] + df["IRR"] + df["MOT"] + df["social_inappropriateness"] = df["DISN"] + df["abnormal_perception"] = df["DEL"] + df["HALL"] + return df + + +def select_columns(df): + columns = [ + "OASISID", + "days_to_visit", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +def oasis_merge_mbi_qc(qc_pheno_df, mbi_df): + # Rename columns in mbi_df so they match + mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) + mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) + + # Convert ses to integer and strip the d where necessary + mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) + qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("d").astype(int) + + # Ensure ordered by session + qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) + mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + + # Merge to 
get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_pheno_df, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(183), + ) + + # Handle session columns + merged_df.drop(columns=["ses_y"], inplace=True) + merged_df.rename(columns={"ses_x": "ses"}, inplace=True) + merged_df.drop(columns=["ses_numeric"], inplace=True) + + return merged_df + + +def first_session_controls(merged_df): + # Filter for controls + controls_df = merged_df[merged_df["diagnosis"] == "CON"] + + # Identify the first session for each participant + first_sessions = controls_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original controls_df + # This will filter controls_df to only include rows that match the first session for each participant + first_session_controls = pd.merge( + controls_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_controls + + +def first_session_mci_add(merged_df): + # Filter for participants with a diagnosis of "MCI" or "ADD" and a non-empty mbi_status + mci_add_df = merged_df[ + (merged_df["diagnosis"].isin(["MCI", "ADD"])) & merged_df["mbi_status"].notna() + ] + + # Identify the first session for each MCI/ADD participant + first_sessions = mci_add_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original mci_add_df + # This will filter mci_add_df to only include rows that match the first session for each participant + first_session_mci_add = pd.merge( + mci_add_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_mci_add + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" + ) + parser.add_argument("rootpath", type=Path, help="Root path") + + args = parser.parse_args() + root_p = args.rootpath + + # Set paths + npi_p = root_p / 
"data/oasis3/OASIS3_UDSb5_npiq.csv" + qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" + output_p = root_p / "outputs/final_oasis3.tsv" + + # Load CSVs + npi_df = pd.read_csv(npi_p) + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + + # Filter for dataset + qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "oasis3"] + + # Convert NPI to MBI and calculate total score + mbi_df = oasis_npi_to_mbi(npi_df) + mbi_df = util.calculate_mbi_score(mbi_df) + mbi_df = select_columns(mbi_df) + + # Merge mbi data with qc_pheno data + merged_df = oasis_merge_mbi_qc(qc_pheno_df, mbi_df) + + # Select scans. Find the first available session for each participant. For MCI and ADD they must have an mbi score, for controls it does not matter + # This approach retains multiple runs from the same session + control_df = first_session_controls(merged_df) + mci_add_df = first_session_mci_add(merged_df) + + final_oasis = pd.concat([control_df, mci_add_df], ignore_index=True) + + # Output df + final_oasis.to_csv(output_p, sep="\t", index=False) From fde4fc5707d8a259b90dacdc53762bebaf48fc61 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 23 Apr 2024 14:26:14 -0400 Subject: [PATCH 12/21] fix data loading bug --- code/adni_npi_to_mbi.py | 2 +- code/cimaq_npi_to_mbi.py | 2 +- code/oasis3_npi_to_mbi.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py index 8c8502d..c2785f7 100644 --- a/code/adni_npi_to_mbi.py +++ b/code/adni_npi_to_mbi.py @@ -83,7 +83,7 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): # Load data npi_df = pd.read_csv(npi_p) - qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) # Filter for dataset qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "adni"] diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py index f309ef0..677b0c1 100644 --- a/code/cimaq_npi_to_mbi.py +++ b/code/cimaq_npi_to_mbi.py @@ -114,7 +114,7 @@ def 
cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): # Load CSVs npi_df = pd.read_csv(npi_p, sep="\t") - qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) # Filter for dataset qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "cimaq"] diff --git a/code/oasis3_npi_to_mbi.py b/code/oasis3_npi_to_mbi.py index 5d2d2d7..85e674c 100644 --- a/code/oasis3_npi_to_mbi.py +++ b/code/oasis3_npi_to_mbi.py @@ -110,7 +110,7 @@ def first_session_mci_add(merged_df): # Load CSVs npi_df = pd.read_csv(npi_p) - qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t") + qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) # Filter for dataset qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "oasis3"] From e5f8afd1686b6faa595128a696b6d21d3ccf0720 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 29 Apr 2024 10:57:17 -0400 Subject: [PATCH 13/21] combined scripts into one --- code/adni_npi_to_mbi.py | 106 ---------------- code/cimaq_npi_to_mbi.py | 138 --------------------- code/merge_mbi.py | 108 ++++++++++++++++ code/oasis3_npi_to_mbi.py | 134 -------------------- code/oasis_npi_to_mbi.py | 134 -------------------- code/util.py | 252 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 360 insertions(+), 512 deletions(-) delete mode 100644 code/adni_npi_to_mbi.py delete mode 100644 code/cimaq_npi_to_mbi.py create mode 100644 code/merge_mbi.py delete mode 100644 code/oasis3_npi_to_mbi.py delete mode 100644 code/oasis_npi_to_mbi.py diff --git a/code/adni_npi_to_mbi.py b/code/adni_npi_to_mbi.py deleted file mode 100644 index c2785f7..0000000 --- a/code/adni_npi_to_mbi.py +++ /dev/null @@ -1,106 +0,0 @@ -import pandas as pd -import numpy as np -import argparse -import util - -from pathlib import Path - - -def adni_npi_to_mbi(df): - df["decreased_motivation"] = df["NPIG"] - df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] - df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] - 
df["social_inappropriateness"] = df["NPIH"] - df["abnormal_perception"] = df["NPIA"] + df["NPIB"] - return df - - -def select_columns(df): - columns = [ - "RID", - "EXAMDATE", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - df = df[columns].copy() - return df - - -def adni_merge_mbi_qc(qc_pheno_df, mbi_df): - # Grab just the ID part from participant_id, so it matches mbi_df - qc_pheno_df["RID"] = ( - qc_pheno_df["participant_id"].str.split("S").str[-1].astype(int) - ) - - # Rename date field so it matches - mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) - - # Replace some rougue dates - mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") - mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") - mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") - - # Convert sessions to datetime - qc_pheno_df["ses"] = pd.to_datetime(qc_pheno_df["ses"]) - mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) - - # Ensure ordered by session - qc_pheno_df = qc_pheno_df.sort_values(by=["ses"]) - mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing - mbi_df = mbi_df.sort_values(by=["ses"]) - - # Merge to get nearest mbi result within 6 months - merged_df = pd.merge_asof( - qc_pheno_df, - mbi_df, - by="RID", - on="ses", - direction="nearest", - tolerance=pd.Timedelta(days=183), - ) - - return merged_df - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert NPI to MBI, merge with QC and pheno data for ADNI" - ) - parser.add_argument("rootpath", type=Path, help="Root path") - - args = parser.parse_args() - root_p = args.rootpath - - # Set paths - npi_p = root_p / "data/adni/NPI_22Aug2023.csv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" - output_p = root_p / "outputs/final_adni.tsv" - - # Load data - npi_df = pd.read_csv(npi_p) - qc_pheno_df = 
pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) - - # Filter for dataset - qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "adni"] - - # Convert NPI to MBI and calculate total score - mbi_df = adni_npi_to_mbi(npi_df) - mbi_df = util.calculate_mbi_score(mbi_df) - mbi_df = select_columns(mbi_df) - - merged_df = adni_merge_mbi_qc(qc_pheno_df, mbi_df) - - # Select scans. For controls, we take the first available scan. For MCI and ADD, take the first with an MBI score - final_adni = ( - merged_df.groupby(["participant_id"], as_index=False) - .apply(util.select_row) - .reset_index(drop=True) - ) - - # Output df - final_adni.to_csv(output_p, sep="\t", index=False) diff --git a/code/cimaq_npi_to_mbi.py b/code/cimaq_npi_to_mbi.py deleted file mode 100644 index 677b0c1..0000000 --- a/code/cimaq_npi_to_mbi.py +++ /dev/null @@ -1,138 +0,0 @@ -import pandas as pd -import numpy as np -import argparse -import util -from pathlib import Path - - -def map_values(df): - # Drop rows with some data unavailable - df = df[df["22901_score"] != "donnée_non_disponible"].copy() - - # Map scores to numerical values - mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} - - columns_to_map = [ - "22901_apathie", - "22901_depression_dysphorie", - "22901_anxiete", - "22901_euphorie", - "22901_agitation_aggressivite", - "22901_irritabilite", - "22901_comp_moteur_aberrant", - "22901_impulsivite", - "22901_idees_delirantes", - "22901_hallucinations", - ] - - for column in columns_to_map: - df[column] = df[column].map(mapping) - return df - - -def cimaq_npi_to_mbi(df): - df["decreased_motivation"] = df["22901_apathie"] - df["emotional_dysregulation"] = ( - df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] - ) - df["impulse_dyscontrol"] = ( - df["22901_agitation_aggressivite"] - + df["22901_irritabilite"] - + df["22901_comp_moteur_aberrant"] - ) - df["social_inappropriateness"] = df["22901_impulsivite"] - df["abnormal_perception"] = ( - 
df["22901_idees_delirantes"] + df["22901_hallucinations"] - ) - return df - - -def select_columns(df): - columns = [ - "pscid", - "no_visite", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - df = df[columns].copy() - return df - - -def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): - # Rename columns in mbi_df so they match - mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) - mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) - - # Format id - mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) - qc_pheno_df["participant_id"] = qc_pheno_df["participant_id"].astype(int) - - # Strip the 'V' from ses and convert to integer - mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) - qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("V").astype(int) - - # Ensure ordered by session - qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) - mbi_df = mbi_df.sort_values(by=["ses_numeric"]) - - # Merge to get nearest mbi result within 6 months - merged_df = pd.merge_asof( - qc_pheno_df, - mbi_df, - by="participant_id", - on="ses_numeric", - direction="nearest", - tolerance=(6), - ) - - # Handle session columns - merged_df.drop(columns=["ses_y"], inplace=True) - merged_df.rename(columns={"ses_x": "ses"}, inplace=True) - merged_df.drop(columns=["ses_numeric"], inplace=True) - - return merged_df - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" - ) - parser.add_argument("rootpath", type=Path, help="Root path") - - args = parser.parse_args() - root_p = args.rootpath - - # Set paths - npi_p = root_p / "data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" - output_p = root_p / "outputs/final_cimaq.tsv" - - # Load CSVs - npi_df = pd.read_csv(npi_p, sep="\t") 
import pandas as pd
import numpy as np
import argparse
import util

from pathlib import Path


def process_adni_mbi(qc_pheno_df):
    """Build the final ADNI frame: NPI items -> MBI scores merged onto QC rows.

    Reads the ADNI NPI export under ``root_p`` (module-level, bound in
    ``__main__``), keeps only rows for the ``adni`` dataset, converts NPI
    items to MBI domain scores, and attaches the nearest MBI assessment
    (within 6 months) to each scan via ``util.adni_merge_mbi_qc``.
    """
    # Set path and load data
    npi_p = root_p / "data/adni/NPI_22Aug2023.csv"
    npi_df = pd.read_csv(npi_p)

    # Filter for dataset
    qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "adni"]

    # Convert NPI to MBI and calculate total score
    mbi_df = util.adni_npi_to_mbi(npi_df)
    mbi_df = util.calculate_mbi_score(mbi_df)
    mbi_df = util.adni_select_columns(mbi_df)

    # Merge mbi data with qc_pheno data
    merged_df = util.adni_merge_mbi_qc(qc_pheno_df, mbi_df)

    # Select scans. For controls, take the first available scan.
    # For MCI and ADD, take the first with an MBI score.
    final_adni = (
        merged_df.groupby(["participant_id"], as_index=False)
        .apply(util.select_row)
        .reset_index(drop=True)
    )

    return final_adni


def process_cimaq_mbi(qc_pheno_df):
    """Build the final CIMA-Q frame: NPI items -> MBI scores merged onto QC rows.

    Same pipeline as :func:`process_adni_mbi` but with the CIMA-Q French NPI
    export, which first needs its answer labels mapped to 0-3 severities.
    """
    # Set path and load data
    npi_p = root_p / "data/cimaq/22901_inventaire_neuropsychiatrique_q.tsv"
    npi_df = pd.read_csv(npi_p, sep="\t")

    # Filter for dataset
    qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "cimaq"]

    # Convert NPI to MBI and calculate total score
    npi_df = util.cimaq_map_values(npi_df)
    mbi_df = util.cimaq_npi_to_mbi(npi_df)
    mbi_df = util.calculate_mbi_score(mbi_df)
    mbi_df = util.cimaq_select_columns(mbi_df)

    # Merge mbi data with qc_pheno data
    merged_df = util.cimaq_merge_mbi_qc(qc_pheno_df, mbi_df)

    # Select scans. For controls, take the first available scan.
    # For MCI and ADD, take the first with an MBI score.
    final_cimaq = (
        merged_df.groupby(["participant_id"], as_index=False)
        .apply(util.select_row)
        .reset_index(drop=True)
    )

    return final_cimaq


def process_oasis3_mbi(qc_pheno_df):
    """Build the final OASIS3 frame: NPI-Q items -> MBI scores merged onto QC rows.

    Unlike the other datasets, session selection keeps every run from a
    participant's first eligible session (see the util first_session helpers).
    """
    # Set path and load data
    npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv"
    npi_df = pd.read_csv(npi_p)

    # Filter for dataset
    qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "oasis3"]

    # Convert NPI to MBI and calculate total score
    mbi_df = util.oasis3_npi_to_mbi(npi_df)
    mbi_df = util.calculate_mbi_score(mbi_df)
    mbi_df = util.oasis3_select_columns(mbi_df)

    # Merge mbi data with qc_pheno data
    merged_df = util.oasis3_merge_mbi_qc(qc_pheno_df, mbi_df)

    # Select scans: first available session per participant. MCI/ADD must
    # have an MBI score; controls need not. Retains multiple runs per session.
    control_df = util.first_session_controls(merged_df)
    mci_add_df = util.first_session_mci_add(merged_df)

    final_oasis3 = pd.concat([control_df, mci_add_df], ignore_index=True)

    return final_oasis3


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert NPI to MBI, merge with QC and pheno data for ADNI, CIMA-Q and OASIS3"
    )
    parser.add_argument("rootpath", type=Path, help="Root path")

    args = parser.parse_args()
    root_p = args.rootpath

    # Load passed_qc_master once
    qc_pheno_p = root_p / "outputs/passed_qc_master.tsv"
    qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False)

    # Process mbi scores for specific datasets
    final_adni = process_adni_mbi(qc_pheno_df)
    final_cimaq = process_cimaq_mbi(qc_pheno_df)
    final_oasis3 = process_oasis3_mbi(qc_pheno_df)

    # Output all three dataframes. BUG FIX: previously only final_oasis3 was
    # written, silently discarding the ADNI and CIMA-Q results.
    final_adni.to_csv(root_p / "outputs/final_adni.tsv", sep="\t", index=False)
    final_cimaq.to_csv(root_p / "outputs/final_cimaq.tsv", sep="\t", index=False)
    final_oasis3.to_csv(root_p / "outputs/final_oasis3.tsv", sep="\t", index=False)
oasis_merge_mbi_qc(qc_pheno_df, mbi_df): - # Rename columns in mbi_df so they match - mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) - mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) - - # Convert ses to integer and strip the d where necessary - mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) - qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("d").astype(int) - - # Ensure ordered by session - qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) - mbi_df = mbi_df.sort_values(by=["ses_numeric"]) - - # Merge to get nearest mbi result within 6 months - merged_df = pd.merge_asof( - qc_pheno_df, - mbi_df, - by="participant_id", - on="ses_numeric", - direction="nearest", - tolerance=(183), - ) - - # Handle session columns - merged_df.drop(columns=["ses_y"], inplace=True) - merged_df.rename(columns={"ses_x": "ses"}, inplace=True) - merged_df.drop(columns=["ses_numeric"], inplace=True) - - return merged_df - - -def first_session_controls(merged_df): - # Filter for controls - controls_df = merged_df[merged_df["diagnosis"] == "CON"] - - # Identify the first session for each participant - first_sessions = controls_df.groupby("participant_id")["ses"].min().reset_index() - - # Merge the first_sessions information back with the original controls_df - # This will filter controls_df to only include rows that match the first session for each participant - first_session_controls = pd.merge( - controls_df, first_sessions, on=["participant_id", "ses"] - ) - return first_session_controls - - -def first_session_mci_add(merged_df): - # Filter for participants with a diagnosis of "MCI" or "ADD" and a non-empty mbi_status - mci_add_df = merged_df[ - (merged_df["diagnosis"].isin(["MCI", "ADD"])) & merged_df["mbi_status"].notna() - ] - - # Identify the first session for each MCI/ADD participant - first_sessions = mci_add_df.groupby("participant_id")["ses"].min().reset_index() - - # Merge the first_sessions information back with the original 
mci_add_df - # This will filter mci_add_df to only include rows that match the first session for each participant - first_session_mci_add = pd.merge( - mci_add_df, first_sessions, on=["participant_id", "ses"] - ) - return first_session_mci_add - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" - ) - parser.add_argument("rootpath", type=Path, help="Root path") - - args = parser.parse_args() - root_p = args.rootpath - - # Set paths - npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" - output_p = root_p / "outputs/final_oasis3.tsv" - - # Load CSVs - npi_df = pd.read_csv(npi_p) - qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) - - # Filter for dataset - qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "oasis3"] - - # Convert NPI to MBI and calculate total score - mbi_df = oasis_npi_to_mbi(npi_df) - mbi_df = util.calculate_mbi_score(mbi_df) - mbi_df = select_columns(mbi_df) - - # Merge mbi data with qc_pheno data - merged_df = oasis_merge_mbi_qc(qc_pheno_df, mbi_df) - - # Select scans. Find the first available session for each participant. 
For MCI and ADD they must have an mbi score, for controls it does not matter - # This approach retains multiple runs from the same session - control_df = first_session_controls(merged_df) - mci_add_df = first_session_mci_add(merged_df) - - final_oasis = pd.concat([control_df, mci_add_df], ignore_index=True) - - # Output df - final_oasis.to_csv(output_p, sep="\t", index=False) diff --git a/code/oasis_npi_to_mbi.py b/code/oasis_npi_to_mbi.py deleted file mode 100644 index 5d2d2d7..0000000 --- a/code/oasis_npi_to_mbi.py +++ /dev/null @@ -1,134 +0,0 @@ -import pandas as pd -import numpy as np -import argparse -import util - -from pathlib import Path - - -def oasis_npi_to_mbi(df): - df["decreased_motivation"] = df["APA"] - df["emotional_dysregulation"] = df["DEPD"] + df["ANX"] + df["ELAT"] - df["impulse_dyscontrol"] = df["AGIT"] + df["IRR"] + df["MOT"] - df["social_inappropriateness"] = df["DISN"] - df["abnormal_perception"] = df["DEL"] + df["HALL"] - return df - - -def select_columns(df): - columns = [ - "OASISID", - "days_to_visit", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - df = df[columns].copy() - return df - - -def oasis_merge_mbi_qc(qc_pheno_df, mbi_df): - # Rename columns in mbi_df so they match - mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) - mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) - - # Convert ses to integer and strip the d where necessary - mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) - qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("d").astype(int) - - # Ensure ordered by session - qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) - mbi_df = mbi_df.sort_values(by=["ses_numeric"]) - - # Merge to get nearest mbi result within 6 months - merged_df = pd.merge_asof( - qc_pheno_df, - mbi_df, - by="participant_id", - on="ses_numeric", - direction="nearest", - 
tolerance=(183), - ) - - # Handle session columns - merged_df.drop(columns=["ses_y"], inplace=True) - merged_df.rename(columns={"ses_x": "ses"}, inplace=True) - merged_df.drop(columns=["ses_numeric"], inplace=True) - - return merged_df - - -def first_session_controls(merged_df): - # Filter for controls - controls_df = merged_df[merged_df["diagnosis"] == "CON"] - - # Identify the first session for each participant - first_sessions = controls_df.groupby("participant_id")["ses"].min().reset_index() - - # Merge the first_sessions information back with the original controls_df - # This will filter controls_df to only include rows that match the first session for each participant - first_session_controls = pd.merge( - controls_df, first_sessions, on=["participant_id", "ses"] - ) - return first_session_controls - - -def first_session_mci_add(merged_df): - # Filter for participants with a diagnosis of "MCI" or "ADD" and a non-empty mbi_status - mci_add_df = merged_df[ - (merged_df["diagnosis"].isin(["MCI", "ADD"])) & merged_df["mbi_status"].notna() - ] - - # Identify the first session for each MCI/ADD participant - first_sessions = mci_add_df.groupby("participant_id")["ses"].min().reset_index() - - # Merge the first_sessions information back with the original mci_add_df - # This will filter mci_add_df to only include rows that match the first session for each participant - first_session_mci_add = pd.merge( - mci_add_df, first_sessions, on=["participant_id", "ses"] - ) - return first_session_mci_add - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert NPI to MBI, merge with QC and pheno data for OASIS3" - ) - parser.add_argument("rootpath", type=Path, help="Root path") - - args = parser.parse_args() - root_p = args.rootpath - - # Set paths - npi_p = root_p / "data/oasis3/OASIS3_UDSb5_npiq.csv" - qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" - output_p = root_p / "outputs/final_oasis3.tsv" - - # Load CSVs - npi_df = 
def adni_npi_to_mbi(df):
    """Derive the five MBI domain scores from ADNI NPI item columns.

    Adds one column per MBI domain to *df* (mutates it in place) and returns
    the same frame for chaining.
    """
    domain_items = {
        "decreased_motivation": ["NPIG"],
        "emotional_dysregulation": ["NPID", "NPIE", "NPIF"],
        "impulse_dyscontrol": ["NPIC", "NPII", "NPIJ"],
        "social_inappropriateness": ["NPIH"],
        "abnormal_perception": ["NPIA", "NPIB"],
    }
    for domain, items in domain_items.items():
        df[domain] = sum(df[item] for item in items)
    return df


def adni_select_columns(df):
    """Return a copy of *df* restricted to the ADNI keys and MBI output columns."""
    keep = [
        "RID",
        "EXAMDATE",
        "decreased_motivation",
        "emotional_dysregulation",
        "impulse_dyscontrol",
        "social_inappropriateness",
        "abnormal_perception",
        "mbi_total_score",
        "mbi_status",
    ]
    return df[keep].copy()


def adni_merge_mbi_qc(qc_pheno_df, mbi_df):
    """Attach the nearest MBI assessment (within ~6 months) to each QC row.

    Operates on copies so the caller's frames are untouched. Matches on the
    numeric ADNI RID extracted from ``participant_id`` and merges with
    ``pd.merge_asof`` on the session date.
    """
    qc = qc_pheno_df.copy()
    mbi = mbi_df.copy()

    # participant_id ends in S<RID>; keep the numeric RID so it lines up
    # with the NPI export.
    qc["RID"] = qc["participant_id"].str.split("S").str[-1].astype(int)

    # Align the date column name with qc.
    mbi = mbi.rename(columns={"EXAMDATE": "ses"})

    # Replace some rogue dates (century mangled in the export).
    date_fixes = {
        "0012-02-14": "2012-02-14",
        "0013-05-06": "2013-05-06",
        "0013-10-28": "2013-10-28",
    }
    mbi["ses"] = mbi["ses"].replace(date_fixes)

    # Convert sessions to datetime.
    qc["ses"] = pd.to_datetime(qc["ses"])
    mbi["ses"] = pd.to_datetime(mbi["ses"])

    # merge_asof needs both frames sorted on the key; drop MBI rows with a
    # missing exam date first (some were missing).
    qc = qc.sort_values(by=["ses"])
    mbi = mbi.dropna(subset=["ses"]).sort_values(by=["ses"])

    # Nearest MBI result within 6 months.
    return pd.merge_asof(
        qc,
        mbi,
        by="RID",
        on="ses",
        direction="nearest",
        tolerance=pd.Timedelta(days=183),
    )


def cimaq_map_values(df):
    """Convert CIMA-Q NPI answers from French labels to 0-3 severity scores.

    Rows whose overall score is marked unavailable are dropped first; the
    result is a copy, the input frame is not modified.
    """
    out = df[df["22901_score"] != "donnée_non_disponible"].copy()

    severity = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3}
    item_columns = (
        "22901_apathie",
        "22901_depression_dysphorie",
        "22901_anxiete",
        "22901_euphorie",
        "22901_agitation_aggressivite",
        "22901_irritabilite",
        "22901_comp_moteur_aberrant",
        "22901_impulsivite",
        "22901_idees_delirantes",
        "22901_hallucinations",
    )
    for col in item_columns:
        out[col] = out[col].map(severity)
    return out
df["social_inappropriateness"] = df["22901_impulsivite"] + df["abnormal_perception"] = ( + df["22901_idees_delirantes"] + df["22901_hallucinations"] + ) + return df + + +def cimaq_select_columns(df): + columns = [ + "pscid", + "no_visite", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +def cimaq_merge_mbi_qc(qc_pheno_df, mbi_df): + qc_pheno_df = qc_pheno_df.copy() + mbi_df = mbi_df.copy() + + # Rename columns in mbi_df so they match + mbi_df.rename(columns={"pscid": "participant_id"}, inplace=True) + mbi_df.rename(columns={"no_visite": "ses"}, inplace=True) + + # Format id + mbi_df["participant_id"] = mbi_df["participant_id"].astype(int) + qc_pheno_df["participant_id"] = qc_pheno_df["participant_id"].astype(int) + + # Strip the 'V' from ses and convert to integer + mbi_df["ses_numeric"] = mbi_df["ses"].str.lstrip("V").astype(int) + qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("V").astype(int) + + # Ensure ordered by session + qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) + mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + + # Merge to get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_pheno_df, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(6), + ) + + # Handle session columns + merged_df.drop(columns=["ses_y"], inplace=True) + merged_df.rename(columns={"ses_x": "ses"}, inplace=True) + merged_df.drop(columns=["ses_numeric"], inplace=True) + + return merged_df + + +def oasis3_npi_to_mbi(df): + df["decreased_motivation"] = df["APA"] + df["emotional_dysregulation"] = df["DEPD"] + df["ANX"] + df["ELAT"] + df["impulse_dyscontrol"] = df["AGIT"] + df["IRR"] + df["MOT"] + df["social_inappropriateness"] = df["DISN"] + df["abnormal_perception"] = df["DEL"] + df["HALL"] + return df + + +def oasis3_select_columns(df): + 
columns = [ + "OASISID", + "days_to_visit", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + df = df[columns].copy() + return df + + +def oasis3_merge_mbi_qc(qc_pheno_df, mbi_df): + qc_pheno_df = qc_pheno_df.copy() + mbi_df = mbi_df.copy() + + # Rename columns in mbi_df so they match + mbi_df.rename(columns={"OASISID": "participant_id"}, inplace=True) + mbi_df.rename(columns={"days_to_visit": "ses"}, inplace=True) + + # Convert ses to integer and strip the d where necessary + mbi_df["ses_numeric"] = mbi_df["ses"].astype(int) + qc_pheno_df["ses_numeric"] = qc_pheno_df["ses"].str.lstrip("d").astype(int) + + # Ensure ordered by session + qc_pheno_df = qc_pheno_df.sort_values(by=["ses_numeric"]) + mbi_df = mbi_df.sort_values(by=["ses_numeric"]) + + # Merge to get nearest mbi result within 6 months + merged_df = pd.merge_asof( + qc_pheno_df, + mbi_df, + by="participant_id", + on="ses_numeric", + direction="nearest", + tolerance=(183), + ) + + # Handle session columns + merged_df.drop(columns=["ses_y"], inplace=True) + merged_df.rename(columns={"ses_x": "ses"}, inplace=True) + merged_df.drop(columns=["ses_numeric"], inplace=True) + + return merged_df + + +def first_session_controls(merged_df): + # Filter for controls + controls_df = merged_df[merged_df["diagnosis"] == "CON"] + + # Identify the first session for each participant + first_sessions = controls_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original controls_df + # This will filter controls_df to only include rows that match the first session for each participant + first_session_controls = pd.merge( + controls_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_controls + + +def first_session_mci_add(merged_df): + # Filter for participants with a diagnosis of "MCI" or "ADD" and a non-empty 
mbi_status + mci_add_df = merged_df[ + (merged_df["diagnosis"].isin(["MCI", "ADD"])) & merged_df["mbi_status"].notna() + ] + + # Identify the first session for each MCI/ADD participant + first_sessions = mci_add_df.groupby("participant_id")["ses"].min().reset_index() + + # Merge the first_sessions information back with the original mci_add_df + # This will filter mci_add_df to only include rows that match the first session for each participant + first_session_mci_add = pd.merge( + mci_add_df, first_sessions, on=["participant_id", "ses"] + ) + return first_session_mci_add From 4aadb2946aa0e7fb69ebd8fe2e0c182413faf335 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 29 Apr 2024 11:11:03 -0400 Subject: [PATCH 14/21] dropped column not needed --- code/merge_mbi.py | 15 +++++++++++++-- code/util.py | 2 ++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/code/merge_mbi.py b/code/merge_mbi.py index a34a6d9..c3c4edb 100644 --- a/code/merge_mbi.py +++ b/code/merge_mbi.py @@ -103,6 +103,17 @@ def process_oasis3_mbi(qc_pheno_df): final_cimaq = process_cimaq_mbi(qc_pheno_df) final_oasis3 = process_oasis3_mbi(qc_pheno_df) + # Remove existing data for specific datasets from qc_pheno_df + remaining_qc_pheno_df = qc_pheno_df[ + ~qc_pheno_df["dataset"].isin(["adni", "cimaq", "oasis3"]) + ] + + # Concatenate the remaining with the new processed mbi data + updated_qc_pheno_df = pd.concat( + [remaining_qc_pheno_df, final_adni, final_cimaq, final_oasis3], + ignore_index=True, + ) + # Output df - output_p = root_p / "outputs/final_oasis3.tsv" - final_oasis3.to_csv(output_p, sep="\t", index=False) + output_p = root_p / "outputs/final_master_pheno.tsv" + updated_qc_pheno_df.to_csv(output_p, sep="\t", index=False) diff --git a/code/util.py b/code/util.py index 9ff11fe..e7ecf4a 100644 --- a/code/util.py +++ b/code/util.py @@ -96,6 +96,8 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): tolerance=pd.Timedelta(days=183), ) + merged_df = merged_df.drop("RID", axis=1) + return 
merged_df From 10b294c858633801aa69da92c15bc5ed870a22e1 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 1 May 2024 09:16:21 -0400 Subject: [PATCH 15/21] refactored code --- code/{merge_mbi.py => create_final_pheno.py} | 94 ++++++++++++++++---- code/util.py | 5 +- 2 files changed, 81 insertions(+), 18 deletions(-) rename code/{merge_mbi.py => create_final_pheno.py} (59%) diff --git a/code/merge_mbi.py b/code/create_final_pheno.py similarity index 59% rename from code/merge_mbi.py rename to code/create_final_pheno.py index c3c4edb..f74a482 100644 --- a/code/merge_mbi.py +++ b/code/create_final_pheno.py @@ -85,6 +85,76 @@ def process_oasis3_mbi(qc_pheno_df): return final_oasis3 +def assign_mbi_group(row): + if row["diagnosis"] in ["ADD", "ADD(M)"]: + if row["mbi_status"] == 1: + return "ADD+" + elif row["mbi_status"] == 0: + return "ADD-" + elif row["diagnosis"] in ["MCI", "EMCI", "LMCI"]: + if row["mbi_status"] == 1: + return "MCI+" + elif row["mbi_status"] == 0: + return "MCI-" + elif row["diagnosis"] == "CON": + return "CON-AD" + + +def assign_sz_group(row): + if row["diagnosis"] in ["SCHZ"]: + return "SCHZ" + elif row["diagnosis"] == "CON": + return "CON-SCHZ" + + +def create_final_ad_df(qc_pheno_df): + # Process mbi scores for specific datasets + final_adni = process_adni_mbi(qc_pheno_df) + final_cimaq = process_cimaq_mbi(qc_pheno_df) + final_oasis3 = process_oasis3_mbi(qc_pheno_df) + + # Concatenate these Alzheimer datasets + ad_datasets_df = pd.concat( + [final_adni, final_cimaq, final_oasis3], + ignore_index=True, + ) + + # Assign group based on diagnosis and MBI status + ad_datasets_df["group"] = ad_datasets_df.apply(assign_mbi_group, axis=1) + + # Save Alzheimer datasets with MBI data for further analysis + ad_datasets_df.to_csv(root_p / "outputs/ad_datasets_df.tsv", sep="\t", index=False) + + # Drop MBI columns, not needed for further analysis + ad_datasets_df = ad_datasets_df.drop( + [ + "decreased_motivation", + "emotional_dysregulation", + 
"impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ], + axis=1, + ) + + return ad_datasets_df + + +def create_final_sz_df(qc_pheno_df): + # Select schizophrenia datasets from qc_pheno + sz_datasets_df = qc_pheno_df[ + qc_pheno_df["dataset"].isin(["hcpep", "cobre", "srpbs", "ds000030"]) + ].copy() + + # Re-code diagnosis and assign group + sz_datasets_df["diagnosis"] = sz_datasets_df["diagnosis"].replace("PSYC", "SCHZ") + sz_datasets_df["group"] = sz_datasets_df.apply(assign_sz_group, axis=1) + + return sz_datasets_df + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Convert NPI to MBI, merge with QC and pheno data for ADNI, CIMA-Q and OASIS3" @@ -98,22 +168,16 @@ def process_oasis3_mbi(qc_pheno_df): qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) - # Process mbi scores for specific datasets - final_adni = process_adni_mbi(qc_pheno_df) - final_cimaq = process_cimaq_mbi(qc_pheno_df) - final_oasis3 = process_oasis3_mbi(qc_pheno_df) - - # Remove existing data for specific datasets from qc_pheno_df - remaining_qc_pheno_df = qc_pheno_df[ - ~qc_pheno_df["dataset"].isin(["adni", "cimaq", "oasis3"]) - ] + # Create dfs for different diagnosis datasets + ad_datasets_df = create_final_ad_df(qc_pheno_df) + sz_datasets_df = create_final_sz_df(qc_pheno_df) - # Concatenate the remaining with the new processed mbi data - updated_qc_pheno_df = pd.concat( - [remaining_qc_pheno_df, final_adni, final_cimaq, final_oasis3], + # Concatenate the two sets of datasets + final_qc_pheno_df = pd.concat( + [ad_datasets_df, sz_datasets_df], ignore_index=True, ) - # Output df - output_p = root_p / "outputs/final_master_pheno.tsv" - updated_qc_pheno_df.to_csv(output_p, sep="\t", index=False) + # Save Alzheimer datasets + output_p = root_p / "outputs/final_qc_pheno.tsv" + final_qc_pheno_df.to_csv(output_p, sep="\t", index=False) diff 
--git a/code/util.py b/code/util.py index e7ecf4a..f8327f4 100644 --- a/code/util.py +++ b/code/util.py @@ -18,7 +18,6 @@ def calculate_mbi_score(df): df.loc[df[mbi_domains].isna().all(axis=1), "mbi_total_score"] = np.nan # Calculate mbi_status based on mbi_total_score - # Set mbi_status to NaN where mbi_total_score is NaN df["mbi_status"] = np.where( df["mbi_total_score"].isna(), np.nan, (df["mbi_total_score"] >= 1).astype(int) ) @@ -28,7 +27,7 @@ def calculate_mbi_score(df): def select_row(group): if group["diagnosis"].iloc[0] == "CON": - # For 'CON', the first row + # For 'CON', select the first row return group.head(1) else: # For 'ADD' and 'MCI', select the first row where 'mbi_status' has a value @@ -72,7 +71,7 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): # Rename date field so it matches mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) - # Replace some rougue dates + # Replace some rogue dates mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") From 299873e2b8d9ee7b126ed4d4b688d2c056acc6d6 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 6 May 2024 13:10:14 -0400 Subject: [PATCH 16/21] final pheno script --- code/create_final_pheno.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/code/create_final_pheno.py b/code/create_final_pheno.py index f74a482..1e2a4b2 100644 --- a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -1,6 +1,5 @@ import pandas as pd import numpy as np -import argparse import util from pathlib import Path @@ -86,12 +85,12 @@ def process_oasis3_mbi(qc_pheno_df): def assign_mbi_group(row): - if row["diagnosis"] in ["ADD", "ADD(M)"]: + if row["diagnosis"] in ["ADD"]: if row["mbi_status"] == 1: return "ADD+" elif row["mbi_status"] == 0: return "ADD-" - elif row["diagnosis"] in ["MCI", "EMCI", "LMCI"]: + elif 
row["diagnosis"] in ["MCI"]: if row["mbi_status"] == 1: return "MCI+" elif row["mbi_status"] == 0: @@ -119,11 +118,17 @@ def create_final_ad_df(qc_pheno_df): ignore_index=True, ) - # Assign group based on diagnosis and MBI status + # Re-code diagnoses and assign groups. For this study we are not looking at MCI/ADD subtypes + ad_datasets_df["diagnosis"] = ad_datasets_df["diagnosis"].replace( + {"ADD(M)": "ADD", "EMCI": "MCI", "LMCI": "MCI"} + ) ad_datasets_df["group"] = ad_datasets_df.apply(assign_mbi_group, axis=1) - # Save Alzheimer datasets with MBI data for further analysis - ad_datasets_df.to_csv(root_p / "outputs/ad_datasets_df.tsv", sep="\t", index=False) + # Save Alzheimer datasets with MBI data + out_p = root_p / "outputs/ad_datasets_mbi_df.tsv" + ad_datasets_df.to_csv(out_p, sep="\t", index=False) + + print(f"Saved ad_datasets_df to {out_p}") # Drop MBI columns, not needed for further analysis ad_datasets_df = ad_datasets_df.drop( @@ -148,7 +153,7 @@ def create_final_sz_df(qc_pheno_df): qc_pheno_df["dataset"].isin(["hcpep", "cobre", "srpbs", "ds000030"]) ].copy() - # Re-code diagnosis and assign group + # Re-code diagnosis and assign groups sz_datasets_df["diagnosis"] = sz_datasets_df["diagnosis"].replace("PSYC", "SCHZ") sz_datasets_df["group"] = sz_datasets_df.apply(assign_sz_group, axis=1) @@ -156,13 +161,7 @@ def create_final_sz_df(qc_pheno_df): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert NPI to MBI, merge with QC and pheno data for ADNI, CIMA-Q and OASIS3" - ) - parser.add_argument("rootpath", type=Path, help="Root path") - - args = parser.parse_args() - root_p = args.rootpath + root_p = Path("/home/neuromod/wrangling-phenotype") # Load passed_qc_master once qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" @@ -178,6 +177,8 @@ def create_final_sz_df(qc_pheno_df): ignore_index=True, ) - # Save Alzheimer datasets - output_p = root_p / "outputs/final_qc_pheno.tsv" - final_qc_pheno_df.to_csv(output_p, sep="\t", 
index=False) + # Save output + out_p = root_p / "outputs/final_qc_pheno.tsv" + final_qc_pheno_df.to_csv(out_p, sep="\t", index=False) + + print(f"Saved final_qc_pheno_df to {out_p}") From 60221570c80f6ac433aac12488e2b3e0106ecbf6 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Thu, 9 May 2024 10:34:21 -0400 Subject: [PATCH 17/21] remove single sites --- code/create_final_pheno.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/code/create_final_pheno.py b/code/create_final_pheno.py index 1e2a4b2..db00aeb 100644 --- a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -160,6 +160,16 @@ def create_final_sz_df(qc_pheno_df): return sz_datasets_df +def remove_single_site(df, site_column="site"): + # Count the occurrences of each site + site_counts = df[site_column].value_counts() + # Identify sites with more than one participant + sites_to_keep = site_counts[site_counts > 1].index + # Filter the df + filtered_df = df[df[site_column].isin(sites_to_keep)].copy() + return filtered_df + + if __name__ == "__main__": root_p = Path("/home/neuromod/wrangling-phenotype") @@ -172,11 +182,14 @@ def create_final_sz_df(qc_pheno_df): sz_datasets_df = create_final_sz_df(qc_pheno_df) # Concatenate the two sets of datasets - final_qc_pheno_df = pd.concat( + concat_qc_pheno_df = pd.concat( [ad_datasets_df, sz_datasets_df], ignore_index=True, ) + # Remove sites with only a single participant (cannot harmonize) + final_qc_pheno_df = remove_single_site(concat_qc_pheno_df, site_column="site") + # Save output out_p = root_p / "outputs/final_qc_pheno.tsv" final_qc_pheno_df.to_csv(out_p, sep="\t", index=False) From a8be365134b19a7a60be90843b0f318bbbb6b59e Mon Sep 17 00:00:00 2001 From: clarkenj Date: Thu, 9 May 2024 10:58:48 -0400 Subject: [PATCH 18/21] fixed typo --- code/create_final_pheno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/create_final_pheno.py b/code/create_final_pheno.py index db00aeb..ddd4d99 100644 --- 
a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -96,7 +96,7 @@ def assign_mbi_group(row): elif row["mbi_status"] == 0: return "MCI-" elif row["diagnosis"] == "CON": - return "CON-AD" + return "CON-ADD" def assign_sz_group(row): From 2a38b48c2c322f314172ff7a0f731a3d4c365c62 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Thu, 9 May 2024 11:29:05 -0400 Subject: [PATCH 19/21] removed site function --- code/create_final_pheno.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/code/create_final_pheno.py b/code/create_final_pheno.py index ddd4d99..31ec89d 100644 --- a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -160,16 +160,6 @@ def create_final_sz_df(qc_pheno_df): return sz_datasets_df -def remove_single_site(df, site_column="site"): - # Count the occurrences of each site - site_counts = df[site_column].value_counts() - # Identify sites with more than one participant - sites_to_keep = site_counts[site_counts > 1].index - # Filter the df - filtered_df = df[df[site_column].isin(sites_to_keep)].copy() - return filtered_df - - if __name__ == "__main__": root_p = Path("/home/neuromod/wrangling-phenotype") @@ -187,11 +177,8 @@ def remove_single_site(df, site_column="site"): ignore_index=True, ) - # Remove sites with only a single participant (cannot harmonize) - final_qc_pheno_df = remove_single_site(concat_qc_pheno_df, site_column="site") - # Save output out_p = root_p / "outputs/final_qc_pheno.tsv" - final_qc_pheno_df.to_csv(out_p, sep="\t", index=False) + concat_qc_pheno_df.to_csv(out_p, sep="\t", index=False) print(f"Saved final_qc_pheno_df to {out_p}") From 1a8f03cffff697932b74ce2a90267ae1d454a56d Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 16 Jul 2024 16:34:00 -0400 Subject: [PATCH 20/21] start adding compassnd --- code/create_final_pheno.py | 35 +++++++++++++++------ code/util.py | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 9 deletions(-) diff --git 
a/code/create_final_pheno.py b/code/create_final_pheno.py index 31ec89d..49a29b6 100644 --- a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -84,6 +84,19 @@ def process_oasis3_mbi(qc_pheno_df): return final_oasis3 +def process_compassnd_mbi(qc_pheno_df): + # Set path and load data + npi_p = root_p / "data/compassnd/data-2024-07-10T22_03_30.029Z.csv" + npi_df = pd.read_csv(npi_p) + + # Filter for dataset + qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "compassnd"] + + mbi_df = util.compassnd_npi_to_mbi(npi_df) + + return mbi_df + + def assign_mbi_group(row): if row["diagnosis"] in ["ADD"]: if row["mbi_status"] == 1: @@ -167,18 +180,22 @@ def create_final_sz_df(qc_pheno_df): qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) + compass_mbi = process_compassnd_mbi(qc_pheno_df) + # Create dfs for different diagnosis datasets - ad_datasets_df = create_final_ad_df(qc_pheno_df) - sz_datasets_df = create_final_sz_df(qc_pheno_df) + # ad_datasets_df = create_final_ad_df(qc_pheno_df) + # sz_datasets_df = create_final_sz_df(qc_pheno_df) # Concatenate the two sets of datasets - concat_qc_pheno_df = pd.concat( - [ad_datasets_df, sz_datasets_df], - ignore_index=True, - ) + # concat_qc_pheno_df = pd.concat( + # [ad_datasets_df, sz_datasets_df], + # ignore_index=True, + # ) # Save output - out_p = root_p / "outputs/final_qc_pheno.tsv" - concat_qc_pheno_df.to_csv(out_p, sep="\t", index=False) + out_p = root_p / "outputs/test.tsv" + compass_mbi.to_csv(out_p, sep="\t", index=False) + # out_p = root_p / "outputs/final_qc_pheno.tsv" + # concat_qc_pheno_df.to_csv(out_p, sep="\t", index=False) - print(f"Saved final_qc_pheno_df to {out_p}") + # print(f"Saved final_qc_pheno_df to {out_p}") diff --git a/code/util.py b/code/util.py index f8327f4..ccacee2 100644 --- a/code/util.py +++ b/code/util.py @@ -255,6 +255,68 @@ def oasis3_merge_mbi_qc(qc_pheno_df, mbi_df): return merged_df +def 
compassnd_npi_to_mbi(df): + + cols = [ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,008_delusion_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,011_hallucinations_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,014_agitation_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,017_depression_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,020_anxiety_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,023_elation_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,026_apathy_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,029_disinhibition_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,032_irritability_yn", + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,035_disturbance_yn", + ] + + df.replace( + ["not_answered", "refused_to_answer", "dont_know"], + [None, None, None], + inplace=True, + ) + df.dropna(subset=cols, inplace=True) + df[cols] = df[cols].replace({"yes": 1, "no": 0}) + + df["decreased_motivation"] = df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,026_apathy_yn" + ] + df["emotional_dysregulation"] = ( + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,017_depression_yn" + ] + + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,020_anxiety_yn" + ] + + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,023_elation_yn" + ] + ) + df["impulse_dyscontrol"] = ( + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,014_agitation_yn" + ] + + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,032_irritability_yn" + ] + + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,035_disturbance_yn" + ] + ) + df["social_inappropriateness"] = df[ + "Clinical_Assessment 
PI_Neuropsychiatric_Inventory_Questionnaire,029_disinhibition_yn" + ] + df["abnormal_perception"] = ( + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,008_delusion_yn" + ] + + df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,011_hallucinations_yn" + ] + ) + return df + + def first_session_controls(merged_df): # Filter for controls controls_df = merged_df[merged_df["diagnosis"] == "CON"] From b632627763cb1aed2c324a580e31f91520203fc2 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 17 Jul 2024 11:13:20 -0400 Subject: [PATCH 21/21] added compassnd data --- code/create_final_pheno.py | 60 +++++++++++++++++++++++++--------- code/util.py | 66 +++++++++++++++++++++++++++++++------- 2 files changed, 99 insertions(+), 27 deletions(-) diff --git a/code/create_final_pheno.py b/code/create_final_pheno.py index 49a29b6..32450cc 100644 --- a/code/create_final_pheno.py +++ b/code/create_final_pheno.py @@ -18,7 +18,7 @@ def process_adni_mbi(qc_pheno_df): mbi_df = util.calculate_mbi_score(mbi_df) mbi_df = util.adni_select_columns(mbi_df) - # Merge mbi data with qc_pheno data + # Merge mbi data with qc_pheno data. Note that for controls with an mbi not in window, they are left blank which is fine merged_df = util.adni_merge_mbi_qc(qc_pheno_df, mbi_df) # Select scans. For controls, we take the first available scan. 
For MCI and ADD, take the first with an MBI score @@ -93,8 +93,39 @@ def process_compassnd_mbi(qc_pheno_df): qc_pheno_df = qc_pheno_df[qc_pheno_df["dataset"] == "compassnd"] mbi_df = util.compassnd_npi_to_mbi(npi_df) + mbi_df = util.calculate_mbi_score(mbi_df) + mbi_df = util.compassnd_select_columns(mbi_df) + + # Convert age in months to years + mbi_df["age"] = ( + ( + mbi_df[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,001_Candidate_Age" + ] + / 12 + ) + .astype(float) + .round(2) + ) - return mbi_df + mbi_df.drop( + columns=[ + "Clinical_Assessment PI_Neuropsychiatric_Inventory_Questionnaire,001_Candidate_Age" + ], + inplace=True, + ) + + # Merge mbi data with qc_pheno data + merged_df = util.compassnd_merge_mbi_qc(qc_pheno_df, mbi_df) + + # Select scans. Find the first available session for each participant. For MCI and ADD they must have an mbi score, for controls it does not matter + # This approach retains multiple runs from the same session + control_df = util.first_session_controls(merged_df) + mci_add_df = util.first_session_mci_add(merged_df) + + final_compassnd = pd.concat([control_df, mci_add_df], ignore_index=True) + + return final_compassnd def assign_mbi_group(row): @@ -124,10 +155,11 @@ def create_final_ad_df(qc_pheno_df): final_adni = process_adni_mbi(qc_pheno_df) final_cimaq = process_cimaq_mbi(qc_pheno_df) final_oasis3 = process_oasis3_mbi(qc_pheno_df) + final_compassnd = process_compassnd_mbi(qc_pheno_df) # Concatenate these Alzheimer datasets ad_datasets_df = pd.concat( - [final_adni, final_cimaq, final_oasis3], + [final_adni, final_cimaq, final_oasis3, final_compassnd], ignore_index=True, ) @@ -180,22 +212,18 @@ def create_final_sz_df(qc_pheno_df): qc_pheno_p = root_p / "outputs/passed_qc_master.tsv" qc_pheno_df = pd.read_csv(qc_pheno_p, sep="\t", low_memory=False) - compass_mbi = process_compassnd_mbi(qc_pheno_df) - # Create dfs for different diagnosis datasets - # ad_datasets_df = create_final_ad_df(qc_pheno_df) - # 
sz_datasets_df = create_final_sz_df(qc_pheno_df) + ad_datasets_df = create_final_ad_df(qc_pheno_df) + sz_datasets_df = create_final_sz_df(qc_pheno_df) # Concatenate the two sets of datasets - # concat_qc_pheno_df = pd.concat( - # [ad_datasets_df, sz_datasets_df], - # ignore_index=True, - # ) + concat_qc_pheno_df = pd.concat( + [ad_datasets_df, sz_datasets_df], + ignore_index=True, + ) # Save output - out_p = root_p / "outputs/test.tsv" - compass_mbi.to_csv(out_p, sep="\t", index=False) - # out_p = root_p / "outputs/final_qc_pheno.tsv" - # concat_qc_pheno_df.to_csv(out_p, sep="\t", index=False) + out_p = root_p / "outputs/final_qc_pheno.tsv" + concat_qc_pheno_df.to_csv(out_p, sep="\t", index=False) - # print(f"Saved final_qc_pheno_df to {out_p}") + print(f"Saved final_qc_pheno_df to {out_p}") diff --git a/code/util.py b/code/util.py index ccacee2..c03eecf 100644 --- a/code/util.py +++ b/code/util.py @@ -68,34 +68,33 @@ def adni_merge_mbi_qc(qc_pheno_df, mbi_df): qc_pheno_df["participant_id"].str.split("S").str[-1].astype(int) ) - # Rename date field so it matches - mbi_df.rename(columns={"EXAMDATE": "ses"}, inplace=True) - # Replace some rogue dates - mbi_df["ses"] = mbi_df["ses"].replace("0012-02-14", "2012-02-14") - mbi_df["ses"] = mbi_df["ses"].replace("0013-05-06", "2013-05-06") - mbi_df["ses"] = mbi_df["ses"].replace("0013-10-28", "2013-10-28") + mbi_df["EXAMDATE"] = mbi_df["EXAMDATE"].replace("0012-02-14", "2012-02-14") + mbi_df["EXAMDATE"] = mbi_df["EXAMDATE"].replace("0013-05-06", "2013-05-06") + mbi_df["EXAMDATE"] = mbi_df["EXAMDATE"].replace("0013-10-28", "2013-10-28") # Convert sessions to datetime qc_pheno_df["ses"] = pd.to_datetime(qc_pheno_df["ses"]) - mbi_df["ses"] = pd.to_datetime(mbi_df["ses"]) + mbi_df["EXAMDATE"] = pd.to_datetime(mbi_df["EXAMDATE"]) # Ensure ordered by session qc_pheno_df = qc_pheno_df.sort_values(by=["ses"]) - mbi_df = mbi_df.dropna(subset=["ses"]) # Since some were missing - mbi_df = mbi_df.sort_values(by=["ses"]) + mbi_df = 
# TODO: pass left_on/right_on and left_by/right_by to merge_asof instead of renaming columns first