diff --git a/.gitignore b/.gitignore index 0d05551..918137f 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ venv/ ENV/ env.bak/ venv.bak/ -cwas_neuro_env/ +ad_sz_env/ # Spyder project settings .spyderproject diff --git a/scripts/baseline_biomarkers.py b/scripts/baseline_biomarkers.py deleted file mode 100644 index e7ea469..0000000 --- a/scripts/baseline_biomarkers.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import numpy as np - -from functools import reduce -from pathlib import Path - -# paths to files - -adni_data = Path("__file__").resolve().parent / 'data' / 'adni_spreadsheet.csv' -tau_data = Path("__file__").resolve().parent / 'data' / 'UCBERKELEYAV1451_04_26_22.csv' -other_biomarker_data = Path("__file__").resolve().parent / 'data' / 'ADNIMERGE.csv' - -# functions - -def get_baselines(file): - - # load baseline phenotypic data - pheno = pd.read_csv(file, index_col=0, header=0) - - # keep only the variables of interest - pheno = pheno.filter(['Subject ID','Phase','Sex','Research Group', 'Visit','Study Date','Age'], axis=1) - - # convert 'study date' to 'session' in datetime format, to match other spreadsheets - pheno['session'] = pd.to_datetime(pheno['Study Date']) - - # pull out only the subject id and asign it to the index - pheno_subj = [] - for i in pheno['Subject ID']: - subj = i.split('_')[2].lstrip("0") # remove leading zeros since it won't match ADNI IDs - pheno_subj.append(subj) - - pheno.index = pheno_subj - pheno.rename_axis('RID',inplace=True) - pheno.index = pheno.index.astype('int64') - - # separate patients and controls, because (in theory) we can use any control visit as baseline, but - # for patients we want their actual baseline data - patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC'] - patient_df = pheno[pheno['Research Group'].isin(patient_diagnoses)] # df of patient diagnoses - - control_df = pheno.loc[pheno['Research Group'] == 'CN'] # df of control diagnoses - - # I think these visits are acceptable as baseline data, i.e. 
actual baseline +/-3 months, excluding - # any initial visits where patient continued from a previous phase - bl_visits = ['ADNI Screening','ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt', - 'ADNIGO Month 3 MRI','ADNIGO Screening MRI'] - - patient_df_bl = patient_df[patient_df['Visit'].isin(bl_visits)] - - # rejoin the patients to the controls - new_df = pd.concat([control_df,patient_df_bl]) - - # select the earliest visit available for each participant - new_df.sort_values(['Subject ID', 'Age'], inplace=True) # sort by age - baseline_df = new_df[~new_df.duplicated(['Subject ID'], keep='first')] # select the first row - - # sort df by index - baseline_df.sort_values(by='RID', inplace=True) - - # calculate window for acceptable biomarker data, currently +- 12months - baseline_df.loc[:,('date_lower')] = baseline_df.loc[:,('session')] - pd.DateOffset(months=18) - baseline_df.loc[:,('date_upper')] = baseline_df.loc[:,('session')] + pd.DateOffset(months=18) - - return (baseline_df) - -def load_biomarker_df(biomarker_data): - - # load data - biomarker_df = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False) - - # convert examdate to datetime - biomarker_df['EXAMDATE'] = pd.to_datetime(biomarker_df['EXAMDATE']) - - # sort df by index and date - biomarker_df.sort_values(by=['RID', 'EXAMDATE'],inplace=True) - - # create column from index (useful for later functions) - biomarker_df['RID'] = biomarker_df.index - - return (biomarker_df) - -def match_baselines_biomarker(biomarker_df, baseline_df, biomarker): - - df = pd.DataFrame(columns=['RID',biomarker,biomarker+'_EXAMDATE']) #create df - common_ids = biomarker_df.index.intersection(baseline_df.index) #find ids common to the biomarker and baseline dfs - biomarker_df = biomarker_df.set_index('EXAMDATE') #reindex, needed to use 'nearest' method - - for rid in common_ids: - participant_df = biomarker_df[(biomarker_df['RID'] == rid)] #create df of all participants results - participant_df = participant_df.dropna(subset=[biomarker]) #drop NaNs, since we only want nearest date with result available - - baseline = baseline_df.loc[rid] #create df of participants baseline data - session = baseline['session'] #participant's baseline date - - if participant_df.empty: - pass - else: - idx_nearest = participant_df.index.get_loc(session, method='nearest') #find the index of the closest test date to session - nearest_date = participant_df.index[idx_nearest] #which date is it - nearest_result = participant_df[biomarker][idx_nearest] #find the biomarker result associated with closest date - - df.loc[len(df)] = [rid,nearest_result,nearest_date] #add to df - - df = df.set_index('RID') - - return (df) - -def check_visit_window(biomarker_df, baseline_df, biomarker): - - ''' - Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline - window, and if not replace with NaN. Drop unwanted columns and return biomarker info again, ready to merge. 
- ''' - - # create new df, merging biomarker data into baseline df - baseline_bio = baseline_df.join(biomarker_df) - - # create mask of date range, between lower and upper acceptable dates - mask_date_range = (baseline_bio[biomarker+'_EXAMDATE'] > baseline_bio['date_lower']) & (baseline_bio[biomarker+'_EXAMDATE'] < baseline_bio['date_upper']) - - # fill values collected outside date range with NaN - baseline_bio[biomarker][~mask_date_range] = np.nan - - cols_to_drop = ['Subject ID', - 'Phase', - 'Sex', - 'Research Group', - 'Visit', - 'Study Date', - 'Age', - 'session', - 'date_lower', - 'date_upper'] - - baseline_bio = baseline_bio.drop(cols_to_drop, axis=1) #drop unwanted columns - - return (baseline_bio) - -def get_biomarker(biomarker_df, baseline_df, biomarker): - - #runs two functions to match the baseline to closest biomarker result, and then checks if it falls in the window - - find_nearest_biomarker = match_baselines_biomarker(biomarker_df, baseline_df, biomarker) - window_checked = check_visit_window(find_nearest_biomarker, baseline_df, biomarker) - - return (window_checked) - -if __name__ == '__main__': - - #get baselines from adni spreadsheet - baseline_df = get_baselines(adni_data) - - # load biomarker files - tau_df = load_biomarker_df(tau_data) - other_biomarkers_df = load_biomarker_df(other_biomarker_data) - - # create one df per biomarker with closest result within a one year window of participant baseline - tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR') - abeta = get_biomarker(other_biomarkers_df, baseline_df, 'ABETA') - ptau = get_biomarker(other_biomarkers_df, baseline_df, 'PTAU') - av45 = get_biomarker(other_biomarkers_df, baseline_df, 'AV45') - fbb = get_biomarker(other_biomarkers_df, baseline_df, 'FBB') - - # create list of dataframes (baseline data and all individual biomarkers) - data_frames = [baseline_df, tau, abeta, ptau, av45, fbb] - - # merge dataframes - master_biomarkers = reduce(lambda left, right: - pd.merge_asof(left, right, left_index=True, right_index=True), - data_frames) - - # save baseline and matched biomarker data in data directory - master_biomarkers.to_csv(Path("__file__").resolve().parent / 'data' / 'master_biomarkers.csv') diff --git a/scripts/biomarker_thresholding.ipynb b/scripts/biomarker_thresholding.ipynb deleted file mode 100644 index dedac85..0000000 --- a/scripts/biomarker_thresholding.ipynb +++ /dev/null @@ -1,294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "id": "d213377a", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9a4034fa", - "metadata": {}, - "outputs": [], - "source": [ - "# set thresholds and calculate 5% margin\n", - "\n", - "# CSF biomarkers\n", - "ab42_threshold = 980\n", - "ab42_margin = (5*ab42_threshold)/100\n", - "ab42_upper = ab42_threshold+ab42_margin\n", - "ab42_lower = ab42_threshold-ab42_margin\n", - "\n", - "ptau_threshold = 23\n", - "ptau_margin = (5*ptau_threshold)/100\n", - "ptau_upper = ptau_threshold+ptau_margin\n", - "ptau_lower = ptau_threshold-ptau_margin\n", - "\n", - "# PET biomarkers\n", - "fbb_threshold = 1.08\n", - "fbb_margin = (5*fbb_threshold)/100\n", - "fbb_upper = fbb_threshold+fbb_margin\n", - "fbb_lower = fbb_threshold-fbb_margin\n", - "\n", - "av45_threshold = 1.08\n", - "av45_margin = (5*av45_threshold)/100\n", - "av45_upper = av45_threshold+av45_margin\n", - "av45_lower = 
av45_threshold-av45_margin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a913b27a", - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_tau_cutoff(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for all participants. Select controls only. Find those negative for amyloid\n", - " PET (either av45 or FBB). For those who hav flortaucipir results, calculate average. Calculate +2SD, this is\n", - " the flortaucipir cutoff.\n", - " \n", - " '''\n", - " \n", - " controls = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " controls_Apet_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower)])\n", - " \n", - " tau_mean = controls_Apet_neg['META_ROI'].mean()\n", - " tau_sd = controls_Apet_neg['META_ROI'].std()\n", - " tau_cutoff = tau_mean+(tau_sd*2)\n", - " \n", - " return (tau_cutoff) " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "bfa2dd1c", - "metadata": {}, - "outputs": [], - "source": [ - "# paths to files\n", - "biomarker_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'master_biomarkers.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "837cb329", - "metadata": {}, - "outputs": [], - "source": [ - "# load baseline data\n", - "df = pd.read_csv(biomarker_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "c75a9dee", - "metadata": {}, - "outputs": [], - "source": [ - "# replace some biomarker data which has been coded as > or <\n", - "df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})\n", - "df = df.replace({'PTAU': {'<8': 7, '>120':121}})\n", - "\n", - "df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "b2d48836", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate study specific cutoff for tau PET meta temporal ROI \n", - "tau_threshold = calculate_tau_cutoff(df)\n", - "tau_margin = (5*tau_threshold)/100\n", - "tau_upper = tau_threshold+tau_margin\n", - "tau_lower = tau_threshold-tau_margin" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "60d0393b", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_negative_controls(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as controls, and return a df excluding those\n", - " who have a positive result on any one of five markers (or no biomarker information).\n", - " \n", - " To get more info on which biomarkers are positive, uncomment last section. 
But this is only to give an \n", - " idea, since some may be positive on more than one marker.\n", - " '''\n", - " \n", - " controls_all = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower) | \n", - " (controls['ABETA'] > ab42_upper) | \n", - " (controls['PTAU'] < ptau_lower) | \n", - " (controls['META_ROI'] < tau_lower)])\n", - " \n", - " n_controls = len(controls_all)\n", - " no_markers = n_controls-len(controls)\n", - " n_controls_negative = len(controls_biomarker_neg)\n", - " n_dropped = (n_controls-no_markers) - n_controls_negative\n", - " \n", - " print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(\n", - " n_controls,\n", - " no_markers, \n", - " n_dropped,\n", - " n_controls_negative))\n", - " \n", - " '''\n", - " controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)\n", - " print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))\n", - " print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))\n", - " print ('{} positive/borderline for PET tau (meta roi).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_ROI'] > tau_lower)])))\n", - " '''\n", - " return (controls_biomarker_neg)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "b3a95289", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_positive_patients(df, diagnoses):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as a patient (specifying which), \n", - " and return a df including only those who have a positive result on any one of five markers. \n", - " Exclude any with no biomarker information.\n", - " \n", - " To get more info on which biomarkers are negative, uncomment last section. But this is only to give an \n", - " idea, since some may be negative on more than one marker.\n", - " '''\n", - " \n", - " patients_all = df.loc[df['Research Group'].isin(diagnoses)]\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | \n", - " (patients['AV45'] > av45_upper) | \n", - " (patients['ABETA'] < ab42_lower) | \n", - " (patients['PTAU'] > ptau_upper) |\n", - " (patients['META_ROI'] > tau_upper)])\n", - "\n", - " n_patients = len(patients_all)\n", - " no_markers = n_patients-len(patients)\n", - " n_patients_positive = len(patients_biomarker_pos)\n", - " n_dropped = (n_patients-no_markers) - n_patients_positive\n", - " \n", - " print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. 
{} remaining.'.format(\n", - " n_patients,\n", - " diagnoses,\n", - " no_markers,\n", - " n_dropped,\n", - " n_patients_positive))\n", - " \n", - " '''\n", - " patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)\n", - " \n", - " print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))\n", - " print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))\n", - " print ('{} negative/borderline for PET tau (meta roi).'.format(len(patients_neg[(patients_neg['META_ROI'] < tau_upper)])))\n", - " '''\n", - " return (patients_biomarker_pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "0d123b9f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Of 362 controls, 44 dropped due to no biomarker data and 60 due to positive/borderline biomarkers. 258 remaining.\n", - "Of 58 ['AD'], 7 dropped due to no biomarker data and 3 due to negative/borderline biomarkers. 48 remaining.\n", - "Of 165 ['EMCI', 'LMCI', 'MCI'], 25 dropped due to no biomarker data and 54 due to negative/borderline biomarkers. 86 remaining.\n" - ] - } - ], - "source": [ - "# run functions to get negative controls and positive patients\n", - "controls_bio_neg = biomarker_negative_controls(df)\n", - "ad_bio_positive = biomarker_positive_patients(df, ['AD'])\n", - "mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI','MCI'])" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "979848a1", - "metadata": {}, - "outputs": [], - "source": [ - "master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "cf508b49", - "metadata": {}, - "outputs": [], - "source": [ - "master_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/calculate_meta_roi.ipynb b/scripts/calculate_meta_roi.ipynb deleted file mode 100644 index b3cb980..0000000 --- a/scripts/calculate_meta_roi.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3881d9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3fdee605", - "metadata": {}, - "outputs": [], - "source": [ - "tau_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7b6b5db3", - "metadata": {}, - "outputs": [], - "source": [ - "# load tau PET data\n", - "df = pd.read_csv(tau_data, index_col=0, header=0)" - ] - }, - { - "cell_type": 
"code", - "execution_count": 10, - "id": "83a462f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_PUTAMEN_VOLUME RIGHT_THALAMUS_PROPER_SUVR \\\n", - "RID ... \n", - "21 ... 4169 1.5224 \n", - "31 ... 5766 1.1421 \n", - "31 ... 5537 1.0801 \n", - "56 ... 4026 1.3396 \n", - "56 ... 4145 1.3618 \n", - "\n", - " RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID \n", - "21 5874 1.3860 \n", - "31 6016 1.2340 \n", - "31 5865 1.1644 \n", - "56 6601 1.3625 \n", - "56 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME update_stamp \n", - "RID \n", - "21 1.0615 2460 2022-04-27 15:31:20.0 \n", - "31 0.7641 26845 2022-04-27 15:31:20.0 \n", - "31 0.7805 28306 2022-04-27 15:31:20.0 \n", - "56 1.0556 1385 2022-04-27 15:31:20.0 \n", - "56 1.0850 1427 2022-04-27 15:31:20.0 \n", - "\n", - "[5 rows x 242 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "630a3d56", - "metadata": {}, - "outputs": [], - "source": [ - "result_cols = ['CTX_LH_ENTORHINAL_SUVR',\n", - "'CTX_RH_ENTORHINAL_SUVR',\n", - "'LEFT_AMYGDALA_SUVR',\n", - "'RIGHT_AMYGDALA_SUVR',\n", - "'CTX_LH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_RH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_LH_FUSIFORM_SUVR',\n", - "'CTX_RH_FUSIFORM_SUVR',\n", - "'CTX_LH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_RH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_LH_MIDDLETEMPORAL_SUVR',\n", - "'CTX_RH_MIDDLETEMPORAL_SUVR']" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "79cb3167", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID ... \n", - "21 ... 5874 1.3860 \n", - "31 ... 6016 1.2340 \n", - "31 ... 5865 1.1644 \n", - "56 ... 6601 1.3625 \n", - "56 ... 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME \\\n", - "RID \n", - "21 1.0615 2460 \n", - "31 0.7641 26845 \n", - "31 0.7805 28306 \n", - "56 1.0556 1385 \n", - "56 1.0850 1427 \n", - "\n", - " update_stamp meta_roi META_ROI \n", - "RID \n", - "21 2022-04-27 15:31:20.0 1.234958 1.234958 \n", - "31 2022-04-27 15:31:20.0 1.145025 1.145025 \n", - "31 2022-04-27 15:31:20.0 1.137825 1.137825 \n", - "56 2022-04-27 15:31:20.0 1.235883 1.235883 \n", - "56 2022-04-27 15:31:20.0 1.208933 1.208933 \n", - "\n", - "[5 rows x 244 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['META_ROI'] = df[result_cols].mean(axis=1)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "5aec7b55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " EXAMDATE META_ROI\n", - "RID \n", - "21 2018-02-02 1.234958\n", - "31 2018-04-24 1.145025\n", - "31 2019-04-23 1.137825\n", - "56 2018-02-20 1.235883\n", - "56 2019-01-10 1.208933" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_df = df[['EXAMDATE','META_ROI']].copy()\n", - "results_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5e52c415", - "metadata": {}, - "outputs": [], - "source": [ - "results_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'tau_meta_roi.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "663dc7b4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/cnvfc/stats.py b/scripts/cnvfc/stats.py new file mode 100644 index 0000000..3a4f28e --- /dev/null +++ b/scripts/cnvfc/stats.py @@ -0,0 +1,348 @@ +import sys +import time +import warnings +import scipy as sp +import numpy as np +import patsy as pat +import pandas as pd +from scipy import stats +import statsmodels.api as sm +#from .tools import conn2mat +from sklearn import linear_model as sln +from sklearn import preprocessing as skp +from sklearn import model_selection as skm +from statsmodels.sandbox.stats.multicomp import multipletests as stm + + +def make_weights(data, pattern): + if not data.shape[1:] == pattern.shape: + raise Exception(f'data and pattern shape mismatch: {data.shape[1:]}, {pattern.shape}') + n_nodes = pattern.shape[0] + n_data = data.shape[0] + weights = np.array([[np.corrcoef(data[data_id, node_id, :], + pattern[node_id, :])[0, 1] + for node_id in range(n_nodes)] + for data_id in range(n_data)]) + return weights + + +def categorical_enrichment(pheno, data, group, case=None, control=None, node_labels='node_name_not_specificed'): + sub_mask, cases_mask = find_subset(pheno, group, [case, control]) + sub_data = data[sub_mask, ...] 
+ n_data = sub_data.shape[1] + n_case = np.sum(cases_mask[case]) + n_control = np.sum(cases_mask[control]) + + results = {key: list() for key in ['node', 'U', 'p_value', + f'median_{group}_{case}', + f'median_{group}_{control}', + 'rank_biserial_correlation']} + # Conduct Mann-Whitney-U for each node region + for node_id in range(n_data): + u_right, p = sp.stats.mannwhitneyu(sub_data[cases_mask[case], node_id], sub_data[cases_mask[control], node_id], + alternative='two-sided') + u_left = n_case * n_control - u_right + u_min = np.min([u_left, u_right]) + # Compute the median for the case and control groups + median_case = np.median(sub_data[cases_mask[case], node_id]) + median_control = np.median(sub_data[cases_mask[control], node_id]) + # Compute rank biserial correlation + r_b = 1 - (2 * u_min) / (n_case * n_control) + # Determine if cases > controls or reverse + case_gt_con = u_right > u_min + if not case_gt_con: + r_b = -r_b + # Store the results + results['node'].append(node_id + 1) + results['U'].append(u_min) + results['p_value'].append(p) + results[f'median_{group}_{case}'].append(median_case) + results[f'median_{group}_{control}'].append(median_control) + results['rank_biserial_correlation'].append(r_b) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def continuous_enrichment(pheno, data, covariate, node_labels='node_name_not_specificed'): + sub_mask = find_subset(pheno, covariate) + sub_pheno = pheno.loc[sub_mask] + sub_data = data[sub_mask, :] + n_data = sub_data.shape[1] + + results = {key: list() for key in ['node', 'pearson_r', 'p_value']} + for node_id in range(n_data): + r, p = stats.pearsonr(sub_pheno[covariate], sub_data[:, node_id]) + # Store the results + results['node'].append(node_id + 1) + results['pearson_r'].append(r) + results['p_value'].append(p) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def find_subset(pheno, column, cases=None): + # TODO check pandas type input + subset_mask = np.array(~pheno[column].isnull()) + if cases is not None and not not cases: + all_cases = pheno.loc[subset_mask][column].unique() + try: + case_available = np.array([True if case in all_cases else False for case in cases]) + except TypeError as e: + raise Exception(f'the attribute "cases" needs to be iterable but is: {type(cases)}') from e + if not all(case_available): + if not any(case_available): + raise Exception(f'none of the requested cases of "{column}" are available') + else: + warnings.warn( + f'\nnot all requested cases of "{column}" are available: {list(zip(cases, case_available))}', + RuntimeWarning) + case_masks = np.array([pheno[column] == case for case in cases]) + subset_mask = np.any(case_masks, 0) + # Return the masked instances of the requested cases + cases_dict = {case: case_masks[idx][subset_mask] for idx, case in enumerate(cases)} + return subset_mask, cases_dict + else: + return subset_mask + + +def standardize(data, mask): + scaler = skp.StandardScaler(with_mean=False, with_std=True) + scaler.fit(data[mask, :]) + standardized_data = scaler.transform(data) + return 
standardized_data + + +def find_contrast(design_matrix, contrast): + # Find the contrast column + contrast_columns = [(col_id, col) for col_id, col in enumerate(design_matrix.columns) if f'{contrast}' in col] + if not len(contrast_columns) == 1: + raise Exception(f'There is no single factor that matches {contrast}: {(list(design_matrix.columns))}') + return contrast_columns + + +def fast_glm(data, design_matrix, contrast): + # Does not compute p-values but operates in parallel + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + glm = sln.LinearRegression(fit_intercept=False, normalize=False, n_jobs=-2) + res = glm.fit(design_matrix, data) + betas = res.coef_[:, contrast_id] + return betas + + +def glm(data, design_matrix, contrast): + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + n_data = data.shape[1] + + # Conduct the GLM + betas = np.zeros(shape=n_data) + pvals = np.zeros(shape=n_data) + for conn_id in range(n_data): + model = sm.OLS(data[:, conn_id], design_matrix) + results = model.fit() + betas[conn_id] = results.params[contrast_id] + pvals[conn_id] = results.pvalues[contrast_id] + + return betas, pvals + + +def glm_wrap_cc(conn, pheno, group, case, control, regressors='', report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = np.sum(sub_mask) + n_case = np.sum(case_masks[case]) + n_control = np.sum(case_masks[control]) + n_data = sub_conn.shape[1] + if report: + print(f'Selected sample based on group variable {group}.\n' + f'cases: {case} (n={n_case})\n' + f'controls: {control} (n={n_control})\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + f'standardized estimators are based on {group}=={control}') + + stand_conn = standardize(sub_conn, case_masks[control]) + + # Construct design matrix + if type(control) == str: + contrast = f'C({group}, Treatment("{control}"))' + else: + contrast = f'C({group}, Treatment({control}))' + formula = ' + '.join((regressors, contrast)) + dmat = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, dmat, group) + stand_betas, _ = glm(stand_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask = find_subset(pheno, contrast) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = sub_pheno.shape[0] + n_data = sub_conn.shape[1] + sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool)) + + if report: + print(f'Selected sample based on contrast variable {contrast}.\n' + f'Found {n_sub} subjects with no missing data for {contrast}\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + 
f'standardized estimators are based on all subjects with no missing data for {contrast}') + + formula = ' + '.join((regressors, contrast)) + design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, design_matrix, contrast) + stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def permutation_glm_contrast(conn, pheno, n_a, n_a_1, n_b, n_b_1, n_iter=5000): + # conn: connectome (n_sub, n_conn) + # pheno: pheno pandas table, same order as conn + # n_a / n_b: number of subs per group a / b + # n_a_1 / n_b_1: number of subs in the first contrast of group a / b + # n_iter: number of permutations + + # Some sanity checks because people can be stupid + if n_a_1 > n_a: + raise ValueError(f'n_a_1 ({n_a_1}) cannot be larger than n_a ({n_a})') + if n_b_1 > n_b: + raise ValueError(f'n_b_1 ({n_b_1}) cannot be larger than n_b ({n_b})') + + shsp = skm.ShuffleSplit(n_splits=n_iter, test_size=0.5) + pheno.reset_index(inplace=True, drop=True) + pheno.loc[:, 'group'] = 0 + n_subjects = pheno.shape[0] + beta = np.zeros((conn.shape[1], 2, n_iter)) + + start = time.time() + for fold_id, (group_a, group_b) in enumerate(shsp.split(np.ones(n_subjects))): + pheno_a = pheno.loc[group_a[:n_a]].copy() + pheno_b = pheno.loc[group_b[:n_b]].copy() + # Assign the groups + pheno_a.loc[group_a[:n_a_1], 'group'] = 1 + pheno_b.loc[group_b[:n_b_1], 'group'] = 1 + mat1 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_a, return_type='dataframe') + mat2 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_b, return_type='dataframe') + beta[:, 0, fold_id] = fast_glm(conn[group_a[:n_a], :], mat1, 'group') + beta[:, 1, fold_id] = fast_glm(conn[group_b[:n_b], :], mat2, 'group') + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. 
Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return beta + + +def contrast_correlation(beta, mask, mode='region'): + # mode: 'region' or 'global' + if mode == 'region': + n_iter = beta.shape[2] + n_seed = mask.shape[0] + corr = np.zeros((n_seed, n_iter)) + for fold_id in range(n_iter): + # Map the betas back to a matrix + beta1_mat = conn2mat(beta[:, 0, fold_id], mask) + beta2_mat = conn2mat(beta[:, 1, fold_id], mask) + # Compute the correlation + corr[:, fold_id] = np.array([np.corrcoef(beta1_mat[nid, :], beta2_mat[nid, :])[0, 1] for nid in range(64)]) + return corr + elif mode == 'global': + n_iter = beta.shape[2] + corr = np.array([np.corrcoef(beta[:, 0, fold_id], beta[:, 1, fold_id])[0, 1] for fold_id in range(n_iter)]) + return corr + else: + raise ValueError(f'mode {mode} is not configured!') + + +def permutation_glm(pheno, conn, group, case, control, regressors='', n_iter=1000, stand=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_pheno = pheno.loc[sub_mask, :] + sub_pheno.loc[:, 'group'] = '' + sub_conn = conn[sub_mask, :] + # Standardize the input data + if stand: + sub_conn = standardize(sub_conn, case_masks[control]) + n_conn = sub_conn.shape[1] + + n1 = np.sum(case_masks[case]) + n2 = np.sum(case_masks[control]) + r2 = n2 / (n1 + n2) + + splitter = skm.ShuffleSplit(n_splits=n_iter, test_size=r2, random_state=1) + + i_pheno = sub_pheno.copy() + group_id = list(i_pheno.columns).index('group') + + betas = np.zeros(shape=(n_iter, n_conn)) + start = time.time() + for fold_id, (train_id, test_id) in enumerate(splitter.split(sub_conn)): + n1_ids = train_id[:n1] + n2_ids = test_id[:n2] + i_pheno = sub_pheno.copy() + i_pheno['group'] = '' + i_pheno.iloc[n1_ids, group_id] = 'case' + i_pheno.iloc[n2_ids, group_id] = 'control' + table = glm_wrap_cc(sub_conn, i_pheno, 'group', 'case', 'control', regressors=regressors, + report=False, fast=True) + betas[fold_id, :] = table['betas'].values + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s per permutation)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return betas diff --git a/scripts/h5_to_npy.py b/scripts/h5_to_npy.py new file mode 100644 index 0000000..805bb0b --- /dev/null +++ b/scripts/h5_to_npy.py @@ -0,0 +1,173 @@ +#import os dont think I need this? +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +def load_connectome(file, participant_id, session, identifier, no_session): + # Construct the dataset path + if no_session: + dataset_path = f"sub-{participant_id}/{identifier}_atlas-MIST_desc-64_connectome" + else: + dataset_path = f"sub-{participant_id}/ses-{session}/{identifier}_atlas-MIST_desc-64_connectome" + + dataset = file.get(dataset_path) + + connectome_npy = None # Initialise to None in case missing + if dataset is not None: + connectome_npy = dataset[...] 
# Load the dataset into a NumPy array + return connectome_npy +def save_connectome(connectome_npy, output_p, identifier): + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") +def process_connectome_participant(row, base_dir, output_p): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + # Construct path to subject's HDF5 file + hdf5_file_p = base_dir / f"{participant_id}" / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + if hdf5_file_p.exists(): + with h5py.File(hdf5_file_p, "r") as file: + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session=False) # load_connectome requires no_session; the datasets routed here all include sessions + save_connectome(connectome_npy, output_p, identifier) +def process_connectome_group(row, file, output_p, no_session): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session) + save_connectome(connectome_npy, output_p, identifier) +def process_cobre(conn_p, df, output_dir): + base_dir = conn_p / "cobre_connectome-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "cobre" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'cobre'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=False) # cobre has sessions, so pass no_session explicitly +def process_ds000030(conn_p, df, output_dir): + base_dir = conn_p / "ds000030_connectomes-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "ds000030" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'ds000030'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=True) +def process_hcpep(conn_p, df, output_dir): + base_dir = conn_p / "hcp-ep_connectome-0.4.1" + output_p = output_dir / "hcpep" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'hcpep'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) +def process_srpbs(conn_p, df, output_dir): + base_dir = conn_p / "srpbs_connectome-0.4.1" + output_p = output_dir / "srpbs" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'srpbs'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) +def process_adni(conn_p, final_file_p, output_dir): + base_dir = conn_p / "adni_connectomes-0.4.1" + final_file = final_file_p / "final_adni.tsv" + output_p = output_dir / "adni" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): 
process_connectome_participant(row, base_dir, output_p) + +def process_oasis3(conn_p, final_file_p, output_dir): + base_dir = conn_p / "oasis3_connectomes-0.4.1" + final_file = final_file_p / "final_oasis3.tsv" + output_p = output_dir / "oasis3" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_cimaq(conn_p, final_file_p, output_dir): + base_dir = conn_p / "cimaq_connectomes-0.4.1" + final_file = final_file_p / "final_cimaq.tsv" + output_p = output_dir / "cimaq" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + + +if __name__ == "__main__": + passed_qc_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files/passed_qc_master.tsv") + conn_p = Path("/home/nclarke/scratch") + output_dir = Path("/home/nclarke/scratch/npy_connectome") + + # These datasets needed some extra steps aftre checking QC, so get identifiers from a different tsv + final_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files") + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(passed_qc_file_p, sep="\t") + + #process_cobre(conn_p, df, output_dir) + process_ds000030(conn_p, df, output_dir) + #process_hcpep(conn_p, df, output_dir) + #process_srpbs(conn_p, df, output_dir) + #process_adni(conn_p, final_file_p, output_dir) + #process_cimaq(conn_p, final_file_p, output_dir) + #process_oasis3(conn_p, final_file_p, output_dir) + + + + + + + + diff --git a/scripts/prepare_combat_file.py b/scripts/prepare_combat_file.py new file mode 100644 index 0000000..5d09044 --- /dev/null +++ b/scripts/prepare_combat_file.py @@ -0,0 +1,61 @@ +from pathlib import Path +import numpy as np + +def load_connectomes(base_directory_p): + """ + Load connectome data from .npy files across multiple subdirectories. + + Parameters: + - base_directory_p: Path to the base directory containing subdirectories of connectome files. + + Returns: + - connectomes: A list of connectome matrices, one per participant across all subdirectories. + """ + connectomes = [] + # Iterate over each subdirectory in the base directory + for directory_p in base_directory_p.iterdir(): + if directory_p.is_dir(): + for file_p in directory_p.glob("*.npy"): + connectome = np.load(file_p) + connectomes.append(connectome) + return connectomes + +def connectome_to_edges_matrix(connectomes): + """ + Transform a list of connectomes into a matrix suitable for ComBat, + where rows are ROI pairs and columns are participants. 
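+
+    A minimal illustrative example (hypothetical 3x3 symmetric connectomes from two
+    participants), showing that rows follow upper-triangular order with the diagonal excluded:
+
+    >>> a = np.array([[1.0, 0.2, 0.3], [0.2, 1.0, 0.4], [0.3, 0.4, 1.0]])
+    >>> b = np.array([[1.0, 0.5, 0.6], [0.5, 1.0, 0.7], [0.6, 0.7, 1.0]])
+    >>> connectome_to_edges_matrix([a, b])
+    array([[0.2, 0.5],
+           [0.3, 0.6],
+           [0.4, 0.7]])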
+ """ + num_participants = len(connectomes) + num_rois = connectomes[0].shape[0] + num_edges = num_rois * (num_rois - 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + for i, connectome in enumerate(connectomes): + edge_index = 0 + for row in range(num_rois): + for col in range(row + 1, num_rois): + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix + +# Set paths +base_directory_p = Path("/home/neuromod/ad_sz/data/npy_connectome") +output_p = "/home/neuromod/ad_sz/data/edges_matrix.tsv" + +# Load connectomes from all subdirectories. +connectomes = load_connectomes(base_directory_p) + +print(f"Number of connectomes loaded: {len(connectomes)}") +for connectome in connectomes[:5]: # Print the shape of the first 5 connectomes as a check + print(connectome.shape) + +# Transform into the ComBat-ready matrix. +edges_matrix = connectome_to_edges_matrix(connectomes) + +print(f"Edges matrix shape: {edges_matrix.shape}") + +# Save the edges_matrix to a .tsv file +np.savetxt(output_p, edges_matrix, delimiter='\t') + +print(f"Saved edges matrix to {output_p}") diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py new file mode 100644 index 0000000..34f5f76 --- /dev/null +++ b/scripts/prepare_input_data.py @@ -0,0 +1,284 @@ +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +dataset_configs = { + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, + }, + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "ds000030": { + "connectome_path": "ds000030_connectomes-0.4.1", + "no_session": True, + "subject_folder": False, + }, + "oasis3": { + "connectome_path": "oasis3_connectomes-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "srpbs": { + "connectome_path": "srpbs_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "adni": { + "connectome_path": "adni_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, +} + + +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] + else: + print(f"Connectome for {identifier} not found") + return None + + +def save_npy_connectome(connectome_npy, output_p, identifier): + # Not using this function currently + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found") + + +def connectome_to_edge_matrix(connectome): + """ + Transform a single connectome into an edge matrix. + Extract the upper triangular part of the connectome, including the diagonal. 
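+
+    A small hypothetical example (one 3x3 connectome), showing the row-major upper-triangular
+    flattening order produced by np.triu_indices_from, diagonal kept:
+
+    >>> c = np.array([[1.0, 0.2, 0.3], [0.2, 1.0, 0.4], [0.3, 0.4, 1.0]])
+    >>> connectome_to_edge_matrix(c)
+    array([1. , 0.2, 0.3, 1. , 0.4, 1. ])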
+ """ + # Get the upper triangular indices including the diagonal + row_indices, col_indices = np.triu_indices_from(connectome) + + # Extract the values at these indices + edge_matrix = connectome[row_indices, col_indices] + + return edge_matrix + + +def process_connectomes(connectomes_by_participant): + """ + Apply Fisher z transform, compute the global signal and convert to an edge matrix. For multiple connectomes per participant, connectomes are averaged first. + """ + edges_matrix_dict = {} + global_signal_dict = {} + for participant_id, connectomes in connectomes_by_participant.items(): + # Compute the mean connectome + mean_connectome = np.mean(connectomes, axis=0) + # Apply Fisher Z transformation + transformed_connectome = np.arctanh(mean_connectome) + # Compute global signal (mean of the Z values) + global_signal = np.mean(transformed_connectome) + global_signal_dict[participant_id] = global_signal + # Convert connectome to a marix for ComBat + edges_matrix = connectome_to_edge_matrix(transformed_connectome) + edges_matrix_dict[participant_id] = edges_matrix + + return edges_matrix_dict, global_signal_dict + + +def matrix_dict_to_df(edges_matrix_dict): + # Create a df from the edge matrices + # Convert dictionary values to a 2D numpy array + edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) + participants = list(edges_matrix_dict.keys()) + edges_df = pd.DataFrame(edge_matrix_combined, columns=participants) + + # Transpose so suitable for ComBat + edges_df = edges_df.T + + return edges_df + + +def process_datasets(conn_p, df, dataset_name): + config = dataset_configs[dataset_name] + base_dir = conn_p / config["connectome_path"] + + # Filter the df for required scans for the dataset + filtered_df = df[df["dataset"] == dataset_name] + valid_rows = [] # To store rows where the connectome was found + + connectomes_by_participant = {} + for index, row in filtered_df.iterrows(): + participant_id = row["participant_id"] + identifier = row["identifier"] + session = None if config["no_session"] else row.get("ses", None) + + # Adjust file path based on whether dataset uses a subject folder + if config["subject_folder"]: + file_path = ( + base_dir + / participant_id + / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + ) + else: + file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + try: + with h5py.File(file_path, "r") as file: + connectome = load_connectome( + file, participant_id, session, identifier, config["no_session"] + ) + if connectome is not None: + # Append the connectome to the participant's list in the dictionary + if participant_id not in connectomes_by_participant: + connectomes_by_participant[participant_id] = [] + connectomes_by_participant[participant_id].append(connectome) + + # Add the row to valid_rows if the connectome is found + valid_rows.append(row) + + except FileNotFoundError: + print(f"File not found: {file_path}") + + edges_df = pd.DataFrame() + # Process connectomes + if connectomes_by_participant: + edges_matrix_dict, global_signal_dict = process_connectomes( + connectomes_by_participant + ) + edges_df = matrix_dict_to_df(edges_matrix_dict) + + # Convert the list of valid rows to a df + valid_df = pd.DataFrame(valid_rows) + + return edges_df, valid_df, global_signal_dict + + +def create_pheno_df(valid_df, global_signal_dict): + # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) + aggregation_functions = { + "sex": "first", + "age": "first", + 
"diagnosis": "first", + "site": "first", + "dataset": "first", + "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans + "group": "first", + } + pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) + # Add global signal to pheno_df + pheno_df["mean_conn"] = pheno_df.index.map(global_signal_dict) + return pheno_df + + +def create_covariates_df_old(valid_df): + # Group by participant_id and select the first entry for each group + first_entries = valid_df.groupby("participant_id").first() + + # Extract the relevant covariates + covariates_df = first_entries[["site", "sex", "age", "diagnosis"]] + + return covariates_df + + +def one_hot_encode_column(df, column_name): + dummies = pd.get_dummies(df[column_name], prefix=column_name) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def one_hot_encode_column_no_prefix(df, column_name): + dummies = pd.get_dummies(df[column_name]) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def create_covariates_df(pheno_df): + # Extract the relevant covariates from pheno_df + covariates_df = pheno_df[ + [ + "site", + "sex_male", + "sex_female", + "age", + "diagnosis_MCI", + "diagnosis_ADD", + "diagnosis_CON", + "diagnosis_SCHZ", + ] + ] # split CON into controls from AD and SCHZ datasets? + + return covariates_df + + +if __name__ == "__main__": + conn_p = Path("/home/nclarke/scratch") + output_p = Path("/home/nclarke/ad_sz/data/input_data") + df = pd.read_csv("/home/nclarke/ad_sz/data/final_qc_pheno.tsv", sep="\t") + + # Convert adni sessions to strings so can formulate paths correctly + mask = df["dataset"] == "adni" + df.loc[mask, "ses"] = pd.to_datetime(df.loc[mask, "ses"]).dt.strftime("%Y%m%d") + + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + all_edges = [] + all_pheno = [] + for dataset in dataset_configs: + edges_df, valid_df, global_signal_dict = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df, global_signal_dict) + + # Collect data per dataset + all_edges.append(edges_df) + all_pheno.append(pheno_df) + + # Concatenate data across datasets + combined_edges_df = pd.concat(all_edges) + combined_pheno_df = pd.concat(all_pheno) + + # One-hot encode columns + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "group") + + # Create covariates_df from pheno_df + covariates_df = create_covariates_df(combined_pheno_df) + + # Capitalize column names, necessary for combat etc + covariates_df.columns = [x.upper() for x in covariates_df.columns] + combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] + + print("Shape of combined_edges_df = ", combined_edges_df.shape) + print("Shape of covariates_df = ", covariates_df.shape) + print("Shape of combined_pheno_df = ", combined_pheno_df.shape) + + # Ensure order of data is identical and output + combined_edges_df.sort_index().to_csv( + output_p / "combat_edges.tsv", sep="\t", index=True + ) + covariates_df.sort_index().to_csv( + output_p / "combat_covariates.tsv", sep="\t", index=True + ) + combined_pheno_df.sort_index().to_csv(output_p / "pheno.tsv", sep="\t", index=True) + + print(f"Saved edges matrix, covariates and pheno to {output_p}") diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py new file mode 100644 index 
diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py
new file mode 100644
index 0000000..05a1e7e
--- /dev/null
+++ b/scripts/significance_corr.py
@@ -0,0 +1,265 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import util
+import time
+import os
+import itertools
+from statsmodels.stats.multitest import multipletests
+from scipy.stats import pearsonr
+from argparse import ArgumentParser
+
+
+# SEBASTIEN URCHS
+def p_permut(empirical_value, permutation_values):
+    n_permutation = len(permutation_values)
+    if empirical_value >= 0:
+        return (np.sum(permutation_values > empirical_value) + 1) / (n_permutation + 1)
+    return (np.sum(permutation_values < empirical_value) + 1) / (n_permutation + 1)
+
+
+def filter_fdr(df, contrasts):
+    df_filtered = df[
+        (df["pair0"].isin(contrasts)) & (df["pair1"].isin(contrasts))
+    ].copy()
+    _, fdr, _, _ = multipletests(df_filtered["pval"], method="fdr_bh")
+    df_filtered["fdr_filtered"] = fdr
+    return df_filtered
+
+
+def mat_form(df, contrasts, value="betamap_corr"):
+    n = len(contrasts)
+    d = dict(zip(contrasts, range(n)))
+    mat = np.zeros((n, n))
+    for c in contrasts:
+        # fill out vertical strip of mat
+        for i in range(n):
+            if i == d[c]:
+                val = 1
+            else:
+                val = df[
+                    ((df["pair0"] == c) | (df["pair1"] == c))
+                    & ((df["pair0"] == contrasts[i]) | (df["pair1"] == contrasts[i]))
+                ][value]
+            mat[i, d[c]] = val
+            mat[d[c], i] = val
+    return pd.DataFrame(mat, columns=contrasts, index=contrasts)
+
+
+def make_matrices(df, contrasts, fdr="fdr_filtered"):
+    """
+    Param fdr can be set to 'fdr_filtered' (FDR is performed using the p-values only from
+    the chosen contrasts) or 'fdr' (values taken from FDR performed on the full set of 42 contrasts).
+    """
+    if fdr == "fdr_filtered":
+        df = filter_fdr(df, contrasts)
+    mat_corr = mat_form(df, contrasts, value="betamap_corr")
+    mat_pval = mat_form(df, contrasts, value="pval")
+    mat_fdr = mat_form(df, contrasts, value=fdr)
+    return mat_corr, mat_pval, mat_fdr
+
+
+def get_corr_dist(cases, nulls, path_out, tag="wholeconn"):
+    # For each unique pair, compute the correlations between the null maps.
+    n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+    corr = np.zeros((n_pairs, 5000))
+
+    print(
+        "Getting correlation between 5000 null maps for {} unique pairs for {} cases...".format(
+            n_pairs, len(cases)
+        )
+    )
+    pair = []
+    l = 0
+    for i in itertools.combinations(cases, 2):
+        for j in range(5000):
+            corr[l, j] = pearsonr(
+                nulls.loc[i[0]].values[j, :], nulls.loc[i[1]].values[j, :]
+            )[0]
+
+        pair.append(i)
+        if l % 50 == 0:
+            print("{}/{}".format(l, n_pairs))
+        l = l + 1
+
+    df = pd.DataFrame(corr)
+    df["pair"] = pair
+    df.to_csv(os.path.join(path_out, "correlation_dist_{}.csv".format(tag)))
+    return df
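For clarity, a small worked example of the permutation p-value defined in p_permut above, using made-up numbers: the count is one-sided in the direction of the observed correlation, and the +1 in both numerator and denominator keeps the estimate away from zero.

import numpy as np

# Toy null distribution of 9 correlations plus one observed value (made-up numbers)
null_corrs = np.array([-0.30, -0.10, -0.05, 0.00, 0.05, 0.10, 0.15, 0.20, 0.40])
observed = 0.25

# One null value exceeds 0.25, so p = (1 + 1) / (9 + 1) = 0.2
p = p_permut(observed, null_corrs)
print(p)  # 0.2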
+
+
+def get_corr(cases, betas, path_out, tag="wholeconn"):
+    # For each unique pair, compute the correlation between betamaps. Use standardized betas here (as in the rest of the paper).
+    n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+    corr = np.zeros(n_pairs)
+
+    print(
+        "Getting correlation between betamaps for {} unique pairs for {} cases...".format(
+            n_pairs, len(cases)
+        )
+    )
+    pair = []
+    l = 0
+    for i in itertools.combinations(cases, 2):
+        corr[l] = pearsonr(betas.loc[i[0]].values, betas.loc[i[1]].values)[0]
+        l = l + 1
+        pair.append(i)
+    df = pd.DataFrame(corr)
+    df["pair"] = pair
+    df.to_csv(os.path.join(path_out, "correlation_betas_{}.csv".format(tag)))
+    return df
+
+
+def get_corr_pval(maps, nulls, betas, path_out, tag="wholeconn"):
+    df = get_corr_dist(maps, nulls, path_out, tag=tag)
+    df_bb = get_corr(maps, betas, path_out, tag=tag)
+
+    df_bb = df_bb.rename(columns={0: "betamap_corr"})
+    df_master = df_bb.merge(df, on="pair")
+
+    print("Calculating pvals...")
+    # CALCULATE PVALS
+    pval = []
+    for i in df_master.index:
+        p = p_permut(df_master.loc[i, "betamap_corr"], df_master[range(5000)].loc[i])
+        pval.append(p)
+    df_master["pval"] = pval
+
+    # ADD LABELS
+    pair0 = [p[0] for p in df["pair"].tolist()]
+    pair1 = [p[1] for p in df["pair"].tolist()]
+    df_master["pair0"] = pair0
+    df_master["pair1"] = pair1
+
+    df_compact = df_master[["pair0", "pair1", "betamap_corr", "pval"]]
+    df_compact.to_csv(
+        os.path.join(path_out, "corr_pval_null_v_null_{}.csv".format(tag))
+    )
+
+    return df_compact
+
+
+def _make_region_masks(row):
+    mask = np.tri(64, k=0, dtype=bool)
+
+    # Get the parcel number from the beginning of the row and subtract 1
+    parcel_num = int(row[0]) - 1
+
+    # Get the parcel name from the second column value
+    parcel_name = str(row[1]).upper().replace(" ", "_")
+
+    # Generate the mask
+    parcel = np.zeros((64, 64), bool)
+    parcel[:, parcel_num] = True
+    parcel_mask = parcel + np.transpose(parcel)
+    parcel_mask = np.tril(parcel_mask)
+    parcel_mask = parcel_mask[mask]
+
+    return (parcel_mask, parcel_name)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--n_path_mc", help="path to mc null models dir", dest="n_path_mc"
+    )
+    parser.add_argument("--b_path_mc", help="path to mc betamaps dir", dest="b_path_mc")
+    parser.add_argument("--path_out", help="path to output directory", dest="path_out")
+    parser.add_argument(
+        "--path_corr", help="path to corr dir", dest="path_corr", default=None
+    )
+    parser.add_argument(
+        "--path_parcellation", help="path to MIST_64.csv", dest="path_parcellation"
+    )
+    args = parser.parse_args()
+
+    n_path_mc = os.path.join(args.n_path_mc, "null_model_{}_vs_control_mc.npy")
+    # cont_n_path_mc = os.path.join(args.n_path_mc,'{}_null_model_mc.npy')
+    b_path_mc = os.path.join(args.b_path_mc, "{}_vs_con_mean.csv")
+    # cont_b_path_mc = os.path.join(args.b_path_mc,'cont_{}_results_mc.csv')
+    path_out = args.path_out
+    path_corr = args.path_corr
+
+    cases = ["mci_neg", "mci_pos", "sz"]
+
+    maps = cases  # + cont
+
+    #############
+    # LOAD DATA #
+    #############
+    null = []
+    beta_std = []
+
+    for c in cases:
+        null.append(pd.DataFrame(np.load(n_path_mc.format(c))))
+        beta_std.append(pd.read_csv(b_path_mc.format(c))["stand_betas"].values)
+
+    betamaps_std = pd.DataFrame(beta_std, index=maps)
+    nullmodels = pd.concat(null, keys=maps)
+
+    ####################
+    # WHOLE CONNECTOME #
+    ####################
+    print("Creating correlation distributions for whole connectome...")
+    df_wc = get_corr_pval(maps, nullmodels, betamaps_std, path_out, tag="wholeconn")
+
+    # make matrices
+    print("Preparing correlation matrices...")
+    subset = ["mci_neg", "mci_pos", "sz"]
+
+    corr, pval, fdr = make_matrices(df_wc, subset, fdr="fdr_filtered")
+
+    corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv"))
+    pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv"))
+    fdr.to_csv(
+        os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv")
+    )
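To make the matrix-building step concrete, here is a minimal sketch of what mat_form returns for the three contrasts used above. The pairwise correlation and p-values are made up; in the real run they come from get_corr_pval.

import pandas as pd

# Toy pairwise results for the three contrasts (made-up values)
df_pairs = pd.DataFrame(
    {
        "pair0": ["mci_neg", "mci_neg", "mci_pos"],
        "pair1": ["mci_pos", "sz", "sz"],
        "betamap_corr": [0.62, 0.18, 0.35],
        "pval": [0.001, 0.20, 0.04],
        "fdr_filtered": [0.003, 0.20, 0.06],
    }
)

contrasts = ["mci_neg", "mci_pos", "sz"]
mat = mat_form(df_pairs, contrasts, value="betamap_corr")
# mat is a symmetric 3 x 3 DataFrame with 1s on the diagonal:
#           mci_neg  mci_pos    sz
# mci_neg      1.00     0.62  0.18
# mci_pos      0.62     1.00  0.35
# sz           0.18     0.35  1.00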
subset, fdr="fdr_filtered") + + corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv")) + pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv")) + fdr.to_csv( + os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv") + ) + + #################### + # REGIONS # + #################### + df_MIST = pd.read_csv(args.path_parcellation, delimiter=";") + + for index, row in df_MIST.iterrows(): + # create region mask + parcel_mask, parcel_name = _make_region_masks(row) + + # filter maps + null_parcel = [n.transpose()[parcel_mask].transpose() for n in null] + beta_std_parcel = [b[parcel_mask] for b in beta_std] + + betamaps_std_parcel = pd.DataFrame(beta_std_parcel, index=maps) + nullmodels_parcel = pd.concat(null_parcel, keys=maps) + + print(f"Creating correlation distributions for {parcel_name}...") + df_parcel = get_corr_pval( + maps, nullmodels_parcel, betamaps_std_parcel, path_out, tag=parcel_name + ) + + # make matrices + corr_parcel, pval_parcel, fdr_filtered_parcel = make_matrices( + df_parcel, subset, fdr="fdr_filtered" + ) + + _, pval_parcel_fdr_corrected, _, _ = multipletests(pval_parcel, method="fdr_bh") + + # TO DO - see if I need to correct fdr filtered, I think so? + if (pval_parcel < 0.05).any(): + corr_parcel.to_csv( + os.path.join(path_out, f"FC_corr_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel.to_csv( + os.path.join(path_out, f"FC_corr_pval_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel_fdr_corrected.to_csv( + os.path.join( + path_out, + f"FC_corr_pval_fdr_corrected_{parcel_name}_mc_null_v_null.csv", + ) + ) + fdr_filtered_parcel.to_csv( + os.path.join( + path_out, f"FC_corr_fdr_filtered_{parcel_name}_mc_null_v_null.csv" + ) + ) + + print("Done!")