diff --git a/.gitignore b/.gitignore
index 0d05551..918137f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -113,7 +113,7 @@ venv/
ENV/
env.bak/
venv.bak/
-cwas_neuro_env/
+ad_sz_env/
# Spyder project settings
.spyderproject
diff --git a/scripts/baseline_biomarkers.py b/scripts/baseline_biomarkers.py
deleted file mode 100644
index e7ea469..0000000
--- a/scripts/baseline_biomarkers.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import pandas as pd
-import numpy as np
-
-from functools import reduce
-from pathlib import Path
-
-# paths to files
-
-adni_data = Path("__file__").resolve().parent / 'data' / 'adni_spreadsheet.csv'
-tau_data = Path("__file__").resolve().parent / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'
-other_biomarker_data = Path("__file__").resolve().parent / 'data' / 'ADNIMERGE.csv'
-
-# functions
-
-def get_baselines(file):
-
- # load baseline phenotypic data
- pheno = pd.read_csv(file, index_col=0, header=0)
-
- # keep only the variables of interest
- pheno = pheno.filter(['Subject ID','Phase','Sex','Research Group', 'Visit','Study Date','Age'], axis=1)
-
- # convert 'study date' to 'session' in datetime format, to match other spreadsheets
- pheno['session'] = pd.to_datetime(pheno['Study Date'])
-
- # pull out only the subject id and asign it to the index
- pheno_subj = []
- for i in pheno['Subject ID']:
- subj = i.split('_')[2].lstrip("0") # remove leading zeros since it won't match ADNI IDs
- pheno_subj.append(subj)
-
- pheno.index = pheno_subj
- pheno.rename_axis('RID',inplace=True)
- pheno.index = pheno.index.astype('int64')
-
- # separate patients and controls, because (in theory) we can use any control visit as baseline, but
- # for patients we want their actual baseline data
- patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC']
- patient_df = pheno[pheno['Research Group'].isin(patient_diagnoses)] # df of patient diagnoses
-
- control_df = pheno.loc[pheno['Research Group'] == 'CN'] # df of control diagnoses
-
- # I think these visits are acceptable as baseline data, i.e. actual baseline +/-3 months, excluding
- # any initial visits where patient continued from a previous phase
- bl_visits = ['ADNI Screening','ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt',
- 'ADNIGO Month 3 MRI','ADNIGO Screening MRI']
-
- patient_df_bl = patient_df[patient_df['Visit'].isin(bl_visits)]
-
- # rejoin the patients to the controls
- new_df = pd.concat([control_df,patient_df_bl])
-
- # select the earliest visit available for each participant
- new_df.sort_values(['Subject ID', 'Age'], inplace=True) # sort by age
- baseline_df = new_df[~new_df.duplicated(['Subject ID'], keep='first')] # select the first row
-
- # sort df by index
- baseline_df.sort_values(by='RID', inplace=True)
-
- # calculate window for acceptable biomarker data, currently +- 12months
- baseline_df.loc[:,('date_lower')] = baseline_df.loc[:,('session')] - pd.DateOffset(months=18)
- baseline_df.loc[:,('date_upper')] = baseline_df.loc[:,('session')] + pd.DateOffset(months=18)
-
- return (baseline_df)
-
-def load_biomarker_df(biomarker_data):
-
- # load data
- biomarker_df = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False)
-
- # convert examdate to datetime
- biomarker_df['EXAMDATE'] = pd.to_datetime(biomarker_df['EXAMDATE'])
-
- # sort df by index and date
- biomarker_df.sort_values(by=['RID', 'EXAMDATE'],inplace=True)
-
- # create column from index (useful for later functions)
- biomarker_df['RID'] = biomarker_df.index
-
- return (biomarker_df)
-
-def match_baselines_biomarker(biomarker_df, baseline_df, biomarker):
-
- df = pd.DataFrame(columns=['RID',biomarker,biomarker+'_EXAMDATE']) #create df
- common_ids = biomarker_df.index.intersection(baseline_df.index) #find ids common to the biomarker and baseline dfs
- biomarker_df = biomarker_df.set_index('EXAMDATE') #reindex, needed to use 'nearest' method
-
- for rid in common_ids:
- participant_df = biomarker_df[(biomarker_df['RID'] == rid)] #create df of all participants results
- participant_df = participant_df.dropna(subset=[biomarker]) #drop NaNs, since we only want nearest date with result available
-
- baseline = baseline_df.loc[rid] #create df of participants baseline data
- session = baseline['session'] #participant's baseline date
-
- if participant_df.empty:
- pass
- else:
- idx_nearest = participant_df.index.get_loc(session, method='nearest') #find the index of the closest test date to session
- nearest_date = participant_df.index[idx_nearest] #which date is it
- nearest_result = participant_df[biomarker][idx_nearest] #find the biomarker result associated with closest date
-
- df.loc[len(df)] = [rid,nearest_result,nearest_date] #add to df
-
- df = df.set_index('RID')
-
- return (df)
-
-def check_visit_window(biomarker_df, baseline_df, biomarker):
-
- '''
- Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline
- window, and if not replace with NaN. Drop unwanted columns and return biomarker info again, ready to merge.
- '''
-
- # create new df, merging biomarker data into baseline df
- baseline_bio = baseline_df.join(biomarker_df)
-
- # create mask of date range, between lower and upper acceptable dates
- mask_date_range = (baseline_bio[biomarker+'_EXAMDATE'] > baseline_bio['date_lower']) & (baseline_bio[biomarker+'_EXAMDATE'] < baseline_bio['date_upper'])
-
- # fill values collected outside date range with NaN
- baseline_bio[biomarker][~mask_date_range] = np.nan
-
- cols_to_drop = ['Subject ID',
- 'Phase',
- 'Sex',
- 'Research Group',
- 'Visit',
- 'Study Date',
- 'Age',
- 'session',
- 'date_lower',
- 'date_upper']
-
- baseline_bio = baseline_bio.drop(cols_to_drop, axis=1) #drop unwanted columns
-
- return (baseline_bio)
-
-def get_biomarker(biomarker_df, baseline_df, biomarker):
-
- #runs two functions to match the baseline to closest biomarker result, and then checks if it falls in the window
-
- find_nearest_biomarker = match_baselines_biomarker(biomarker_df, baseline_df, biomarker)
- window_checked = check_visit_window(find_nearest_biomarker, baseline_df, biomarker)
-
- return (window_checked)
-
-if __name__ == '__main__':
-
- #get baselines from adni spreadsheet
- baseline_df = get_baselines(adni_data)
-
- # load biomarker files
- tau_df = load_biomarker_df(tau_data)
- other_biomarkers_df = load_biomarker_df(other_biomarker_data)
-
- # create one df per biomarker with closest result within a one year window of participant baseline
- tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR')
- abeta = get_biomarker(other_biomarkers_df, baseline_df, 'ABETA')
- ptau = get_biomarker(other_biomarkers_df, baseline_df, 'PTAU')
- av45 = get_biomarker(other_biomarkers_df, baseline_df, 'AV45')
- fbb = get_biomarker(other_biomarkers_df, baseline_df, 'FBB')
-
- # create list of dataframes (baseline data and all individual biomarkers)
- data_frames = [baseline_df, tau, abeta, ptau, av45, fbb]
-
- # merge dataframes
- master_biomarkers = reduce(lambda left, right:
- pd.merge_asof(left, right, left_index=True, right_index=True),
- data_frames)
-
- # save baseline and matched biomarker data in data directory
- master_biomarkers.to_csv(Path("__file__").resolve().parent / 'data' / 'master_biomarkers.csv')
diff --git a/scripts/biomarker_thresholding.ipynb b/scripts/biomarker_thresholding.ipynb
deleted file mode 100644
index dedac85..0000000
--- a/scripts/biomarker_thresholding.ipynb
+++ /dev/null
@@ -1,294 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "d213377a",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "from pathlib import Path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "9a4034fa",
- "metadata": {},
- "outputs": [],
- "source": [
- "# set thresholds and calculate 5% margin\n",
- "\n",
- "# CSF biomarkers\n",
- "ab42_threshold = 980\n",
- "ab42_margin = (5*ab42_threshold)/100\n",
- "ab42_upper = ab42_threshold+ab42_margin\n",
- "ab42_lower = ab42_threshold-ab42_margin\n",
- "\n",
- "ptau_threshold = 23\n",
- "ptau_margin = (5*ptau_threshold)/100\n",
- "ptau_upper = ptau_threshold+ptau_margin\n",
- "ptau_lower = ptau_threshold-ptau_margin\n",
- "\n",
- "# PET biomarkers\n",
- "fbb_threshold = 1.08\n",
- "fbb_margin = (5*fbb_threshold)/100\n",
- "fbb_upper = fbb_threshold+fbb_margin\n",
- "fbb_lower = fbb_threshold-fbb_margin\n",
- "\n",
- "av45_threshold = 1.08\n",
- "av45_margin = (5*av45_threshold)/100\n",
- "av45_upper = av45_threshold+av45_margin\n",
- "av45_lower = av45_threshold-av45_margin"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a913b27a",
- "metadata": {},
- "outputs": [],
- "source": [
- "def calculate_tau_cutoff(df):\n",
- " \n",
- " '''\n",
- " Take a df with biomarker results for all participants. Select controls only. Find those negative for amyloid\n",
- " PET (either av45 or FBB). For those who hav flortaucipir results, calculate average. Calculate +2SD, this is\n",
- " the flortaucipir cutoff.\n",
- " \n",
- " '''\n",
- " \n",
- " controls = df.loc[df['Research Group'] == 'CN']\n",
- " \n",
- " controls_Apet_neg = (controls[(controls['FBB'] < fbb_lower) | \n",
- " (controls['AV45'] < av45_lower)])\n",
- " \n",
- " tau_mean = controls_Apet_neg['META_ROI'].mean()\n",
- " tau_sd = controls_Apet_neg['META_ROI'].std()\n",
- " tau_cutoff = tau_mean+(tau_sd*2)\n",
- " \n",
- " return (tau_cutoff) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "bfa2dd1c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# paths to files\n",
- "biomarker_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'master_biomarkers.csv'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "837cb329",
- "metadata": {},
- "outputs": [],
- "source": [
- "# load baseline data\n",
- "df = pd.read_csv(biomarker_data, index_col=0, header=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "c75a9dee",
- "metadata": {},
- "outputs": [],
- "source": [
- "# replace some biomarker data which has been coded as > or <\n",
- "df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})\n",
- "df = df.replace({'PTAU': {'<8': 7, '>120':121}})\n",
- "\n",
- "df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "b2d48836",
- "metadata": {},
- "outputs": [],
- "source": [
- "# calculate study specific cutoff for tau PET meta temporal ROI \n",
- "tau_threshold = calculate_tau_cutoff(df)\n",
- "tau_margin = (5*tau_threshold)/100\n",
- "tau_upper = tau_threshold+tau_margin\n",
- "tau_lower = tau_threshold-tau_margin"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "60d0393b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def biomarker_negative_controls(df):\n",
- " \n",
- " '''\n",
- " Take a df with biomarker results for participants labelled as controls, and return a df excluding those\n",
- " who have a positive result on any one of five markers (or no biomarker information).\n",
- " \n",
- " To get more info on which biomarkers are positive, uncomment last section. But this is only to give an \n",
- " idea, since some may be positive on more than one marker.\n",
- " '''\n",
- " \n",
- " controls_all = df.loc[df['Research Group'] == 'CN']\n",
- " \n",
- " # drop participants who don't have any biomarker data\n",
- " controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n",
- " \n",
- " controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | \n",
- " (controls['AV45'] < av45_lower) | \n",
- " (controls['ABETA'] > ab42_upper) | \n",
- " (controls['PTAU'] < ptau_lower) | \n",
- " (controls['META_ROI'] < tau_lower)])\n",
- " \n",
- " n_controls = len(controls_all)\n",
- " no_markers = n_controls-len(controls)\n",
- " n_controls_negative = len(controls_biomarker_neg)\n",
- " n_dropped = (n_controls-no_markers) - n_controls_negative\n",
- " \n",
- " print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(\n",
- " n_controls,\n",
- " no_markers, \n",
- " n_dropped,\n",
- " n_controls_negative))\n",
- " \n",
- " '''\n",
- " controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)\n",
- " print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))\n",
- " print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))\n",
- " print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))\n",
- " print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))\n",
- " print ('{} positive/borderline for PET tau (meta roi).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_ROI'] > tau_lower)])))\n",
- " '''\n",
- " return (controls_biomarker_neg)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "b3a95289",
- "metadata": {},
- "outputs": [],
- "source": [
- "def biomarker_positive_patients(df, diagnoses):\n",
- " \n",
- " '''\n",
- " Take a df with biomarker results for participants labelled as a patient (specifying which), \n",
- " and return a df including only those who have a positive result on any one of five markers. \n",
- " Exclude any with no biomarker information.\n",
- " \n",
- " To get more info on which biomarkers are negative, uncomment last section. But this is only to give an \n",
- " idea, since some may be negative on more than one marker.\n",
- " '''\n",
- " \n",
- " patients_all = df.loc[df['Research Group'].isin(diagnoses)]\n",
- " \n",
- " # drop participants who don't have any biomarker data\n",
- " patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n",
- " \n",
- " patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | \n",
- " (patients['AV45'] > av45_upper) | \n",
- " (patients['ABETA'] < ab42_lower) | \n",
- " (patients['PTAU'] > ptau_upper) |\n",
- " (patients['META_ROI'] > tau_upper)])\n",
- "\n",
- " n_patients = len(patients_all)\n",
- " no_markers = n_patients-len(patients)\n",
- " n_patients_positive = len(patients_biomarker_pos)\n",
- " n_dropped = (n_patients-no_markers) - n_patients_positive\n",
- " \n",
- " print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. {} remaining.'.format(\n",
- " n_patients,\n",
- " diagnoses,\n",
- " no_markers,\n",
- " n_dropped,\n",
- " n_patients_positive))\n",
- " \n",
- " '''\n",
- " patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)\n",
- " \n",
- " print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))\n",
- " print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))\n",
- " print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))\n",
- " print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))\n",
- " print ('{} negative/borderline for PET tau (meta roi).'.format(len(patients_neg[(patients_neg['META_ROI'] < tau_upper)])))\n",
- " '''\n",
- " return (patients_biomarker_pos)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "0d123b9f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Of 362 controls, 44 dropped due to no biomarker data and 60 due to positive/borderline biomarkers. 258 remaining.\n",
- "Of 58 ['AD'], 7 dropped due to no biomarker data and 3 due to negative/borderline biomarkers. 48 remaining.\n",
- "Of 165 ['EMCI', 'LMCI', 'MCI'], 25 dropped due to no biomarker data and 54 due to negative/borderline biomarkers. 86 remaining.\n"
- ]
- }
- ],
- "source": [
- "# run functions to get negative controls and positive patients\n",
- "controls_bio_neg = biomarker_negative_controls(df)\n",
- "ad_bio_positive = biomarker_positive_patients(df, ['AD'])\n",
- "mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI','MCI'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "id": "979848a1",
- "metadata": {},
- "outputs": [],
- "source": [
- "master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "cf508b49",
- "metadata": {},
- "outputs": [],
- "source": [
- "master_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/scripts/calculate_meta_roi.ipynb b/scripts/calculate_meta_roi.ipynb
deleted file mode 100644
index b3cb980..0000000
--- a/scripts/calculate_meta_roi.ipynb
+++ /dev/null
@@ -1,723 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "3881d9a6",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "from pathlib import Path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "3fdee605",
- "metadata": {},
- "outputs": [],
- "source": [
- "tau_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "7b6b5db3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# load tau PET data\n",
- "df = pd.read_csv(tau_data, index_col=0, header=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "83a462f2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
-      "[HTML preview of df.head(): 5 rows × 242 columns]"
- ],
- "text/plain": [
- " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n",
- "RID \n",
- "21 init m144 2018-02-02 1.0285 \n",
- "31 init m150 2018-04-24 0.9647 \n",
- "31 y1 m162 2019-04-23 0.9636 \n",
- "56 init m144 2018-02-20 0.9790 \n",
- "56 y1 m156 2019-01-10 0.9548 \n",
- "\n",
- " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n",
- "RID \n",
- "21 55285 1.1771 \n",
- "31 62344 1.0258 \n",
- "31 61510 1.0193 \n",
- "56 64415 1.2442 \n",
- "56 63302 1.2863 \n",
- "\n",
- " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n",
- "RID \n",
- "21 110695 1.1937 2869 1.2152 \n",
- "31 55683 1.2463 2558 1.0969 \n",
- "31 50427 1.2264 2508 1.0675 \n",
- "56 114401 1.2682 3809 1.2016 \n",
- "56 118625 1.1931 3666 1.1839 \n",
- "\n",
- " ... RIGHT_PUTAMEN_VOLUME RIGHT_THALAMUS_PROPER_SUVR \\\n",
- "RID ... \n",
- "21 ... 4169 1.5224 \n",
- "31 ... 5766 1.1421 \n",
- "31 ... 5537 1.0801 \n",
- "56 ... 4026 1.3396 \n",
- "56 ... 4145 1.3618 \n",
- "\n",
- " RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n",
- "RID \n",
- "21 5874 1.3860 \n",
- "31 6016 1.2340 \n",
- "31 5865 1.1644 \n",
- "56 6601 1.3625 \n",
- "56 6349 1.3873 \n",
- "\n",
- " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n",
- "RID \n",
- "21 3378 1.4576 10.0 \n",
- "31 3178 1.3691 2.0 \n",
- "31 3060 1.4203 15.0 \n",
- "56 3655 1.6190 42.0 \n",
- "56 3625 1.4511 10.0 \n",
- "\n",
- " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME update_stamp \n",
- "RID \n",
- "21 1.0615 2460 2022-04-27 15:31:20.0 \n",
- "31 0.7641 26845 2022-04-27 15:31:20.0 \n",
- "31 0.7805 28306 2022-04-27 15:31:20.0 \n",
- "56 1.0556 1385 2022-04-27 15:31:20.0 \n",
- "56 1.0850 1427 2022-04-27 15:31:20.0 \n",
- "\n",
- "[5 rows x 242 columns]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "630a3d56",
- "metadata": {},
- "outputs": [],
- "source": [
- "result_cols = ['CTX_LH_ENTORHINAL_SUVR',\n",
- "'CTX_RH_ENTORHINAL_SUVR',\n",
- "'LEFT_AMYGDALA_SUVR',\n",
- "'RIGHT_AMYGDALA_SUVR',\n",
- "'CTX_LH_PARAHIPPOCAMPAL_SUVR',\n",
- "'CTX_RH_PARAHIPPOCAMPAL_SUVR',\n",
- "'CTX_LH_FUSIFORM_SUVR',\n",
- "'CTX_RH_FUSIFORM_SUVR',\n",
- "'CTX_LH_INFERIORTEMPORAL_SUVR',\n",
- "'CTX_RH_INFERIORTEMPORAL_SUVR',\n",
- "'CTX_LH_MIDDLETEMPORAL_SUVR',\n",
- "'CTX_RH_MIDDLETEMPORAL_SUVR']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "79cb3167",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
-      "[HTML preview of df.head() after adding META_ROI: 5 rows × 244 columns]"
- ],
- "text/plain": [
- " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n",
- "RID \n",
- "21 init m144 2018-02-02 1.0285 \n",
- "31 init m150 2018-04-24 0.9647 \n",
- "31 y1 m162 2019-04-23 0.9636 \n",
- "56 init m144 2018-02-20 0.9790 \n",
- "56 y1 m156 2019-01-10 0.9548 \n",
- "\n",
- " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n",
- "RID \n",
- "21 55285 1.1771 \n",
- "31 62344 1.0258 \n",
- "31 61510 1.0193 \n",
- "56 64415 1.2442 \n",
- "56 63302 1.2863 \n",
- "\n",
- " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n",
- "RID \n",
- "21 110695 1.1937 2869 1.2152 \n",
- "31 55683 1.2463 2558 1.0969 \n",
- "31 50427 1.2264 2508 1.0675 \n",
- "56 114401 1.2682 3809 1.2016 \n",
- "56 118625 1.1931 3666 1.1839 \n",
- "\n",
- " ... RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n",
- "RID ... \n",
- "21 ... 5874 1.3860 \n",
- "31 ... 6016 1.2340 \n",
- "31 ... 5865 1.1644 \n",
- "56 ... 6601 1.3625 \n",
- "56 ... 6349 1.3873 \n",
- "\n",
- " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n",
- "RID \n",
- "21 3378 1.4576 10.0 \n",
- "31 3178 1.3691 2.0 \n",
- "31 3060 1.4203 15.0 \n",
- "56 3655 1.6190 42.0 \n",
- "56 3625 1.4511 10.0 \n",
- "\n",
- " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME \\\n",
- "RID \n",
- "21 1.0615 2460 \n",
- "31 0.7641 26845 \n",
- "31 0.7805 28306 \n",
- "56 1.0556 1385 \n",
- "56 1.0850 1427 \n",
- "\n",
- " update_stamp meta_roi META_ROI \n",
- "RID \n",
- "21 2022-04-27 15:31:20.0 1.234958 1.234958 \n",
- "31 2022-04-27 15:31:20.0 1.145025 1.145025 \n",
- "31 2022-04-27 15:31:20.0 1.137825 1.137825 \n",
- "56 2022-04-27 15:31:20.0 1.235883 1.235883 \n",
- "56 2022-04-27 15:31:20.0 1.208933 1.208933 \n",
- "\n",
- "[5 rows x 244 columns]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df['META_ROI'] = df[result_cols].mean(axis=1)\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "5aec7b55",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
-      "[HTML preview of results_df.head(): EXAMDATE and META_ROI indexed by RID]"
- ],
- "text/plain": [
- " EXAMDATE META_ROI\n",
- "RID \n",
- "21 2018-02-02 1.234958\n",
- "31 2018-04-24 1.145025\n",
- "31 2019-04-23 1.137825\n",
- "56 2018-02-20 1.235883\n",
- "56 2019-01-10 1.208933"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "results_df = df[['EXAMDATE','META_ROI']].copy()\n",
- "results_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "5e52c415",
- "metadata": {},
- "outputs": [],
- "source": [
- "results_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'tau_meta_roi.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "663dc7b4",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/scripts/cnvfc/stats.py b/scripts/cnvfc/stats.py
new file mode 100644
index 0000000..3a4f28e
--- /dev/null
+++ b/scripts/cnvfc/stats.py
@@ -0,0 +1,348 @@
+import sys
+import time
+import warnings
+import scipy as sp
+import numpy as np
+import patsy as pat
+import pandas as pd
+from scipy import stats
+import statsmodels.api as sm
+#from .tools import conn2mat
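+# NOTE: conn2mat (used below in contrast_correlation) comes from the package's tools module; the
+# commented-out import above would need to be restored, or conn2mat otherwise provided, before
+# calling that function.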
+from sklearn import linear_model as sln
+from sklearn import preprocessing as skp
+from sklearn import model_selection as skm
+from statsmodels.sandbox.stats.multicomp import multipletests as stm
+
+
+def make_weights(data, pattern):
+ if not data.shape[1:] == pattern.shape:
+ raise Exception(f'data and pattern shape mismatch: {data.shape[1:]}, {pattern.shape}')
+ n_nodes = pattern.shape[0]
+ n_data = data.shape[0]
+ weights = np.array([[np.corrcoef(data[data_id, node_id, :],
+ pattern[node_id, :])[0, 1]
+ for node_id in range(n_nodes)]
+ for data_id in range(n_data)])
+ return weights
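+
+# Shape note: for data of shape (n_subjects, n_nodes, n_features) and a matching
+# (n_nodes, n_features) pattern, make_weights returns an (n_subjects, n_nodes) array of node-wise
+# spatial correlations between each subject's connectivity profile and the pattern.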
+
+
+def categorical_enrichment(pheno, data, group, case=None, control=None, node_labels='node_name_not_specified'):
+ sub_mask, cases_mask = find_subset(pheno, group, [case, control])
+ sub_data = data[sub_mask, ...]
+ n_data = sub_data.shape[1]
+ n_case = np.sum(cases_mask[case])
+ n_control = np.sum(cases_mask[control])
+
+ results = {key: list() for key in ['node', 'U', 'p_value',
+ f'median_{group}_{case}',
+ f'median_{group}_{control}',
+ 'rank_biserial_correlation']}
+ # Conduct Mann-Whitney-U for each node region
+ for node_id in range(n_data):
+ u_right, p = sp.stats.mannwhitneyu(sub_data[cases_mask[case], node_id], sub_data[cases_mask[control], node_id],
+ alternative='two-sided')
+ u_left = n_case * n_control - u_right
+ u_min = np.min([u_left, u_right])
+ # Compute the median for the case and control groups
+ median_case = np.median(sub_data[cases_mask[case], node_id])
+ median_control = np.median(sub_data[cases_mask[control], node_id])
+ # Compute rank biserial correlation
+ r_b = 1 - (2 * u_min) / (n_case * n_control)
+ # Determine if cases > controls or reverse
+ case_gt_con = u_right > u_min
+ if not case_gt_con:
+ r_b = -r_b
+ # Store the results
+ results['node'].append(node_id + 1)
+ results['U'].append(u_min)
+ results['p_value'].append(p)
+ results[f'median_{group}_{case}'].append(median_case)
+ results[f'median_{group}_{control}'].append(median_control)
+ results['rank_biserial_correlation'].append(r_b)
+
+ results_table = pd.DataFrame(data=results)
+ # Correct for multiple comparisons
+ (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh')
+ results_table['q_value'] = qval
+ # Add the node_names
+ results_table['node_name'] = node_labels
+
+ return results_table
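+
+# Illustrative call (the column and group labels here are assumptions, not fixed by this module):
+#   results = categorical_enrichment(pheno, weights, group='diagnosis', case='SCHZ', control='CON',
+#                                    node_labels=node_labels)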
+
+
+def continuous_enrichment(pheno, data, covariate, node_labels='node_name_not_specified'):
+ sub_mask = find_subset(pheno, covariate)
+ sub_pheno = pheno.loc[sub_mask]
+ sub_data = data[sub_mask, :]
+ n_data = sub_data.shape[1]
+
+ results = {key: list() for key in ['node', 'pearson_r', 'p_value']}
+ for node_id in range(n_data):
+ r, p = stats.pearsonr(sub_pheno[covariate], sub_data[:, node_id])
+ # Store the results
+ results['node'].append(node_id + 1)
+ results['pearson_r'].append(r)
+ results['p_value'].append(p)
+
+ results_table = pd.DataFrame(data=results)
+ # Correct for multiple comparisons
+ (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh')
+ results_table['q_value'] = qval
+ # Add the node_names
+ results_table['node_name'] = node_labels
+
+ return results_table
+
+
+def find_subset(pheno, column, cases=None):
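+    # Returns a boolean mask over pheno rows: rows with a non-null `column` (or, when `cases` is
+    # given, rows belonging to one of the requested cases), plus, in the latter case, a dict
+    # mapping each case to its boolean mask within that subset.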
+ # TODO check pandas type input
+ subset_mask = np.array(~pheno[column].isnull())
+    if cases:
+ all_cases = pheno.loc[subset_mask][column].unique()
+ try:
+ case_available = np.array([True if case in all_cases else False for case in cases])
+ except TypeError as e:
+ raise Exception(f'the attribute "cases" needs to be iterable but is: {type(cases)}') from e
+ if not all(case_available):
+ if not any(case_available):
+ raise Exception(f'none of the requested cases of "{column}" are available')
+ else:
+ warnings.warn(
+ f'\nnot all requested cases of "{column}" are available: {list(zip(cases, case_available))}',
+ RuntimeWarning)
+ case_masks = np.array([pheno[column] == case for case in cases])
+ subset_mask = np.any(case_masks, 0)
+ # Return the masked instances of the requested cases
+ cases_dict = {case: case_masks[idx][subset_mask] for idx, case in enumerate(cases)}
+ return subset_mask, cases_dict
+ else:
+ return subset_mask
+
+
+def standardize(data, mask):
+ scaler = skp.StandardScaler(with_mean=False, with_std=True)
+ scaler.fit(data[mask, :])
+ standardized_data = scaler.transform(data)
+ return standardized_data
+
+
+def find_contrast(design_matrix, contrast):
+ # Find the contrast column
+ contrast_columns = [(col_id, col) for col_id, col in enumerate(design_matrix.columns) if f'{contrast}' in col]
+ if not len(contrast_columns) == 1:
+ raise Exception(f'There is no single factor that matches {contrast}: {(list(design_matrix.columns))}')
+ return contrast_columns
+
+
+def fast_glm(data, design_matrix, contrast):
+ # Does not compute p-values but operates in parallel
+ contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0]
+    glm = sln.LinearRegression(fit_intercept=False, n_jobs=-2)  # 'normalize' kwarg dropped (removed in scikit-learn >= 1.2; False was the old default)
+ res = glm.fit(design_matrix, data)
+ betas = res.coef_[:, contrast_id]
+ return betas
+
+
+def glm(data, design_matrix, contrast):
+ contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0]
+ n_data = data.shape[1]
+
+ # Conduct the GLM
+ betas = np.zeros(shape=n_data)
+ pvals = np.zeros(shape=n_data)
+ for conn_id in range(n_data):
+ model = sm.OLS(data[:, conn_id], design_matrix)
+ results = model.fit()
+ betas[conn_id] = results.params[contrast_id]
+ pvals[conn_id] = results.pvalues[contrast_id]
+
+ return betas, pvals
+
+
+def glm_wrap_cc(conn, pheno, group, case, control, regressors='', report=False, fast=False):
+ # Make sure pheno and conn have the same number of cases
+ if not conn.shape[0] == pheno.shape[0]:
+        print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must have the same number of cases')
+
+ # Define the subset of the sample
+ sub_mask, case_masks = find_subset(pheno, group, [case, control])
+ sub_conn = conn[sub_mask, :]
+ sub_pheno = pheno.loc[sub_mask]
+ n_sub = np.sum(sub_mask)
+ n_case = np.sum(case_masks[case])
+ n_control = np.sum(case_masks[control])
+ n_data = sub_conn.shape[1]
+ if report:
+ print(f'Selected sample based on group variable {group}.\n'
+ f'cases: {case} (n={n_case})\n'
+ f'controls: {control} (n={n_control})\n'
+ f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n'
+ f'{n_data} data points available\n'
+ f'standardized estimators are based on {group}=={control}')
+
+ stand_conn = standardize(sub_conn, case_masks[control])
+
+ # Construct design matrix
+ if type(control) == str:
+ contrast = f'C({group}, Treatment("{control}"))'
+ else:
+ contrast = f'C({group}, Treatment({control}))'
+ formula = ' + '.join((regressors, contrast))
+ dmat = pat.dmatrix(formula, sub_pheno, return_type='dataframe')
+
+ if fast:
+ betas = fast_glm(sub_conn, dmat, group)
+ table = pd.DataFrame(data={'betas': betas})
+ else:
+ betas, pvals = glm(sub_conn, dmat, group)
+ stand_betas, _ = glm(stand_conn, dmat, group)
+ table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals})
+
+ return table
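+
+# Illustrative use (regressor and column names are assumptions; they must exist in `pheno`):
+#   table = glm_wrap_cc(conn, pheno, group='diagnosis', case='ADD', control='CON',
+#                       regressors='age + C(sex)', report=True)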
+
+
+def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False, fast=False):
+ # Make sure pheno and conn have the same number of cases
+ if not conn.shape[0] == pheno.shape[0]:
+        print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must have the same number of cases')
+
+ # Define the subset of the sample
+ sub_mask = find_subset(pheno, contrast)
+ sub_conn = conn[sub_mask, :]
+ sub_pheno = pheno.loc[sub_mask]
+ n_sub = sub_pheno.shape[0]
+ n_data = sub_conn.shape[1]
+ sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool))
+
+ if report:
+ print(f'Selected sample based on contrast variable {contrast}.\n'
+ f'Found {n_sub} subjects with no missing data for {contrast}\n'
+ f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n'
+ f'{n_data} data points available\n'
+ f'standardized estimators are based on all subjects with no missing data for {contrast}')
+
+ formula = ' + '.join((regressors, contrast))
+ design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe')
+
+ if fast:
+ betas = fast_glm(sub_conn, design_matrix, contrast)
+ table = pd.DataFrame(data={'betas': betas})
+ else:
+ betas, pvals = glm(sub_conn, design_matrix, contrast)
+ stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast)
+ table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals})
+
+ return table
+
+
+def permutation_glm_contrast(conn, pheno, n_a, n_a_1, n_b, n_b_1, n_iter=5000):
+ # conn: connectome (n_sub, n_conn)
+ # pheno: pheno pandas table, same order as conn
+ # n_a / n_b: number of subs per group a / b
+ # n_a_1 / n_b_1: number of subs in the first contrast of group a / b
+ # n_iter: number of permutations
+
+    # Sanity checks on the requested subgroup sizes
+ if n_a_1 > n_a:
+ raise ValueError(f'n_a_1 ({n_a_1}) cannot be larger than n_a ({n_a})')
+ if n_b_1 > n_b:
+ raise ValueError(f'n_b_1 ({n_b_1}) cannot be larger than n_b ({n_b})')
+
+ shsp = skm.ShuffleSplit(n_splits=n_iter, test_size=0.5)
+ pheno.reset_index(inplace=True, drop=True)
+ pheno.loc[:, 'group'] = 0
+ n_subjects = pheno.shape[0]
+ beta = np.zeros((conn.shape[1], 2, n_iter))
+
+ start = time.time()
+ for fold_id, (group_a, group_b) in enumerate(shsp.split(np.ones(n_subjects))):
+ pheno_a = pheno.loc[group_a[:n_a]].copy()
+ pheno_b = pheno.loc[group_b[:n_b]].copy()
+ # Assign the groups
+ pheno_a.loc[group_a[:n_a_1], 'group'] = 1
+ pheno_b.loc[group_b[:n_b_1], 'group'] = 1
+ mat1 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_a, return_type='dataframe')
+ mat2 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_b, return_type='dataframe')
+ beta[:, 0, fold_id] = fast_glm(conn[group_a[:n_a], :], mat1, 'group')
+ beta[:, 1, fold_id] = fast_glm(conn[group_b[:n_b], :], mat2, 'group')
+
+ elapsed = time.time() - start
+ done = fold_id + 1
+ remaining = n_iter - done
+ time_left = (elapsed / done) * remaining
+
+ sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s)'.format(done, n_iter, time_left, elapsed / done))
+ sys.stdout.flush()
+ sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed))
+ sys.stdout.flush()
+
+ return beta
+
+
+def contrast_correlation(beta, mask, mode='region'):
+ # mode: 'region' or 'global'
+ if mode == 'region':
+ n_iter = beta.shape[2]
+ n_seed = mask.shape[0]
+ corr = np.zeros((n_seed, n_iter))
+ for fold_id in range(n_iter):
+ # Map the betas back to a matrix
+ beta1_mat = conn2mat(beta[:, 0, fold_id], mask)
+ beta2_mat = conn2mat(beta[:, 1, fold_id], mask)
+ # Compute the correlation
+            corr[:, fold_id] = np.array([np.corrcoef(beta1_mat[nid, :], beta2_mat[nid, :])[0, 1] for nid in range(n_seed)])
+ return corr
+ elif mode == 'global':
+ n_iter = beta.shape[2]
+ corr = np.array([np.corrcoef(beta[:, 0, fold_id], beta[:, 1, fold_id])[0, 1] for fold_id in range(n_iter)])
+ return corr
+ else:
+ raise ValueError(f'mode {mode} is not configured!')
+
+
+def permutation_glm(pheno, conn, group, case, control, regressors='', n_iter=1000, stand=False):
+ # Make sure pheno and conn have the same number of cases
+ if not conn.shape[0] == pheno.shape[0]:
+        print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must have the same number of cases')
+ sub_mask, case_masks = find_subset(pheno, group, [case, control])
+ sub_pheno = pheno.loc[sub_mask, :]
+ sub_pheno.loc[:, 'group'] = ''
+ sub_conn = conn[sub_mask, :]
+ # Standardize the input data
+ if stand:
+ sub_conn = standardize(sub_conn, case_masks[control])
+ n_conn = sub_conn.shape[1]
+
+ n1 = np.sum(case_masks[case])
+ n2 = np.sum(case_masks[control])
+ r2 = n2 / (n1 + n2)
+
+ splitter = skm.ShuffleSplit(n_splits=n_iter, test_size=r2, random_state=1)
+
+ i_pheno = sub_pheno.copy()
+ group_id = list(i_pheno.columns).index('group')
+
+ betas = np.zeros(shape=(n_iter, n_conn))
+ start = time.time()
+ for fold_id, (train_id, test_id) in enumerate(splitter.split(sub_conn)):
+ n1_ids = train_id[:n1]
+ n2_ids = test_id[:n2]
+ i_pheno = sub_pheno.copy()
+ i_pheno['group'] = ''
+ i_pheno.iloc[n1_ids, group_id] = 'case'
+ i_pheno.iloc[n2_ids, group_id] = 'control'
+ table = glm_wrap_cc(sub_conn, i_pheno, 'group', 'case', 'control', regressors=regressors,
+ report=False, fast=True)
+ betas[fold_id, :] = table['betas'].values
+
+ elapsed = time.time() - start
+ done = fold_id + 1
+ remaining = n_iter - done
+ time_left = (elapsed / done) * remaining
+
+ sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s per permutation)'.format(done, n_iter, time_left, elapsed / done))
+ sys.stdout.flush()
+ sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed))
+ sys.stdout.flush()
+
+ return betas
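+
+# Illustrative permutation run (column names are assumptions):
+#   null_betas = permutation_glm(pheno, conn, group='diagnosis', case='SCHZ', control='CON',
+#                                regressors='age + C(sex)', n_iter=1000)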
diff --git a/scripts/h5_to_npy.py b/scripts/h5_to_npy.py
new file mode 100644
index 0000000..805bb0b
--- /dev/null
+++ b/scripts/h5_to_npy.py
@@ -0,0 +1,173 @@
+import h5py
+import numpy as np
+import pandas as pd
+from pathlib import Path
+
+def load_connectome(file, participant_id, session, identifier, no_session):
+ # Construct the dataset path
+ if no_session:
+ dataset_path = f"sub-{participant_id}/{identifier}_atlas-MIST_desc-64_connectome"
+ else:
+ dataset_path = f"sub-{participant_id}/ses-{session}/{identifier}_atlas-MIST_desc-64_connectome"
+
+ dataset = file.get(dataset_path)
+
+ connectome_npy = None # Initialise to None in case missing
+ if dataset is not None:
+ connectome_npy = dataset[...] # Load the dataset into a NumPy array
+ return connectome_npy
+
+def save_connectome(connectome_npy, output_p, identifier):
+ if connectome_npy is not None:
+ output_file_p = output_p / f"{identifier}.npy"
+ np.save(output_file_p, connectome_npy)
+ print(f"Saved data for {identifier}")
+ else:
+ print(f"Connectome for {identifier} not found or could not be loaded")
+
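+# process_connectome_participant opens one HDF5 file per subject, whereas process_connectome_group
+# reads from a single dataset-level HDF5 file that the caller has already opened.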
+def process_connectome_participant(row, base_dir, output_p):
+ identifier = row["identifier"]
+ participant_id = row["participant_id"]
+ session = row["ses"]
+ # Construct path to subject's HDF5 file
+ hdf5_file_p = base_dir / f"{participant_id}" / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5"
+
+ if hdf5_file_p.exists():
+ with h5py.File(hdf5_file_p, "r") as file:
+            # Datasets routed through this helper all store data under a ses- group, so no_session=False (assumed)
+            connectome_npy = load_connectome(file, participant_id, session, identifier, no_session=False)
+ save_connectome(connectome_npy, output_p, identifier)
+
+def process_connectome_group(row, file, output_p, no_session):
+ identifier = row["identifier"]
+ participant_id = row["participant_id"]
+ session = row["ses"]
+
+ connectome_npy = load_connectome(file, participant_id, session, identifier, no_session)
+ save_connectome(connectome_npy, output_p, identifier)
+
+def process_cobre(conn_p, df, output_dir):
+ base_dir = conn_p / "cobre_connectome-0.4.1"
+ hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5"
+ output_p = output_dir / "cobre"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Filter df for dataset
+ filtered_df = df[df['dataset'] == 'cobre']
+
+ with h5py.File(hdf5_file_p, "r") as file:
+ # Loop through the filtered dataframe
+ for index, row in filtered_df.iterrows():
+            process_connectome_group(row, file, output_p, no_session=False)
+
+def process_ds000030(conn_p, df, output_dir):
+ base_dir = conn_p / "ds000030_connectomes-0.4.1"
+ hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5"
+ output_p = output_dir / "ds000030"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Filter df for dataset
+ filtered_df = df[df['dataset'] == 'ds000030']
+
+ with h5py.File(hdf5_file_p, "r") as file:
+ # Loop through the filtered dataframe
+ for index, row in filtered_df.iterrows():
+ process_connectome_group(row, file, output_p, no_session=True)
+
+def process_hcpep(conn_p, df, output_dir):
+ base_dir = conn_p / "hcp-ep_connectome-0.4.1"
+ output_p = output_dir / "hcpep"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Filter df for dataset
+ filtered_df = df[df['dataset'] == 'hcpep']
+
+ # Loop through the filtered dataframe
+ for index, row in filtered_df.iterrows():
+ process_connectome_participant(row, base_dir, output_p)
+
+def process_srpbs(conn_p, df, output_dir):
+ base_dir = conn_p / "srpbs_connectome-0.4.1"
+ output_p = output_dir / "srpbs"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Filter df for dataset
+ filtered_df = df[df['dataset'] == 'srpbs']
+
+ # Loop through the filtered dataframe
+ for index, row in filtered_df.iterrows():
+ process_connectome_participant(row, base_dir, output_p)
+
+def process_adni(conn_p, final_file_p, output_dir):
+ base_dir = conn_p / "adni_connectomes-0.4.1"
+ final_file = final_file_p / "final_adni.tsv"
+ output_p = output_dir / "adni"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Load the file to get identifiers and participant_ids
+ df = pd.read_csv(final_file, sep="\t")
+
+ # Loop through the filtered dataframe
+ for index, row in df.iterrows():
+ process_connectome_participant(row, base_dir, output_p)
+
+def process_oasis3(conn_p, final_file_p, output_dir):
+ base_dir = conn_p / "oasis3_connectomes-0.4.1"
+ final_file = final_file_p / "final_oasis3.tsv"
+ output_p = output_dir / "oasis3"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Load the file to get identifiers and participant_ids
+ df = pd.read_csv(final_file, sep="\t")
+
+ # Loop through the filtered dataframe
+ for index, row in df.iterrows():
+ process_connectome_participant(row, base_dir, output_p)
+
+def process_cimaq(conn_p, final_file_p, output_dir):
+ base_dir = conn_p / "cimaq_connectomes-0.4.1"
+ final_file = final_file_p / "final_cimaq.tsv"
+ output_p = output_dir / "cimaq"
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ # Load the file to get identifiers and participant_ids
+ df = pd.read_csv(final_file, sep="\t")
+
+ # Loop through the filtered dataframe
+ for index, row in df.iterrows():
+ process_connectome_participant(row, base_dir, output_p)
+
+
+if __name__ == "__main__":
+ passed_qc_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files/passed_qc_master.tsv")
+ conn_p = Path("/home/nclarke/scratch")
+ output_dir = Path("/home/nclarke/scratch/npy_connectome")
+
+    # These datasets needed some extra steps after checking QC, so get identifiers from a different tsv
+ final_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files")
+
+ # Load the file to get identifiers and participant_ids
+ df = pd.read_csv(passed_qc_file_p, sep="\t")
+
+ #process_cobre(conn_p, df, output_dir)
+ process_ds000030(conn_p, df, output_dir)
+ #process_hcpep(conn_p, df, output_dir)
+ #process_srpbs(conn_p, df, output_dir)
+ #process_adni(conn_p, final_file_p, output_dir)
+ #process_cimaq(conn_p, final_file_p, output_dir)
+ #process_oasis3(conn_p, final_file_p, output_dir)
diff --git a/scripts/prepare_combat_file.py b/scripts/prepare_combat_file.py
new file mode 100644
index 0000000..5d09044
--- /dev/null
+++ b/scripts/prepare_combat_file.py
@@ -0,0 +1,61 @@
+from pathlib import Path
+import numpy as np
+
+def load_connectomes(base_directory_p):
+ """
+ Load connectome data from .npy files across multiple subdirectories.
+
+ Parameters:
+ - base_directory_p: Path to the base directory containing subdirectories of connectome files.
+
+ Returns:
+ - connectomes: A list of connectome matrices, one per participant across all subdirectories.
+ """
+ connectomes = []
+ # Iterate over each subdirectory in the base directory
+ for directory_p in base_directory_p.iterdir():
+ if directory_p.is_dir():
+ for file_p in directory_p.glob("*.npy"):
+ connectome = np.load(file_p)
+ connectomes.append(connectome)
+ return connectomes
+
+def connectome_to_edges_matrix(connectomes):
+ """
+ Transform a list of connectomes into a matrix suitable for ComBat,
+ where rows are ROI pairs and columns are participants.
+ """
+ num_participants = len(connectomes)
+ num_rois = connectomes[0].shape[0]
+ num_edges = num_rois * (num_rois - 1) // 2
+
+ edges_matrix = np.zeros((num_edges, num_participants))
+ for i, connectome in enumerate(connectomes):
+ edge_index = 0
+ for row in range(num_rois):
+ for col in range(row + 1, num_rois):
+ edges_matrix[edge_index, i] = connectome[row, col]
+ edge_index += 1
+
+ return edges_matrix
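+
+# Shape sketch (assuming 64-ROI MIST connectomes, as elsewhere in this repo): num_edges is
+# 64 * 63 // 2 = 2016, so edges_matrix has shape (2016, n_participants). Note the diagonal is
+# excluded here, unlike connectome_to_edge_matrix in prepare_input_data.py, which keeps it.
+#   >>> connectome_to_edges_matrix([np.eye(64)]).shape
+#   (2016, 1)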
+
+# Set paths
+base_directory_p = Path("/home/neuromod/ad_sz/data/npy_connectome")
+output_p = Path("/home/neuromod/ad_sz/data/edges_matrix.tsv")
+
+# Load connectomes from all subdirectories.
+connectomes = load_connectomes(base_directory_p)
+
+print(f"Number of connectomes loaded: {len(connectomes)}")
+for connectome in connectomes[:5]: # Print the shape of the first 5 connectomes as a check
+ print(connectome.shape)
+
+# Transform into the ComBat-ready matrix.
+edges_matrix = connectome_to_edges_matrix(connectomes)
+
+print(f"Edges matrix shape: {edges_matrix.shape}")
+
+# Save the edges_matrix to a .tsv file
+np.savetxt(output_p, edges_matrix, delimiter='\t')
+
+print(f"Saved edges matrix to {output_p}")
diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py
new file mode 100644
index 0000000..34f5f76
--- /dev/null
+++ b/scripts/prepare_input_data.py
@@ -0,0 +1,284 @@
+import h5py
+import numpy as np
+import pandas as pd
+from pathlib import Path
+
+dataset_configs = {
+ "cobre": {
+ "connectome_path": "cobre_connectome-0.4.1_MIST_afc",
+ "no_session": False,
+ "subject_folder": False,
+ },
+ "cimaq": {
+ "connectome_path": "cimaq_connectome-0.4.1_MIST_afc",
+ "no_session": False,
+ "subject_folder": True,
+ },
+ "hcpep": {
+ "connectome_path": "hcp-ep_connectome-0.4.1",
+ "no_session": False,
+ "subject_folder": True,
+ },
+ "ds000030": {
+ "connectome_path": "ds000030_connectomes-0.4.1",
+ "no_session": True,
+ "subject_folder": False,
+ },
+ "oasis3": {
+ "connectome_path": "oasis3_connectomes-0.4.1",
+ "no_session": False,
+ "subject_folder": True,
+ },
+ "srpbs": {
+ "connectome_path": "srpbs_connectome-0.4.1",
+ "no_session": False,
+ "subject_folder": True,
+ },
+ "adni": {
+ "connectome_path": "adni_connectome-0.4.1_MIST_afc",
+ "no_session": False,
+ "subject_folder": True,
+ },
+}
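+# Flags above, as used below: "no_session" means the dataset's HDF5 group paths have no ses-
+# level; "subject_folder" means connectomes are stored one .h5 file per subject rather than in a
+# single dataset-level .h5 file.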
+
+
+def load_connectome(file, participant_id, session, identifier, no_session):
+ # Path construction within .h5 file
+ dataset_path = f"sub-{participant_id}"
+
+ if not no_session:
+ dataset_path += f"/ses-{session}"
+
+ dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome"
+ dataset = file.get(dataset_path)
+
+ if dataset is not None:
+ return dataset[...]
+ else:
+ print(f"Connectome for {identifier} not found")
+ return None
+
+
+def save_npy_connectome(connectome_npy, output_p, identifier):
+ # Not using this function currently
+ if connectome_npy is not None:
+ output_file_p = output_p / f"{identifier}.npy"
+ np.save(output_file_p, connectome_npy)
+ print(f"Saved data for {identifier}")
+ else:
+ print(f"Connectome for {identifier} not found")
+
+
+def connectome_to_edge_matrix(connectome):
+ """
+ Transform a single connectome into an edge matrix.
+ Extract the upper triangular part of the connectome, including the diagonal.
+ """
+ # Get the upper triangular indices including the diagonal
+ row_indices, col_indices = np.triu_indices_from(connectome)
+
+ # Extract the values at these indices
+ edge_matrix = connectome[row_indices, col_indices]
+
+ return edge_matrix
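+
+# Shape sketch (assuming 64-ROI MIST connectomes): np.triu_indices_from keeps the diagonal, so
+# each participant yields 64 * 65 // 2 = 2080 values.
+#   >>> connectome_to_edge_matrix(np.eye(64)).shape
+#   (2080,)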
+
+
+def process_connectomes(connectomes_by_participant):
+ """
+ Apply Fisher z transform, compute the global signal and convert to an edge matrix. For multiple connectomes per participant, connectomes are averaged first.
+ """
+ edges_matrix_dict = {}
+ global_signal_dict = {}
+ for participant_id, connectomes in connectomes_by_participant.items():
+ # Compute the mean connectome
+ mean_connectome = np.mean(connectomes, axis=0)
+ # Apply Fisher Z transformation
+ transformed_connectome = np.arctanh(mean_connectome)
+ # Compute global signal (mean of the Z values)
+ global_signal = np.mean(transformed_connectome)
+ global_signal_dict[participant_id] = global_signal
+        # Convert connectome to an edge matrix for ComBat
+ edges_matrix = connectome_to_edge_matrix(transformed_connectome)
+ edges_matrix_dict[participant_id] = edges_matrix
+
+ return edges_matrix_dict, global_signal_dict
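+
+# Caveat (assumption about the stored connectomes): if these are Pearson correlation matrices with
+# ones on the diagonal, np.arctanh maps those ones to inf, which then propagates into the global
+# signal and the edge matrix; excluding the diagonal before the transform would avoid this.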
+
+
+def matrix_dict_to_df(edges_matrix_dict):
+ # Create a df from the edge matrices
+ # Convert dictionary values to a 2D numpy array
+ edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values()))
+ participants = list(edges_matrix_dict.keys())
+ edges_df = pd.DataFrame(edge_matrix_combined, columns=participants)
+
+ # Transpose so suitable for ComBat
+ edges_df = edges_df.T
+
+ return edges_df
+
+
+def process_datasets(conn_p, df, dataset_name):
+ config = dataset_configs[dataset_name]
+ base_dir = conn_p / config["connectome_path"]
+
+ # Filter the df for required scans for the dataset
+ filtered_df = df[df["dataset"] == dataset_name]
+ valid_rows = [] # To store rows where the connectome was found
+
+ connectomes_by_participant = {}
+ for index, row in filtered_df.iterrows():
+ participant_id = row["participant_id"]
+ identifier = row["identifier"]
+ session = None if config["no_session"] else row.get("ses", None)
+
+ # Adjust file path based on whether dataset uses a subject folder
+ if config["subject_folder"]:
+ file_path = (
+ base_dir
+ / participant_id
+ / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5"
+ )
+ else:
+ file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5"
+ try:
+ with h5py.File(file_path, "r") as file:
+ connectome = load_connectome(
+ file, participant_id, session, identifier, config["no_session"]
+ )
+ if connectome is not None:
+ # Append the connectome to the participant's list in the dictionary
+ if participant_id not in connectomes_by_participant:
+ connectomes_by_participant[participant_id] = []
+ connectomes_by_participant[participant_id].append(connectome)
+
+ # Add the row to valid_rows if the connectome is found
+ valid_rows.append(row)
+
+ except FileNotFoundError:
+ print(f"File not found: {file_path}")
+
+    edges_df = pd.DataFrame()
+    global_signal_dict = {}  # Ensure this is defined even if no connectomes are found
+ # Process connectomes
+ if connectomes_by_participant:
+ edges_matrix_dict, global_signal_dict = process_connectomes(
+ connectomes_by_participant
+ )
+ edges_df = matrix_dict_to_df(edges_matrix_dict)
+
+ # Convert the list of valid rows to a df
+ valid_df = pd.DataFrame(valid_rows)
+
+ return edges_df, valid_df, global_signal_dict
+
+
+def create_pheno_df(valid_df, global_signal_dict):
+ # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case)
+ aggregation_functions = {
+ "sex": "first",
+ "age": "first",
+ "diagnosis": "first",
+ "site": "first",
+ "dataset": "first",
+ "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans
+ "group": "first",
+ }
+ pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions)
+ # Add global signal to pheno_df
+ pheno_df["mean_conn"] = pheno_df.index.map(global_signal_dict)
+ return pheno_df
+
+
+def create_covariates_df_old(valid_df):
+ # Group by participant_id and select the first entry for each group
+ first_entries = valid_df.groupby("participant_id").first()
+
+ # Extract the relevant covariates
+ covariates_df = first_entries[["site", "sex", "age", "diagnosis"]]
+
+ return covariates_df
+
+
+def one_hot_encode_column(df, column_name):
+ dummies = pd.get_dummies(df[column_name], prefix=column_name)
+ df = pd.concat([df, dummies], axis=1)
+ df.drop(column_name, axis=1, inplace=True)
+ return df
+
+
+def one_hot_encode_column_no_prefix(df, column_name):
+ dummies = pd.get_dummies(df[column_name])
+ df = pd.concat([df, dummies], axis=1)
+ df.drop(column_name, axis=1, inplace=True)
+ return df
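+
+
+# Illustrative example (values assumed, not read from the phenotype file): if
+# the 'sex' column holds 'female'/'male' and 'diagnosis' holds 'ADD', 'CON',
+# 'MCI', 'SCHZ', one_hot_encode_column yields the columns used by
+# create_covariates_df below, e.g.
+#   one_hot_encode_column(pd.DataFrame({"sex": ["male", "female"]}), "sex").columns
+#   # -> ['sex_female', 'sex_male']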
+
+
+def create_covariates_df(pheno_df):
+ # Extract the relevant covariates from pheno_df
+ covariates_df = pheno_df[
+ [
+ "site",
+ "sex_male",
+ "sex_female",
+ "age",
+ "diagnosis_MCI",
+ "diagnosis_ADD",
+ "diagnosis_CON",
+ "diagnosis_SCHZ",
+ ]
+ ] # split CON into controls from AD and SCHZ datasets?
+
+ return covariates_df
+
+
+if __name__ == "__main__":
+ conn_p = Path("/home/nclarke/scratch")
+ output_p = Path("/home/nclarke/ad_sz/data/input_data")
+ df = pd.read_csv("/home/nclarke/ad_sz/data/final_qc_pheno.tsv", sep="\t")
+
+    # Convert ADNI sessions to strings so file paths can be constructed correctly
+ mask = df["dataset"] == "adni"
+ df.loc[mask, "ses"] = pd.to_datetime(df.loc[mask, "ses"]).dt.strftime("%Y%m%d")
+
+ if not output_p.exists():
+ output_p.mkdir(parents=True, exist_ok=True)
+
+ all_edges = []
+ all_pheno = []
+ for dataset in dataset_configs:
+ edges_df, valid_df, global_signal_dict = process_datasets(conn_p, df, dataset)
+ pheno_df = create_pheno_df(valid_df, global_signal_dict)
+
+ # Collect data per dataset
+ all_edges.append(edges_df)
+ all_pheno.append(pheno_df)
+
+ # Concatenate data across datasets
+ combined_edges_df = pd.concat(all_edges)
+ combined_pheno_df = pd.concat(all_pheno)
+
+ # One-hot encode columns
+ combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex")
+ combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis")
+ combined_pheno_df = one_hot_encode_column(combined_pheno_df, "group")
+
+ # Create covariates_df from pheno_df
+ covariates_df = create_covariates_df(combined_pheno_df)
+
+    # Upper-case column names, required for ComBat etc.
+ covariates_df.columns = [x.upper() for x in covariates_df.columns]
+ combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns]
+
+ print("Shape of combined_edges_df = ", combined_edges_df.shape)
+ print("Shape of covariates_df = ", covariates_df.shape)
+ print("Shape of combined_pheno_df = ", combined_pheno_df.shape)
+
+ # Ensure order of data is identical and output
+ combined_edges_df.sort_index().to_csv(
+ output_p / "combat_edges.tsv", sep="\t", index=True
+ )
+ covariates_df.sort_index().to_csv(
+ output_p / "combat_covariates.tsv", sep="\t", index=True
+ )
+ combined_pheno_df.sort_index().to_csv(output_p / "pheno.tsv", sep="\t", index=True)
+
+ print(f"Saved edges matrix, covariates and pheno to {output_p}")
diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py
new file mode 100644
index 0000000..05a1e7e
--- /dev/null
+++ b/scripts/significance_corr.py
@@ -0,0 +1,265 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import util
+import time
+import os
+import itertools
+from statsmodels.stats.multitest import multipletests
+from scipy.stats import pearsonr
+from argparse import ArgumentParser
+
+
+# SEBASTIEN URCHS
+def p_permut(empirical_value, permutation_values):
+ n_permutation = len(permutation_values)
+ if empirical_value >= 0:
+ return (np.sum(permutation_values > empirical_value) + 1) / (n_permutation + 1)
+ return (np.sum(permutation_values < empirical_value) + 1) / (n_permutation + 1)
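+
+
+# Illustrative example (not part of the pipeline): with 5000 permutation values,
+# the smallest attainable p-value is 1 / 5001 (~2e-4); e.g. if the empirical
+# correlation exceeds every null correlation:
+#   p_permut(0.8, np.random.uniform(-0.2, 0.2, 5000))  # -> 1 / 5001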
+
+
+def filter_fdr(df, contrasts):
+ df_filtered = df[
+ (df["pair0"].isin(contrasts)) & (df["pair1"].isin(contrasts))
+ ].copy()
+ _, fdr, _, _ = multipletests(df_filtered["pval"], method="fdr_bh")
+ df_filtered["fdr_filtered"] = fdr
+ return df_filtered
+
+
+def mat_form(df, contrasts, value="betamap_corr"):
+ n = len(contrasts)
+ d = dict(zip(contrasts, range(n)))
+ mat = np.zeros((n, n))
+ for c in contrasts:
+ # fill out vertical strip of mat
+ for i in range(n):
+ if i == d[c]:
+ val = 1
+ else:
+ val = df[
+ ((df["pair0"] == c) | (df["pair1"] == c))
+ & ((df["pair0"] == contrasts[i]) | (df["pair1"] == contrasts[i]))
+                ][value].iloc[0]
+ mat[i, d[c]] = val
+ mat[d[c], i] = val
+ return pd.DataFrame(mat, columns=contrasts, index=contrasts)
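+
+
+# Example (illustrative): with contrasts = ["mci_neg", "mci_pos", "sz"],
+# mat_form returns a symmetric 3 x 3 DataFrame with 1 on the diagonal and the
+# requested value (e.g. "betamap_corr") for each off-diagonal pair of contrasts.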
+
+
+def make_matrices(df, contrasts, fdr="fdr_filtered"):
+ "Param fdr can be set to 'fdr_filtered': FDR is performed using the pvalues only from the chosen contrasts"
+ " or 'fdr': values taken from FDR performed on full set of 42 contrasts"
+ if fdr == "fdr_filtered":
+ df = filter_fdr(df, contrasts)
+ mat_corr = mat_form(df, contrasts, value="betamap_corr")
+ mat_pval = mat_form(df, contrasts, value="pval")
+ mat_fdr = mat_form(df, contrasts, value=fdr)
+ return mat_corr, mat_pval, mat_fdr
+
+
+def get_corr_dist(cases, nulls, path_out, tag="wholeconn"):
+ # For each unique pair, between the null maps.
+ n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+ corr = np.zeros((n_pairs, 5000))
+
+ print(
+ "Getting correlation between 5000 null maps for {} unique pairs for {} cases...".format(
+ n_pairs, len(cases)
+ )
+ )
+ pair = []
+ l = 0
+ for i in itertools.combinations(cases, 2):
+ for j in range(5000):
+ corr[l, j] = pearsonr(
+ nulls.loc[i[0]].values[j, :], nulls.loc[i[1]].values[j, :]
+ )[0]
+
+ pair.append(i)
+ if l % 50 == 0:
+ print("{}/{}".format(l, n_pairs))
+ l = l + 1
+
+ df = pd.DataFrame(corr)
+ df["pair"] = pair
+ df.to_csv(os.path.join(path_out, "correlation_dist_{}.csv".format(tag)))
+ return df
+
+
+def get_corr(cases, betas, path_out, tag="wholeconn"):
+    # For each unique pair of cases, compute the correlation between betamaps.
+    # Use standardized betas here (as in the rest of the paper).
+ n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+ corr = np.zeros(n_pairs)
+
+ print(
+ "Getting correlation between betamaps for {} unique pairs for {} cases...".format(
+ n_pairs, len(cases)
+ )
+ )
+ pair = []
+ l = 0
+ for i in itertools.combinations(cases, 2):
+ corr[l] = pearsonr(betas.loc[i[0]].values, betas.loc[i[1]].values)[0]
+ l = l + 1
+ pair.append(i)
+ df = pd.DataFrame(corr)
+ df["pair"] = pair
+ df.to_csv(os.path.join(path_out, "correlation_betas_{}.csv".format(tag)))
+ return df
+
+
+def get_corr_pval(maps, nulls, betas, path_out, tag="wholeconn"):
+ df = get_corr_dist(maps, nulls, path_out, tag=tag)
+ df_bb = get_corr(maps, betas, path_out, tag=tag)
+
+ df_bb = df_bb.rename(columns={0: "betamap_corr"})
+ df_master = df_bb.merge(df, on="pair")
+
+ print("Calculating pvals...")
+ # CALCULATE PVALS
+ pval = []
+ for i in df_master.index:
+ p = p_permut(df_master.loc[i, "betamap_corr"], df_master[range(5000)].loc[i])
+ pval.append(p)
+ df_master["pval"] = pval
+
+ # ADD LABELS
+    pair0 = [p[0] for p in df_master["pair"].tolist()]
+    pair1 = [p[1] for p in df_master["pair"].tolist()]
+ df_master["pair0"] = pair0
+ df_master["pair1"] = pair1
+
+ df_compact = df_master[["pair0", "pair1", "betamap_corr", "pval"]]
+ df_compact.to_csv(
+ os.path.join(path_out, "corr_pval_null_v_null_{}.csv".format(tag))
+ )
+
+ return df_compact
+
+
+def _make_region_masks(row):
+ mask = np.tri(64, k=0, dtype=bool)
+
+    # Get the parcel number from the first column and subtract 1 (zero-based)
+    parcel_num = int(row.iloc[0]) - 1
+
+    # Get the parcel name from the second column
+    parcel_name = str(row.iloc[1]).upper().replace(" ", "_")
+
+ # Generate the mask
+ parcel = np.zeros((64, 64), bool)
+ parcel[:, parcel_num] = True
+ parcel_mask = parcel + np.transpose(parcel)
+ parcel_mask = np.tril(parcel_mask)
+ parcel_mask = parcel_mask[mask]
+
+ return (parcel_mask, parcel_name)
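+
+
+# Note (illustrative): each region mask is a boolean vector over the 2080
+# lower-triangle edges (64 * 65 / 2) that selects the 64 edges involving the
+# given MIST-64 parcel, including its diagonal entry.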
+
+
+if __name__ == "__main__":
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--n_path_mc", help="path to mc null models dir", dest="n_path_mc"
+ )
+ parser.add_argument("--b_path_mc", help="path to mc betamaps dir", dest="b_path_mc")
+ parser.add_argument("--path_out", help="path to output directory", dest="path_out")
+ parser.add_argument(
+ "--path_corr", help="path to corr dir", dest="path_corr", default=None
+ )
+ parser.add_argument(
+        "--path_parcellation", help="path to MIST_64.csv", dest="path_parcellation"
+ )
+ args = parser.parse_args()
+
+ n_path_mc = os.path.join(args.n_path_mc, "null_model_{}_vs_control_mc.npy")
+ # cont_n_path_mc = os.path.join(args.n_path_mc,'{}_null_model_mc.npy')
+ b_path_mc = os.path.join(args.b_path_mc, "{}_vs_con_mean.csv")
+ # cont_b_path_mc = os.path.join(args.b_path_mc,'cont_{}_results_mc.csv')
+ path_out = args.path_out
+ path_corr = args.path_corr
+
+ cases = ["mci_neg", "mci_pos", "sz"]
+
+ maps = cases # + cont
+
+ #############
+ # LOAD DATA #
+ #############
+ null = []
+ beta_std = []
+
+ for c in cases:
+ null.append(pd.DataFrame(np.load(n_path_mc.format(c))))
+ beta_std.append(pd.read_csv(b_path_mc.format(c))["stand_betas"].values)
+
+ betamaps_std = pd.DataFrame(beta_std, index=maps)
+ nullmodels = pd.concat(null, keys=maps)
+
+ ####################
+ # WHOLE CONNECTOME #
+ ####################
+ print("Creating correlation distributions for whole connectome...")
+ df_wc = get_corr_pval(maps, nullmodels, betamaps_std, path_out, tag="wholeconn")
+
+ # make matrices
+ print("Preparing correlation matrices...")
+ subset = ["mci_neg", "mci_pos", "sz"]
+
+ corr, pval, fdr = make_matrices(df_wc, subset, fdr="fdr_filtered")
+
+ corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv"))
+ pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv"))
+ fdr.to_csv(
+ os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv")
+ )
+
+ ####################
+ # REGIONS #
+ ####################
+ df_MIST = pd.read_csv(args.path_parcellation, delimiter=";")
+
+ for index, row in df_MIST.iterrows():
+ # create region mask
+ parcel_mask, parcel_name = _make_region_masks(row)
+
+ # filter maps
+ null_parcel = [n.transpose()[parcel_mask].transpose() for n in null]
+ beta_std_parcel = [b[parcel_mask] for b in beta_std]
+
+ betamaps_std_parcel = pd.DataFrame(beta_std_parcel, index=maps)
+ nullmodels_parcel = pd.concat(null_parcel, keys=maps)
+
+ print(f"Creating correlation distributions for {parcel_name}...")
+ df_parcel = get_corr_pval(
+ maps, nullmodels_parcel, betamaps_std_parcel, path_out, tag=parcel_name
+ )
+
+ # make matrices
+ corr_parcel, pval_parcel, fdr_filtered_parcel = make_matrices(
+ df_parcel, subset, fdr="fdr_filtered"
+ )
+
+        # FDR-correct the p-value matrix (flatten, correct, reshape back)
+        _, pval_parcel_fdr_flat, _, _ = multipletests(
+            pval_parcel.values.flatten(), method="fdr_bh"
+        )
+        pval_parcel_fdr_corrected = pd.DataFrame(
+            pval_parcel_fdr_flat.reshape(pval_parcel.shape),
+            index=pval_parcel.index,
+            columns=pval_parcel.columns,
+        )
+
+        # TO DO - see if I need to correct fdr filtered, I think so?
+        if (pval_parcel < 0.05).any().any():
+            corr_parcel.to_csv(
+                os.path.join(path_out, f"FC_corr_{parcel_name}_mc_null_v_null.csv")
+            )
+            pval_parcel.to_csv(
+                os.path.join(path_out, f"FC_corr_pval_{parcel_name}_mc_null_v_null.csv")
+            )
+            pval_parcel_fdr_corrected.to_csv(
+                os.path.join(
+                    path_out,
+                    f"FC_corr_pval_fdr_corrected_{parcel_name}_mc_null_v_null.csv",
+                )
+            )
+            fdr_filtered_parcel.to_csv(
+                os.path.join(
+                    path_out, f"FC_corr_fdr_filtered_{parcel_name}_mc_null_v_null.csv"
+                )
+            )
+
+ print("Done!")