diff --git a/.gitignore b/.gitignore index 0d05551..918137f 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ venv/ ENV/ env.bak/ venv.bak/ -cwas_neuro_env/ +ad_sz_env/ # Spyder project settings .spyderproject diff --git a/scripts/baseline_biomarkers.py b/scripts/baseline_biomarkers.py deleted file mode 100644 index e7ea469..0000000 --- a/scripts/baseline_biomarkers.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import numpy as np - -from functools import reduce -from pathlib import Path - -# paths to files - -adni_data = Path("__file__").resolve().parent / 'data' / 'adni_spreadsheet.csv' -tau_data = Path("__file__").resolve().parent / 'data' / 'UCBERKELEYAV1451_04_26_22.csv' -other_biomarker_data = Path("__file__").resolve().parent / 'data' / 'ADNIMERGE.csv' - -# functions - -def get_baselines(file): - - # load baseline phenotypic data - pheno = pd.read_csv(file, index_col=0, header=0) - - # keep only the variables of interest - pheno = pheno.filter(['Subject ID','Phase','Sex','Research Group', 'Visit','Study Date','Age'], axis=1) - - # convert 'study date' to 'session' in datetime format, to match other spreadsheets - pheno['session'] = pd.to_datetime(pheno['Study Date']) - - # pull out only the subject id and asign it to the index - pheno_subj = [] - for i in pheno['Subject ID']: - subj = i.split('_')[2].lstrip("0") # remove leading zeros since it won't match ADNI IDs - pheno_subj.append(subj) - - pheno.index = pheno_subj - pheno.rename_axis('RID',inplace=True) - pheno.index = pheno.index.astype('int64') - - # separate patients and controls, because (in theory) we can use any control visit as baseline, but - # for patients we want their actual baseline data - patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC'] - patient_df = pheno[pheno['Research Group'].isin(patient_diagnoses)] # df of patient diagnoses - - control_df = pheno.loc[pheno['Research Group'] == 'CN'] # df of control diagnoses - - # I think these visits are acceptable as baseline data, i.e. 
actual baseline +/-3 months, excluding - # any initial visits where patient continued from a previous phase - bl_visits = ['ADNI Screening','ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt', - 'ADNIGO Month 3 MRI','ADNIGO Screening MRI'] - - patient_df_bl = patient_df[patient_df['Visit'].isin(bl_visits)] - - # rejoin the patients to the controls - new_df = pd.concat([control_df,patient_df_bl]) - - # select the earliest visit available for each participant - new_df.sort_values(['Subject ID', 'Age'], inplace=True) # sort by age - baseline_df = new_df[~new_df.duplicated(['Subject ID'], keep='first')] # select the first row - - # sort df by index - baseline_df.sort_values(by='RID', inplace=True) - - # calculate window for acceptable biomarker data, currently +- 12months - baseline_df.loc[:,('date_lower')] = baseline_df.loc[:,('session')] - pd.DateOffset(months=18) - baseline_df.loc[:,('date_upper')] = baseline_df.loc[:,('session')] + pd.DateOffset(months=18) - - return (baseline_df) - -def load_biomarker_df(biomarker_data): - - # load data - biomarker_df = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False) - - # convert examdate to datetime - biomarker_df['EXAMDATE'] = pd.to_datetime(biomarker_df['EXAMDATE']) - - # sort df by index and date - biomarker_df.sort_values(by=['RID', 'EXAMDATE'],inplace=True) - - # create column from index (useful for later functions) - biomarker_df['RID'] = biomarker_df.index - - return (biomarker_df) - -def match_baselines_biomarker(biomarker_df, baseline_df, biomarker): - - df = pd.DataFrame(columns=['RID',biomarker,biomarker+'_EXAMDATE']) #create df - common_ids = biomarker_df.index.intersection(baseline_df.index) #find ids common to the biomarker and baseline dfs - biomarker_df = biomarker_df.set_index('EXAMDATE') #reindex, needed to use 'nearest' method - - for rid in common_ids: - participant_df = biomarker_df[(biomarker_df['RID'] == rid)] #create df of all participants results - participant_df = participant_df.dropna(subset=[biomarker]) #drop NaNs, since we only want nearest date with result available - - baseline = baseline_df.loc[rid] #create df of participants baseline data - session = baseline['session'] #participant's baseline date - - if participant_df.empty: - pass - else: - idx_nearest = participant_df.index.get_loc(session, method='nearest') #find the index of the closest test date to session - nearest_date = participant_df.index[idx_nearest] #which date is it - nearest_result = participant_df[biomarker][idx_nearest] #find the biomarker result associated with closest date - - df.loc[len(df)] = [rid,nearest_result,nearest_date] #add to df - - df = df.set_index('RID') - - return (df) - -def check_visit_window(biomarker_df, baseline_df, biomarker): - - ''' - Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline - window, and if not replace with NaN. Drop unwanted columns and return biomarker info again, ready to merge. 
- ''' - - # create new df, merging biomarker data into baseline df - baseline_bio = baseline_df.join(biomarker_df) - - # create mask of date range, between lower and upper acceptable dates - mask_date_range = (baseline_bio[biomarker+'_EXAMDATE'] > baseline_bio['date_lower']) & (baseline_bio[biomarker+'_EXAMDATE'] < baseline_bio['date_upper']) - - # fill values collected outside date range with NaN - baseline_bio[biomarker][~mask_date_range] = np.nan - - cols_to_drop = ['Subject ID', - 'Phase', - 'Sex', - 'Research Group', - 'Visit', - 'Study Date', - 'Age', - 'session', - 'date_lower', - 'date_upper'] - - baseline_bio = baseline_bio.drop(cols_to_drop, axis=1) #drop unwanted columns - - return (baseline_bio) - -def get_biomarker(biomarker_df, baseline_df, biomarker): - - #runs two functions to match the baseline to closest biomarker result, and then checks if it falls in the window - - find_nearest_biomarker = match_baselines_biomarker(biomarker_df, baseline_df, biomarker) - window_checked = check_visit_window(find_nearest_biomarker, baseline_df, biomarker) - - return (window_checked) - -if __name__ == '__main__': - - #get baselines from adni spreadsheet - baseline_df = get_baselines(adni_data) - - # load biomarker files - tau_df = load_biomarker_df(tau_data) - other_biomarkers_df = load_biomarker_df(other_biomarker_data) - - # create one df per biomarker with closest result within a one year window of participant baseline - tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR') - abeta = get_biomarker(other_biomarkers_df, baseline_df, 'ABETA') - ptau = get_biomarker(other_biomarkers_df, baseline_df, 'PTAU') - av45 = get_biomarker(other_biomarkers_df, baseline_df, 'AV45') - fbb = get_biomarker(other_biomarkers_df, baseline_df, 'FBB') - - # create list of dataframes (baseline data and all individual biomarkers) - data_frames = [baseline_df, tau, abeta, ptau, av45, fbb] - - # merge dataframes - master_biomarkers = reduce(lambda left, right: - pd.merge_asof(left, right, left_index=True, right_index=True), - data_frames) - - # save baseline and matched biomarker data in data directory - master_biomarkers.to_csv(Path("__file__").resolve().parent / 'data' / 'master_biomarkers.csv') diff --git a/scripts/biomarker_thresholding.ipynb b/scripts/biomarker_thresholding.ipynb deleted file mode 100644 index dedac85..0000000 --- a/scripts/biomarker_thresholding.ipynb +++ /dev/null @@ -1,294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "id": "d213377a", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9a4034fa", - "metadata": {}, - "outputs": [], - "source": [ - "# set thresholds and calculate 5% margin\n", - "\n", - "# CSF biomarkers\n", - "ab42_threshold = 980\n", - "ab42_margin = (5*ab42_threshold)/100\n", - "ab42_upper = ab42_threshold+ab42_margin\n", - "ab42_lower = ab42_threshold-ab42_margin\n", - "\n", - "ptau_threshold = 23\n", - "ptau_margin = (5*ptau_threshold)/100\n", - "ptau_upper = ptau_threshold+ptau_margin\n", - "ptau_lower = ptau_threshold-ptau_margin\n", - "\n", - "# PET biomarkers\n", - "fbb_threshold = 1.08\n", - "fbb_margin = (5*fbb_threshold)/100\n", - "fbb_upper = fbb_threshold+fbb_margin\n", - "fbb_lower = fbb_threshold-fbb_margin\n", - "\n", - "av45_threshold = 1.08\n", - "av45_margin = (5*av45_threshold)/100\n", - "av45_upper = av45_threshold+av45_margin\n", - "av45_lower = 
av45_threshold-av45_margin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a913b27a", - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_tau_cutoff(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for all participants. Select controls only. Find those negative for amyloid\n", - " PET (either av45 or FBB). For those who hav flortaucipir results, calculate average. Calculate +2SD, this is\n", - " the flortaucipir cutoff.\n", - " \n", - " '''\n", - " \n", - " controls = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " controls_Apet_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower)])\n", - " \n", - " tau_mean = controls_Apet_neg['META_ROI'].mean()\n", - " tau_sd = controls_Apet_neg['META_ROI'].std()\n", - " tau_cutoff = tau_mean+(tau_sd*2)\n", - " \n", - " return (tau_cutoff) " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "bfa2dd1c", - "metadata": {}, - "outputs": [], - "source": [ - "# paths to files\n", - "biomarker_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'master_biomarkers.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "837cb329", - "metadata": {}, - "outputs": [], - "source": [ - "# load baseline data\n", - "df = pd.read_csv(biomarker_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "c75a9dee", - "metadata": {}, - "outputs": [], - "source": [ - "# replace some biomarker data which has been coded as > or <\n", - "df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})\n", - "df = df.replace({'PTAU': {'<8': 7, '>120':121}})\n", - "\n", - "df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "b2d48836", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate study specific cutoff for tau PET meta temporal ROI \n", - "tau_threshold = calculate_tau_cutoff(df)\n", - "tau_margin = (5*tau_threshold)/100\n", - "tau_upper = tau_threshold+tau_margin\n", - "tau_lower = tau_threshold-tau_margin" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "60d0393b", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_negative_controls(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as controls, and return a df excluding those\n", - " who have a positive result on any one of five markers (or no biomarker information).\n", - " \n", - " To get more info on which biomarkers are positive, uncomment last section. 
But this is only to give an \n", - " idea, since some may be positive on more than one marker.\n", - " '''\n", - " \n", - " controls_all = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower) | \n", - " (controls['ABETA'] > ab42_upper) | \n", - " (controls['PTAU'] < ptau_lower) | \n", - " (controls['META_ROI'] < tau_lower)])\n", - " \n", - " n_controls = len(controls_all)\n", - " no_markers = n_controls-len(controls)\n", - " n_controls_negative = len(controls_biomarker_neg)\n", - " n_dropped = (n_controls-no_markers) - n_controls_negative\n", - " \n", - " print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(\n", - " n_controls,\n", - " no_markers, \n", - " n_dropped,\n", - " n_controls_negative))\n", - " \n", - " '''\n", - " controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)\n", - " print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))\n", - " print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))\n", - " print ('{} positive/borderline for PET tau (meta roi).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_ROI'] > tau_lower)])))\n", - " '''\n", - " return (controls_biomarker_neg)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "b3a95289", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_positive_patients(df, diagnoses):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as a patient (specifying which), \n", - " and return a df including only those who have a positive result on any one of five markers. \n", - " Exclude any with no biomarker information.\n", - " \n", - " To get more info on which biomarkers are negative, uncomment last section. But this is only to give an \n", - " idea, since some may be negative on more than one marker.\n", - " '''\n", - " \n", - " patients_all = df.loc[df['Research Group'].isin(diagnoses)]\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | \n", - " (patients['AV45'] > av45_upper) | \n", - " (patients['ABETA'] < ab42_lower) | \n", - " (patients['PTAU'] > ptau_upper) |\n", - " (patients['META_ROI'] > tau_upper)])\n", - "\n", - " n_patients = len(patients_all)\n", - " no_markers = n_patients-len(patients)\n", - " n_patients_positive = len(patients_biomarker_pos)\n", - " n_dropped = (n_patients-no_markers) - n_patients_positive\n", - " \n", - " print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. 
{} remaining.'.format(\n", - " n_patients,\n", - " diagnoses,\n", - " no_markers,\n", - " n_dropped,\n", - " n_patients_positive))\n", - " \n", - " '''\n", - " patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)\n", - " \n", - " print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))\n", - " print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))\n", - " print ('{} negative/borderline for PET tau (meta roi).'.format(len(patients_neg[(patients_neg['META_ROI'] < tau_upper)])))\n", - " '''\n", - " return (patients_biomarker_pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "0d123b9f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Of 362 controls, 44 dropped due to no biomarker data and 60 due to positive/borderline biomarkers. 258 remaining.\n", - "Of 58 ['AD'], 7 dropped due to no biomarker data and 3 due to negative/borderline biomarkers. 48 remaining.\n", - "Of 165 ['EMCI', 'LMCI', 'MCI'], 25 dropped due to no biomarker data and 54 due to negative/borderline biomarkers. 86 remaining.\n" - ] - } - ], - "source": [ - "# run functions to get negative controls and positive patients\n", - "controls_bio_neg = biomarker_negative_controls(df)\n", - "ad_bio_positive = biomarker_positive_patients(df, ['AD'])\n", - "mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI','MCI'])" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "979848a1", - "metadata": {}, - "outputs": [], - "source": [ - "master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "cf508b49", - "metadata": {}, - "outputs": [], - "source": [ - "master_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/calculate_meta_roi.ipynb b/scripts/calculate_meta_roi.ipynb deleted file mode 100644 index b3cb980..0000000 --- a/scripts/calculate_meta_roi.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3881d9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3fdee605", - "metadata": {}, - "outputs": [], - "source": [ - "tau_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7b6b5db3", - "metadata": {}, - "outputs": [], - "source": [ - "# load tau PET data\n", - "df = pd.read_csv(tau_data, index_col=0, header=0)" - ] - }, - { - "cell_type": 
"code", - "execution_count": 10, - "id": "83a462f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_PUTAMEN_VOLUME RIGHT_THALAMUS_PROPER_SUVR \\\n", - "RID ... \n", - "21 ... 4169 1.5224 \n", - "31 ... 5766 1.1421 \n", - "31 ... 5537 1.0801 \n", - "56 ... 4026 1.3396 \n", - "56 ... 4145 1.3618 \n", - "\n", - " RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID \n", - "21 5874 1.3860 \n", - "31 6016 1.2340 \n", - "31 5865 1.1644 \n", - "56 6601 1.3625 \n", - "56 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME update_stamp \n", - "RID \n", - "21 1.0615 2460 2022-04-27 15:31:20.0 \n", - "31 0.7641 26845 2022-04-27 15:31:20.0 \n", - "31 0.7805 28306 2022-04-27 15:31:20.0 \n", - "56 1.0556 1385 2022-04-27 15:31:20.0 \n", - "56 1.0850 1427 2022-04-27 15:31:20.0 \n", - "\n", - "[5 rows x 242 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "630a3d56", - "metadata": {}, - "outputs": [], - "source": [ - "result_cols = ['CTX_LH_ENTORHINAL_SUVR',\n", - "'CTX_RH_ENTORHINAL_SUVR',\n", - "'LEFT_AMYGDALA_SUVR',\n", - "'RIGHT_AMYGDALA_SUVR',\n", - "'CTX_LH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_RH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_LH_FUSIFORM_SUVR',\n", - "'CTX_RH_FUSIFORM_SUVR',\n", - "'CTX_LH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_RH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_LH_MIDDLETEMPORAL_SUVR',\n", - "'CTX_RH_MIDDLETEMPORAL_SUVR']" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "79cb3167", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID ... \n", - "21 ... 5874 1.3860 \n", - "31 ... 6016 1.2340 \n", - "31 ... 5865 1.1644 \n", - "56 ... 6601 1.3625 \n", - "56 ... 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME \\\n", - "RID \n", - "21 1.0615 2460 \n", - "31 0.7641 26845 \n", - "31 0.7805 28306 \n", - "56 1.0556 1385 \n", - "56 1.0850 1427 \n", - "\n", - " update_stamp meta_roi META_ROI \n", - "RID \n", - "21 2022-04-27 15:31:20.0 1.234958 1.234958 \n", - "31 2022-04-27 15:31:20.0 1.145025 1.145025 \n", - "31 2022-04-27 15:31:20.0 1.137825 1.137825 \n", - "56 2022-04-27 15:31:20.0 1.235883 1.235883 \n", - "56 2022-04-27 15:31:20.0 1.208933 1.208933 \n", - "\n", - "[5 rows x 244 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['META_ROI'] = df[result_cols].mean(axis=1)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "5aec7b55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " EXAMDATE META_ROI\n", - "RID \n", - "21 2018-02-02 1.234958\n", - "31 2018-04-24 1.145025\n", - "31 2019-04-23 1.137825\n", - "56 2018-02-20 1.235883\n", - "56 2019-01-10 1.208933" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_df = df[['EXAMDATE','META_ROI']].copy()\n", - "results_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5e52c415", - "metadata": {}, - "outputs": [], - "source": [ - "results_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'tau_meta_roi.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "663dc7b4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/cnvfc/stats.py b/scripts/cnvfc/stats.py new file mode 100644 index 0000000..3a4f28e --- /dev/null +++ b/scripts/cnvfc/stats.py @@ -0,0 +1,348 @@ +import sys +import time +import warnings +import scipy as sp +import numpy as np +import patsy as pat +import pandas as pd +from scipy import stats +import statsmodels.api as sm +#from .tools import conn2mat +from sklearn import linear_model as sln +from sklearn import preprocessing as skp +from sklearn import model_selection as skm +from statsmodels.sandbox.stats.multicomp import multipletests as stm + + +def make_weights(data, pattern): + if not data.shape[1:] == pattern.shape: + raise Exception(f'data and pattern shape mismatch: {data.shape[1:]}, {pattern.shape}') + n_nodes = pattern.shape[0] + n_data = data.shape[0] + weights = np.array([[np.corrcoef(data[data_id, node_id, :], + pattern[node_id, :])[0, 1] + for node_id in range(n_nodes)] + for data_id in range(n_data)]) + return weights + + +def categorical_enrichment(pheno, data, group, case=None, control=None, node_labels='node_name_not_specificed'): + sub_mask, cases_mask = find_subset(pheno, group, [case, control]) + sub_data = data[sub_mask, ...] 
+ n_data = sub_data.shape[1] + n_case = np.sum(cases_mask[case]) + n_control = np.sum(cases_mask[control]) + + results = {key: list() for key in ['node', 'U', 'p_value', + f'median_{group}_{case}', + f'median_{group}_{control}', + 'rank_biserial_correlation']} + # Conduct Mann-Whitney-U for each node region + for node_id in range(n_data): + u_right, p = sp.stats.mannwhitneyu(sub_data[cases_mask[case], node_id], sub_data[cases_mask[control], node_id], + alternative='two-sided') + u_left = n_case * n_control - u_right + u_min = np.min([u_left, u_right]) + # Compute the median for the case and control groups + median_case = np.median(sub_data[cases_mask[case], node_id]) + median_control = np.median(sub_data[cases_mask[control], node_id]) + # Compute rank biserial correlation + r_b = 1 - (2 * u_min) / (n_case * n_control) + # Determine if cases > controls or reverse + case_gt_con = u_right > u_min + if not case_gt_con: + r_b = -r_b + # Store the results + results['node'].append(node_id + 1) + results['U'].append(u_min) + results['p_value'].append(p) + results[f'median_{group}_{case}'].append(median_case) + results[f'median_{group}_{control}'].append(median_control) + results['rank_biserial_correlation'].append(r_b) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def continuous_enrichment(pheno, data, covariate, node_labels='node_name_not_specificed'): + sub_mask = find_subset(pheno, covariate) + sub_pheno = pheno.loc[sub_mask] + sub_data = data[sub_mask, :] + n_data = sub_data.shape[1] + + results = {key: list() for key in ['node', 'pearson_r', 'p_value']} + for node_id in range(n_data): + r, p = stats.pearsonr(sub_pheno[covariate], sub_data[:, node_id]) + # Store the results + results['node'].append(node_id + 1) + results['pearson_r'].append(r) + results['p_value'].append(p) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def find_subset(pheno, column, cases=None): + # TODO check pandas type input + subset_mask = np.array(~pheno[column].isnull()) + if cases is not None and not not cases: + all_cases = pheno.loc[subset_mask][column].unique() + try: + case_available = np.array([True if case in all_cases else False for case in cases]) + except TypeError as e: + raise Exception(f'the attribute "cases" needs to be iterable but is: {type(cases)}') from e + if not all(case_available): + if not any(case_available): + raise Exception(f'none of the requested cases of "{column}" are available') + else: + warnings.warn( + f'\nnot all requested cases of "{column}" are available: {list(zip(cases, case_available))}', + RuntimeWarning) + case_masks = np.array([pheno[column] == case for case in cases]) + subset_mask = np.any(case_masks, 0) + # Return the masked instances of the requested cases + cases_dict = {case: case_masks[idx][subset_mask] for idx, case in enumerate(cases)} + return subset_mask, cases_dict + else: + return subset_mask + + +def standardize(data, mask): + scaler = skp.StandardScaler(with_mean=False, with_std=True) + scaler.fit(data[mask, :]) + standardized_data = scaler.transform(data) + return 
standardized_data + + +def find_contrast(design_matrix, contrast): + # Find the contrast column + contrast_columns = [(col_id, col) for col_id, col in enumerate(design_matrix.columns) if f'{contrast}' in col] + if not len(contrast_columns) == 1: + raise Exception(f'There is no single factor that matches {contrast}: {(list(design_matrix.columns))}') + return contrast_columns + + +def fast_glm(data, design_matrix, contrast): + # Does not compute p-values but operates in parallel + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + glm = sln.LinearRegression(fit_intercept=False, normalize=False, n_jobs=-2) + res = glm.fit(design_matrix, data) + betas = res.coef_[:, contrast_id] + return betas + + +def glm(data, design_matrix, contrast): + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + n_data = data.shape[1] + + # Conduct the GLM + betas = np.zeros(shape=n_data) + pvals = np.zeros(shape=n_data) + for conn_id in range(n_data): + model = sm.OLS(data[:, conn_id], design_matrix) + results = model.fit() + betas[conn_id] = results.params[contrast_id] + pvals[conn_id] = results.pvalues[contrast_id] + + return betas, pvals + + +def glm_wrap_cc(conn, pheno, group, case, control, regressors='', report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = np.sum(sub_mask) + n_case = np.sum(case_masks[case]) + n_control = np.sum(case_masks[control]) + n_data = sub_conn.shape[1] + if report: + print(f'Selected sample based on group variable {group}.\n' + f'cases: {case} (n={n_case})\n' + f'controls: {control} (n={n_control})\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + f'standardized estimators are based on {group}=={control}') + + stand_conn = standardize(sub_conn, case_masks[control]) + + # Construct design matrix + if type(control) == str: + contrast = f'C({group}, Treatment("{control}"))' + else: + contrast = f'C({group}, Treatment({control}))' + formula = ' + '.join((regressors, contrast)) + dmat = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, dmat, group) + stand_betas, _ = glm(stand_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask = find_subset(pheno, contrast) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = sub_pheno.shape[0] + n_data = sub_conn.shape[1] + sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool)) + + if report: + print(f'Selected sample based on contrast variable {contrast}.\n' + f'Found {n_sub} subjects with no missing data for {contrast}\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + 
f'standardized estimators are based on all subjects with no missing data for {contrast}') + + formula = ' + '.join((regressors, contrast)) + design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, design_matrix, contrast) + stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def permutation_glm_contrast(conn, pheno, n_a, n_a_1, n_b, n_b_1, n_iter=5000): + # conn: connectome (n_sub, n_conn) + # pheno: pheno pandas table, same order as conn + # n_a / n_b: number of subs per group a / b + # n_a_1 / n_b_1: number of subs in the first contrast of group a / b + # n_iter: number of permutations + + # Some sanity checks because people can be stupid + if n_a_1 > n_a: + raise ValueError(f'n_a_1 ({n_a_1}) cannot be larger than n_a ({n_a})') + if n_b_1 > n_b: + raise ValueError(f'n_b_1 ({n_b_1}) cannot be larger than n_b ({n_b})') + + shsp = skm.ShuffleSplit(n_splits=n_iter, test_size=0.5) + pheno.reset_index(inplace=True, drop=True) + pheno.loc[:, 'group'] = 0 + n_subjects = pheno.shape[0] + beta = np.zeros((conn.shape[1], 2, n_iter)) + + start = time.time() + for fold_id, (group_a, group_b) in enumerate(shsp.split(np.ones(n_subjects))): + pheno_a = pheno.loc[group_a[:n_a]].copy() + pheno_b = pheno.loc[group_b[:n_b]].copy() + # Assign the groups + pheno_a.loc[group_a[:n_a_1], 'group'] = 1 + pheno_b.loc[group_b[:n_b_1], 'group'] = 1 + mat1 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_a, return_type='dataframe') + mat2 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_b, return_type='dataframe') + beta[:, 0, fold_id] = fast_glm(conn[group_a[:n_a], :], mat1, 'group') + beta[:, 1, fold_id] = fast_glm(conn[group_b[:n_b], :], mat2, 'group') + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. 
Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return beta + + +def contrast_correlation(beta, mask, mode='region'): + # mode: 'region' or 'global' + if mode == 'region': + n_iter = beta.shape[2] + n_seed = mask.shape[0] + corr = np.zeros((n_seed, n_iter)) + for fold_id in range(n_iter): + # Map the betas back to a matrix + beta1_mat = conn2mat(beta[:, 0, fold_id], mask) + beta2_mat = conn2mat(beta[:, 1, fold_id], mask) + # Compute the correlation + corr[:, fold_id] = np.array([np.corrcoef(beta1_mat[nid, :], beta2_mat[nid, :])[0, 1] for nid in range(64)]) + return corr + elif mode == 'global': + n_iter = beta.shape[2] + corr = np.array([np.corrcoef(beta[:, 0, fold_id], beta[:, 1, fold_id])[0, 1] for fold_id in range(n_iter)]) + return corr + else: + raise ValueError(f'mode {mode} is not configured!') + + +def permutation_glm(pheno, conn, group, case, control, regressors='', n_iter=1000, stand=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_pheno = pheno.loc[sub_mask, :] + sub_pheno.loc[:, 'group'] = '' + sub_conn = conn[sub_mask, :] + # Standardize the input data + if stand: + sub_conn = standardize(sub_conn, case_masks[control]) + n_conn = sub_conn.shape[1] + + n1 = np.sum(case_masks[case]) + n2 = np.sum(case_masks[control]) + r2 = n2 / (n1 + n2) + + splitter = skm.ShuffleSplit(n_splits=n_iter, test_size=r2, random_state=1) + + i_pheno = sub_pheno.copy() + group_id = list(i_pheno.columns).index('group') + + betas = np.zeros(shape=(n_iter, n_conn)) + start = time.time() + for fold_id, (train_id, test_id) in enumerate(splitter.split(sub_conn)): + n1_ids = train_id[:n1] + n2_ids = test_id[:n2] + i_pheno = sub_pheno.copy() + i_pheno['group'] = '' + i_pheno.iloc[n1_ids, group_id] = 'case' + i_pheno.iloc[n2_ids, group_id] = 'control' + table = glm_wrap_cc(sub_conn, i_pheno, 'group', 'case', 'control', regressors=regressors, + report=False, fast=True) + betas[fold_id, :] = table['betas'].values + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s per permutation)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return betas diff --git a/scripts/h5_to_npy.py b/scripts/h5_to_npy.py new file mode 100644 index 0000000..805bb0b --- /dev/null +++ b/scripts/h5_to_npy.py @@ -0,0 +1,173 @@ +#import os dont think I need this? +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +def load_connectome(file, participant_id, session, identifier, no_session): + # Construct the dataset path + if no_session: + dataset_path = f"sub-{participant_id}/{identifier}_atlas-MIST_desc-64_connectome" + else: + dataset_path = f"sub-{participant_id}/ses-{session}/{identifier}_atlas-MIST_desc-64_connectome" + + dataset = file.get(dataset_path) + + connectome_npy = None # Initialise to None in case missing + if dataset is not None: + connectome_npy = dataset[...] 
# Load the dataset into a NumPy array + return connectome_npy +def save_connectome(connectome_npy, output_p, identifier): + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") +def process_connectome_participant(row, base_dir, output_p): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + # Construct path to subject's HDF5 file + hdf5_file_p = base_dir / f"{participant_id}" / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + if hdf5_file_p.exists(): + with h5py.File(hdf5_file_p, "r") as file: + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session=False) # load_connectome requires no_session; the datasets routed here all include sessions + save_connectome(connectome_npy, output_p, identifier) +def process_connectome_group(row, file, output_p, no_session): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session) + save_connectome(connectome_npy, output_p, identifier) +def process_cobre(conn_p, df, output_dir): + base_dir = conn_p / "cobre_connectome-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "cobre" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'cobre'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=False) # cobre has sessions, so pass no_session explicitly +def process_ds000030(conn_p, df, output_dir): + base_dir = conn_p / "ds000030_connectomes-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "ds000030" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'ds000030'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=True) +def process_hcpep(conn_p, df, output_dir): + base_dir = conn_p / "hcp-ep_connectome-0.4.1" + output_p = output_dir / "hcpep" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'hcpep'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) +def process_srpbs(conn_p, df, output_dir): + base_dir = conn_p / "srpbs_connectome-0.4.1" + output_p = output_dir / "srpbs" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'srpbs'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) +def process_adni(conn_p, final_file_p, output_dir): + base_dir = conn_p / "adni_connectomes-0.4.1" + final_file = final_file_p / "final_adni.tsv" + output_p = output_dir / "adni" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): 
process_connectome_participant(row, base_dir, output_p) + +def process_oasis3(conn_p, final_file_p, output_dir): + base_dir = conn_p / "oasis3_connectomes-0.4.1" + final_file = final_file_p / "final_oasis3.tsv" + output_p = output_dir / "oasis3" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_cimaq(conn_p, final_file_p, output_dir): + base_dir = conn_p / "cimaq_connectomes-0.4.1" + final_file = final_file_p / "final_cimaq.tsv" + output_p = output_dir / "cimaq" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + + +if __name__ == "__main__": + passed_qc_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files/passed_qc_master.tsv") + conn_p = Path("/home/nclarke/scratch") + output_dir = Path("/home/nclarke/scratch/npy_connectome") + + # These datasets needed some extra steps aftre checking QC, so get identifiers from a different tsv + final_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files") + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(passed_qc_file_p, sep="\t") + + #process_cobre(conn_p, df, output_dir) + process_ds000030(conn_p, df, output_dir) + #process_hcpep(conn_p, df, output_dir) + #process_srpbs(conn_p, df, output_dir) + #process_adni(conn_p, final_file_p, output_dir) + #process_cimaq(conn_p, final_file_p, output_dir) + #process_oasis3(conn_p, final_file_p, output_dir) + + + + + + + + diff --git a/scripts/prepare_combat_file.py b/scripts/prepare_combat_file.py new file mode 100644 index 0000000..5d09044 --- /dev/null +++ b/scripts/prepare_combat_file.py @@ -0,0 +1,61 @@ +from pathlib import Path +import numpy as np + +def load_connectomes(base_directory_p): + """ + Load connectome data from .npy files across multiple subdirectories. + + Parameters: + - base_directory_p: Path to the base directory containing subdirectories of connectome files. + + Returns: + - connectomes: A list of connectome matrices, one per participant across all subdirectories. + """ + connectomes = [] + # Iterate over each subdirectory in the base directory + for directory_p in base_directory_p.iterdir(): + if directory_p.is_dir(): + for file_p in directory_p.glob("*.npy"): + connectome = np.load(file_p) + connectomes.append(connectome) + return connectomes + +def connectome_to_edges_matrix(connectomes): + """ + Transform a list of connectomes into a matrix suitable for ComBat, + where rows are ROI pairs and columns are participants. 
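+
+    A minimal illustrative example (hypothetical 3x3 symmetric connectomes from two
+    participants), showing that rows follow upper-triangular order with the diagonal excluded:
+
+    >>> a = np.array([[1.0, 0.2, 0.3], [0.2, 1.0, 0.4], [0.3, 0.4, 1.0]])
+    >>> b = np.array([[1.0, 0.5, 0.6], [0.5, 1.0, 0.7], [0.6, 0.7, 1.0]])
+    >>> connectome_to_edges_matrix([a, b])
+    array([[0.2, 0.5],
+           [0.3, 0.6],
+           [0.4, 0.7]])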
+ """ + num_participants = len(connectomes) + num_rois = connectomes[0].shape[0] + num_edges = num_rois * (num_rois - 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + for i, connectome in enumerate(connectomes): + edge_index = 0 + for row in range(num_rois): + for col in range(row + 1, num_rois): + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix + +# Set paths +base_directory_p = Path("/home/neuromod/ad_sz/data/npy_connectome") +output_p = "/home/neuromod/ad_sz/data/edges_matrix.tsv" + +# Load connectomes from all subdirectories. +connectomes = load_connectomes(base_directory_p) + +print(f"Number of connectomes loaded: {len(connectomes)}") +for connectome in connectomes[:5]: # Print the shape of the first 5 connectomes as a check + print(connectome.shape) + +# Transform into the ComBat-ready matrix. +edges_matrix = connectome_to_edges_matrix(connectomes) + +print(f"Edges matrix shape: {edges_matrix.shape}") + +# Save the edges_matrix to a .tsv file +np.savetxt(output_p, edges_matrix, delimiter='\t') + +print(f"Saved edges matrix to {output_p}") diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py new file mode 100644 index 0000000..34f5f76 --- /dev/null +++ b/scripts/prepare_input_data.py @@ -0,0 +1,284 @@ +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +dataset_configs = { + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, + }, + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "ds000030": { + "connectome_path": "ds000030_connectomes-0.4.1", + "no_session": True, + "subject_folder": False, + }, + "oasis3": { + "connectome_path": "oasis3_connectomes-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "srpbs": { + "connectome_path": "srpbs_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "adni": { + "connectome_path": "adni_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, +} + + +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] + else: + print(f"Connectome for {identifier} not found") + return None + + +def save_npy_connectome(connectome_npy, output_p, identifier): + # Not using this function currently + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found") + + +def connectome_to_edge_matrix(connectome): + """ + Transform a single connectome into an edge matrix. + Extract the upper triangular part of the connectome, including the diagonal. 
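+
+    A small hypothetical example (one 3x3 connectome), showing the row-major upper-triangular
+    flattening order produced by np.triu_indices_from, diagonal kept:
+
+    >>> c = np.array([[1.0, 0.2, 0.3], [0.2, 1.0, 0.4], [0.3, 0.4, 1.0]])
+    >>> connectome_to_edge_matrix(c)
+    array([1. , 0.2, 0.3, 1. , 0.4, 1. ])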
+ """ + # Get the upper triangular indices including the diagonal + row_indices, col_indices = np.triu_indices_from(connectome) + + # Extract the values at these indices + edge_matrix = connectome[row_indices, col_indices] + + return edge_matrix + + +def process_connectomes(connectomes_by_participant): + """ + Apply Fisher z transform, compute the global signal and convert to an edge matrix. For multiple connectomes per participant, connectomes are averaged first. + """ + edges_matrix_dict = {} + global_signal_dict = {} + for participant_id, connectomes in connectomes_by_participant.items(): + # Compute the mean connectome + mean_connectome = np.mean(connectomes, axis=0) + # Apply Fisher Z transformation + transformed_connectome = np.arctanh(mean_connectome) + # Compute global signal (mean of the Z values) + global_signal = np.mean(transformed_connectome) + global_signal_dict[participant_id] = global_signal + # Convert connectome to a marix for ComBat + edges_matrix = connectome_to_edge_matrix(transformed_connectome) + edges_matrix_dict[participant_id] = edges_matrix + + return edges_matrix_dict, global_signal_dict + + +def matrix_dict_to_df(edges_matrix_dict): + # Create a df from the edge matrices + # Convert dictionary values to a 2D numpy array + edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) + participants = list(edges_matrix_dict.keys()) + edges_df = pd.DataFrame(edge_matrix_combined, columns=participants) + + # Transpose so suitable for ComBat + edges_df = edges_df.T + + return edges_df + + +def process_datasets(conn_p, df, dataset_name): + config = dataset_configs[dataset_name] + base_dir = conn_p / config["connectome_path"] + + # Filter the df for required scans for the dataset + filtered_df = df[df["dataset"] == dataset_name] + valid_rows = [] # To store rows where the connectome was found + + connectomes_by_participant = {} + for index, row in filtered_df.iterrows(): + participant_id = row["participant_id"] + identifier = row["identifier"] + session = None if config["no_session"] else row.get("ses", None) + + # Adjust file path based on whether dataset uses a subject folder + if config["subject_folder"]: + file_path = ( + base_dir + / participant_id + / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + ) + else: + file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + try: + with h5py.File(file_path, "r") as file: + connectome = load_connectome( + file, participant_id, session, identifier, config["no_session"] + ) + if connectome is not None: + # Append the connectome to the participant's list in the dictionary + if participant_id not in connectomes_by_participant: + connectomes_by_participant[participant_id] = [] + connectomes_by_participant[participant_id].append(connectome) + + # Add the row to valid_rows if the connectome is found + valid_rows.append(row) + + except FileNotFoundError: + print(f"File not found: {file_path}") + + edges_df = pd.DataFrame() + # Process connectomes + if connectomes_by_participant: + edges_matrix_dict, global_signal_dict = process_connectomes( + connectomes_by_participant + ) + edges_df = matrix_dict_to_df(edges_matrix_dict) + + # Convert the list of valid rows to a df + valid_df = pd.DataFrame(valid_rows) + + return edges_df, valid_df, global_signal_dict + + +def create_pheno_df(valid_df, global_signal_dict): + # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) + aggregation_functions = { + "sex": "first", + "age": "first", + 
"diagnosis": "first", + "site": "first", + "dataset": "first", + "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans + "group": "first", + } + pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) + # Add global signal to pheno_df + pheno_df["mean_conn"] = pheno_df.index.map(global_signal_dict) + return pheno_df + + +def create_covariates_df_old(valid_df): + # Group by participant_id and select the first entry for each group + first_entries = valid_df.groupby("participant_id").first() + + # Extract the relevant covariates + covariates_df = first_entries[["site", "sex", "age", "diagnosis"]] + + return covariates_df + + +def one_hot_encode_column(df, column_name): + dummies = pd.get_dummies(df[column_name], prefix=column_name) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def one_hot_encode_column_no_prefix(df, column_name): + dummies = pd.get_dummies(df[column_name]) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def create_covariates_df(pheno_df): + # Extract the relevant covariates from pheno_df + covariates_df = pheno_df[ + [ + "site", + "sex_male", + "sex_female", + "age", + "diagnosis_MCI", + "diagnosis_ADD", + "diagnosis_CON", + "diagnosis_SCHZ", + ] + ] # split CON into controls from AD and SCHZ datasets? + + return covariates_df + + +if __name__ == "__main__": + conn_p = Path("/home/nclarke/scratch") + output_p = Path("/home/nclarke/ad_sz/data/input_data") + df = pd.read_csv("/home/nclarke/ad_sz/data/final_qc_pheno.tsv", sep="\t") + + # Convert adni sessions to strings so can formulate paths correctly + mask = df["dataset"] == "adni" + df.loc[mask, "ses"] = pd.to_datetime(df.loc[mask, "ses"]).dt.strftime("%Y%m%d") + + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + all_edges = [] + all_pheno = [] + for dataset in dataset_configs: + edges_df, valid_df, global_signal_dict = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df, global_signal_dict) + + # Collect data per dataset + all_edges.append(edges_df) + all_pheno.append(pheno_df) + + # Concatenate data across datasets + combined_edges_df = pd.concat(all_edges) + combined_pheno_df = pd.concat(all_pheno) + + # One-hot encode columns + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "group") + + # Create covariates_df from pheno_df + covariates_df = create_covariates_df(combined_pheno_df) + + # Capitalize column names, necessary for combat etc + covariates_df.columns = [x.upper() for x in covariates_df.columns] + combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] + + print("Shape of combined_edges_df = ", combined_edges_df.shape) + print("Shape of covariates_df = ", covariates_df.shape) + print("Shape of combined_pheno_df = ", combined_pheno_df.shape) + + # Ensure order of data is identical and output + combined_edges_df.sort_index().to_csv( + output_p / "combat_edges.tsv", sep="\t", index=True + ) + covariates_df.sort_index().to_csv( + output_p / "combat_covariates.tsv", sep="\t", index=True + ) + combined_pheno_df.sort_index().to_csv(output_p / "pheno.tsv", sep="\t", index=True) + + print(f"Saved edges matrix, covariates and pheno to {output_p}") diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py new file mode 100644 index 
diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py
new file mode 100644
index 0000000..05a1e7e
--- /dev/null
+++ b/scripts/significance_corr.py
@@ -0,0 +1,265 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import util
+import time
+import os
+import itertools
+from statsmodels.stats.multitest import multipletests
+from scipy.stats import pearsonr
+from argparse import ArgumentParser
+
+
+# SEBASTIEN URCHS
+def p_permut(empirical_value, permutation_values):
+    n_permutation = len(permutation_values)
+    if empirical_value >= 0:
+        return (np.sum(permutation_values > empirical_value) + 1) / (n_permutation + 1)
+    return (np.sum(permutation_values < empirical_value) + 1) / (n_permutation + 1)
+
+
+def filter_fdr(df, contrasts):
+    df_filtered = df[
+        (df["pair0"].isin(contrasts)) & (df["pair1"].isin(contrasts))
+    ].copy()
+    _, fdr, _, _ = multipletests(df_filtered["pval"], method="fdr_bh")
+    df_filtered["fdr_filtered"] = fdr
+    return df_filtered
+
+
+def mat_form(df, contrasts, value="betamap_corr"):
+    n = len(contrasts)
+    d = dict(zip(contrasts, range(n)))
+    mat = np.zeros((n, n))
+    for c in contrasts:
+        # fill out vertical strip of mat
+        for i in range(n):
+            if i == d[c]:
+                val = 1
+            else:
+                val = df[
+                    ((df["pair0"] == c) | (df["pair1"] == c))
+                    & ((df["pair0"] == contrasts[i]) | (df["pair1"] == contrasts[i]))
+                ][value]
+            mat[i, d[c]] = val
+            mat[d[c], i] = val
+    return pd.DataFrame(mat, columns=contrasts, index=contrasts)
+
+
+def make_matrices(df, contrasts, fdr="fdr_filtered"):
+    """
+    Param fdr can be set to 'fdr_filtered' (FDR is performed using the p-values only from
+    the chosen contrasts) or 'fdr' (values taken from FDR performed on the full set of 42 contrasts).
+    """
+    if fdr == "fdr_filtered":
+        df = filter_fdr(df, contrasts)
+    mat_corr = mat_form(df, contrasts, value="betamap_corr")
+    mat_pval = mat_form(df, contrasts, value="pval")
+    mat_fdr = mat_form(df, contrasts, value=fdr)
+    return mat_corr, mat_pval, mat_fdr
+
+
+def get_corr_dist(cases, nulls, path_out, tag="wholeconn"):
+    # For each unique pair, compute the correlations between the null maps.
+    n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+    corr = np.zeros((n_pairs, 5000))
+
+    print(
+        "Getting correlation between 5000 null maps for {} unique pairs for {} cases...".format(
+            n_pairs, len(cases)
+        )
+    )
+    pair = []
+    l = 0
+    for i in itertools.combinations(cases, 2):
+        for j in range(5000):
+            corr[l, j] = pearsonr(
+                nulls.loc[i[0]].values[j, :], nulls.loc[i[1]].values[j, :]
+            )[0]
+
+        pair.append(i)
+        if l % 50 == 0:
+            print("{}/{}".format(l, n_pairs))
+        l = l + 1
+
+    df = pd.DataFrame(corr)
+    df["pair"] = pair
+    df.to_csv(os.path.join(path_out, "correlation_dist_{}.csv".format(tag)))
+    return df
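For clarity, a small worked example of the permutation p-value defined in p_permut above, using made-up numbers: the count is one-sided in the direction of the observed correlation, and the +1 in both numerator and denominator keeps the estimate away from zero.

import numpy as np

# Toy null distribution of 9 correlations plus one observed value (made-up numbers)
null_corrs = np.array([-0.30, -0.10, -0.05, 0.00, 0.05, 0.10, 0.15, 0.20, 0.40])
observed = 0.25

# One null value exceeds 0.25, so p = (1 + 1) / (9 + 1) = 0.2
p = p_permut(observed, null_corrs)
print(p)  # 0.2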
+
+
+def get_corr(cases, betas, path_out, tag="wholeconn"):
+    # For each unique pair, compute the correlation between betamaps. Use standardized betas here (as in the rest of the paper).
+    n_pairs = int((len(cases)) * (len(cases) - 1) / 2)
+    corr = np.zeros(n_pairs)
+
+    print(
+        "Getting correlation between betamaps for {} unique pairs for {} cases...".format(
+            n_pairs, len(cases)
+        )
+    )
+    pair = []
+    l = 0
+    for i in itertools.combinations(cases, 2):
+        corr[l] = pearsonr(betas.loc[i[0]].values, betas.loc[i[1]].values)[0]
+        l = l + 1
+        pair.append(i)
+    df = pd.DataFrame(corr)
+    df["pair"] = pair
+    df.to_csv(os.path.join(path_out, "correlation_betas_{}.csv".format(tag)))
+    return df
+
+
+def get_corr_pval(maps, nulls, betas, path_out, tag="wholeconn"):
+    df = get_corr_dist(maps, nulls, path_out, tag=tag)
+    df_bb = get_corr(maps, betas, path_out, tag=tag)
+
+    df_bb = df_bb.rename(columns={0: "betamap_corr"})
+    df_master = df_bb.merge(df, on="pair")
+
+    print("Calculating pvals...")
+    # CALCULATE PVALS
+    pval = []
+    for i in df_master.index:
+        p = p_permut(df_master.loc[i, "betamap_corr"], df_master[range(5000)].loc[i])
+        pval.append(p)
+    df_master["pval"] = pval
+
+    # ADD LABELS
+    pair0 = [p[0] for p in df["pair"].tolist()]
+    pair1 = [p[1] for p in df["pair"].tolist()]
+    df_master["pair0"] = pair0
+    df_master["pair1"] = pair1
+
+    df_compact = df_master[["pair0", "pair1", "betamap_corr", "pval"]]
+    df_compact.to_csv(
+        os.path.join(path_out, "corr_pval_null_v_null_{}.csv".format(tag))
+    )
+
+    return df_compact
+
+
+def _make_region_masks(row):
+    mask = np.tri(64, k=0, dtype=bool)
+
+    # Get the parcel number from the beginning of the row and subtract 1
+    parcel_num = int(row[0]) - 1
+
+    # Get the parcel name from the second column value
+    parcel_name = str(row[1]).upper().replace(" ", "_")
+
+    # Generate the mask
+    parcel = np.zeros((64, 64), bool)
+    parcel[:, parcel_num] = True
+    parcel_mask = parcel + np.transpose(parcel)
+    parcel_mask = np.tril(parcel_mask)
+    parcel_mask = parcel_mask[mask]
+
+    return (parcel_mask, parcel_name)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--n_path_mc", help="path to mc null models dir", dest="n_path_mc"
+    )
+    parser.add_argument("--b_path_mc", help="path to mc betamaps dir", dest="b_path_mc")
+    parser.add_argument("--path_out", help="path to output directory", dest="path_out")
+    parser.add_argument(
+        "--path_corr", help="path to corr dir", dest="path_corr", default=None
+    )
+    parser.add_argument(
+        "--path_parcellation", help="path to MIST_64.csv", dest="path_parcellation"
+    )
+    args = parser.parse_args()
+
+    n_path_mc = os.path.join(args.n_path_mc, "null_model_{}_vs_control_mc.npy")
+    # cont_n_path_mc = os.path.join(args.n_path_mc,'{}_null_model_mc.npy')
+    b_path_mc = os.path.join(args.b_path_mc, "{}_vs_con_mean.csv")
+    # cont_b_path_mc = os.path.join(args.b_path_mc,'cont_{}_results_mc.csv')
+    path_out = args.path_out
+    path_corr = args.path_corr
+
+    cases = ["mci_neg", "mci_pos", "sz"]
+
+    maps = cases  # + cont
+
+    #############
+    # LOAD DATA #
+    #############
+    null = []
+    beta_std = []
+
+    for c in cases:
+        null.append(pd.DataFrame(np.load(n_path_mc.format(c))))
+        beta_std.append(pd.read_csv(b_path_mc.format(c))["stand_betas"].values)
+
+    betamaps_std = pd.DataFrame(beta_std, index=maps)
+    nullmodels = pd.concat(null, keys=maps)
+
+    ####################
+    # WHOLE CONNECTOME #
+    ####################
+    print("Creating correlation distributions for whole connectome...")
+    df_wc = get_corr_pval(maps, nullmodels, betamaps_std, path_out, tag="wholeconn")
+
+    # make matrices
+    print("Preparing correlation matrices...")
+    subset = ["mci_neg", "mci_pos", "sz"]
+
+    corr, pval, fdr = make_matrices(df_wc, subset, fdr="fdr_filtered")
+
+    corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv"))
+    pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv"))
+    fdr.to_csv(
+        os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv")
+    )
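To make the matrix-building step concrete, here is a minimal sketch of what mat_form returns for the three contrasts used above. The pairwise correlation and p-values are made up; in the real run they come from get_corr_pval.

import pandas as pd

# Toy pairwise results for the three contrasts (made-up values)
df_pairs = pd.DataFrame(
    {
        "pair0": ["mci_neg", "mci_neg", "mci_pos"],
        "pair1": ["mci_pos", "sz", "sz"],
        "betamap_corr": [0.62, 0.18, 0.35],
        "pval": [0.001, 0.20, 0.04],
        "fdr_filtered": [0.003, 0.20, 0.06],
    }
)

contrasts = ["mci_neg", "mci_pos", "sz"]
mat = mat_form(df_pairs, contrasts, value="betamap_corr")
# mat is a symmetric 3 x 3 DataFrame with 1s on the diagonal:
#           mci_neg  mci_pos    sz
# mci_neg      1.00     0.62  0.18
# mci_pos      0.62     1.00  0.35
# sz           0.18     0.35  1.00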
subset, fdr="fdr_filtered") + + corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv")) + pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv")) + fdr.to_csv( + os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv") + ) + + #################### + # REGIONS # + #################### + df_MIST = pd.read_csv(args.path_parcellation, delimiter=";") + + for index, row in df_MIST.iterrows(): + # create region mask + parcel_mask, parcel_name = _make_region_masks(row) + + # filter maps + null_parcel = [n.transpose()[parcel_mask].transpose() for n in null] + beta_std_parcel = [b[parcel_mask] for b in beta_std] + + betamaps_std_parcel = pd.DataFrame(beta_std_parcel, index=maps) + nullmodels_parcel = pd.concat(null_parcel, keys=maps) + + print(f"Creating correlation distributions for {parcel_name}...") + df_parcel = get_corr_pval( + maps, nullmodels_parcel, betamaps_std_parcel, path_out, tag=parcel_name + ) + + # make matrices + corr_parcel, pval_parcel, fdr_filtered_parcel = make_matrices( + df_parcel, subset, fdr="fdr_filtered" + ) + + _, pval_parcel_fdr_corrected, _, _ = multipletests(pval_parcel, method="fdr_bh") + + # TO DO - see if I need to correct fdr filtered, I think so? + if (pval_parcel < 0.05).any(): + corr_parcel.to_csv( + os.path.join(path_out, f"FC_corr_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel.to_csv( + os.path.join(path_out, f"FC_corr_pval_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel_fdr_corrected.to_csv( + os.path.join( + path_out, + f"FC_corr_pval_fdr_corrected_{parcel_name}_mc_null_v_null.csv", + ) + ) + fdr_filtered_parcel.to_csv( + os.path.join( + path_out, f"FC_corr_fdr_filtered_{parcel_name}_mc_null_v_null.csv" + ) + ) + + print("Done!")