From 1b572806d3d95fa066b7a1425c55ab92d7face17 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 13 Sep 2024 11:05:38 -0500 Subject: [PATCH 01/54] style: formatting changes --- funmap/funmap.py | 469 +++++++++++++++++++++++++++++------------------ 1 file changed, 291 insertions(+), 178 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 65b6075d..01aa93cb 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -25,27 +25,32 @@ log = setup_logger(__name__) + def get_valid_gs_data(gs_path: str, valid_gene_list: List[str], md5=None): log.info(f'Loading gold standard feature file "{gs_path}" ...') if is_url_scheme(gs_path): - gs_edge_df = read_csv_with_md5_check(gs_path, expected_md5=md5, - local_path='download_gs_file', sep='\t') + gs_edge_df = read_csv_with_md5_check( + gs_path, expected_md5=md5, local_path="download_gs_file", sep="\t" + ) if gs_edge_df is None: - raise ValueError('Failed to download gold standard file') + raise ValueError("Failed to download gold standard file") else: - gs_edge_df = pd.read_csv(gs_path, sep='\t') - - log.info('Done loading gold standard feature file') - gs_edge_df = gs_edge_df.rename(columns={gs_edge_df.columns[0]: 'P1', - gs_edge_df.columns[1]: 'P2'}) - gs_edge_df = gs_edge_df[gs_edge_df['P1'].isin(valid_gene_list) & - gs_edge_df['P2'].isin(valid_gene_list) & - (gs_edge_df['P1'] != gs_edge_df['P2'])] - m = ~pd.DataFrame(np.sort(gs_edge_df[['P1','P2']], axis=1)).duplicated() + gs_edge_df = pd.read_csv(gs_path, sep="\t") + + log.info("Done loading gold standard feature file") + gs_edge_df = gs_edge_df.rename( + columns={gs_edge_df.columns[0]: "P1", gs_edge_df.columns[1]: "P2"} + ) + gs_edge_df = gs_edge_df[ + gs_edge_df["P1"].isin(valid_gene_list) + & gs_edge_df["P2"].isin(valid_gene_list) + & (gs_edge_df["P1"] != gs_edge_df["P2"]) + ] + m = ~pd.DataFrame(np.sort(gs_edge_df[["P1", "P2"]], axis=1)).duplicated() gs_edge_df = gs_edge_df[list(m)] gs_edge_df.reset_index(drop=True, inplace=True) # rename the last column name to 'label' - gs_edge_df.rename(columns={gs_edge_df.columns[-1]: 'label'}, inplace=True) + gs_edge_df.rename(columns={gs_edge_df.columns[-1]: "label"}, inplace=True) return gs_edge_df @@ -108,16 +113,22 @@ def pairwise_mutual_rank(pcc_matrix): valid_a = ~np.isnan(pcc_matrix) valid_b = valid_a.T - rank_ab = np.argsort(pcc_matrix, axis=1).argsort(axis=1, kind='mergesort') + 1 # Start ranks from 1 - rank_ba = np.argsort(pcc_matrix, axis=0).argsort(axis=0, kind='mergesort') + 1 # Start ranks from 1 + rank_ab = ( + np.argsort(pcc_matrix, axis=1).argsort(axis=1, kind="mergesort") + 1 + ) # Start ranks from 1 + rank_ba = ( + np.argsort(pcc_matrix, axis=0).argsort(axis=0, kind="mergesort") + 1 + ) # Start ranks from 1 n_a = np.sum(valid_a, axis=1) n_b = np.sum(valid_b, axis=0) valid_indices_a, valid_indices_b = np.where(valid_a) - mr_values = np.sqrt((rank_ab[valid_indices_a, valid_indices_b] / n_b[valid_indices_b]) * - (rank_ba[valid_indices_a, valid_indices_b] / n_a[valid_indices_a])) + mr_values = np.sqrt( + (rank_ab[valid_indices_a, valid_indices_b] / n_b[valid_indices_b]) + * (rank_ba[valid_indices_a, valid_indices_b] / n_a[valid_indices_a]) + ) mr_matrix = np.full_like(pcc_matrix, np.nan) mr_matrix[valid_indices_a, valid_indices_b] = mr_values @@ -126,26 +137,25 @@ def pairwise_mutual_rank(pcc_matrix): def compute_features(cfg, feature_type, min_sample_count, output_dir): - """Compute the pearson correlation coefficient for each edge in the list of edges and for each - """ + """Compute the pearson correlation coefficient 
for each edge in the list of edges and for each""" data_dict, all_valid_ids = get_data_dict(cfg, min_sample_count) cc_dict = {} for i in data_dict: - cc_file = os.path.join(output_dir, f'cc_{i}.h5') + cc_file = os.path.join(output_dir, f"cc_{i}.h5") cc_dict[i] = cc_file mr_dict = {} for i in data_dict: - mr_file = os.path.join(output_dir, f'mr_{i}.h5') + mr_file = os.path.join(output_dir, f"mr_{i}.h5") mr_dict[i] = mr_file all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) if all_cc_exist: log.info("All cc files exist. Skipping feature computation.") - if feature_type == 'cc': + if feature_type == "cc": return cc_dict, mr_dict, all_valid_ids - if feature_type == 'mr': + if feature_type == "mr": all_mr_exist = all(os.path.exists(file_path) for file_path in mr_dict.values()) if all_cc_exist and all_mr_exist: log.info("All mr files exist. Skipping feature computation.") @@ -159,24 +169,24 @@ def compute_features(cfg, feature_type, min_sample_count, output_dir): log.info(f"Computing pearson correlation coefficient matrix for {i}") x = data_dict[i].values.astype(np.float32) df = pd.DataFrame(x) - corr_matrix = df.corr(method='pearson', min_periods=min_sample_count) + corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) arr = corr_matrix.values upper_indices = np.triu_indices(arr.shape[0]) - with h5py.File(cc_dict[i], 'w') as hf: + with h5py.File(cc_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset('cc', data=arr[upper_indices]) - hf.create_dataset('ids', data=data_dict[i].columns.values.astype('S')) + hf.create_dataset("cc", data=arr[upper_indices]) + hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) cc_dict[i] = cc_file # compute pairwise mutual rank features - if feature_type == 'mr': + if feature_type == "mr": log.info(f"Computing mutual rank matrix for {i}") arr_mr = pairwise_mutual_rank(arr) upper_indices = np.triu_indices(arr_mr.shape[0]) - with h5py.File(mr_dict[i], 'w') as hf: + with h5py.File(mr_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset('mr', data=arr_mr[upper_indices]) - hf.create_dataset('ids', data=data_dict[i].columns.values.astype('S')) + hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) return cc_dict, mr_dict, all_valid_ids @@ -193,37 +203,54 @@ def balance_classes(df, random_state=42): minority_class = class_0 if len(class_0) < len(class_1) else class_1 majority_class = class_1 if minority_class is class_0 else class_0 - majority_class_undersampled = resample(majority_class, replace=False, - n_samples=len(minority_class), random_state=random_state) + majority_class_undersampled = resample( + majority_class, + replace=False, + n_samples=len(minority_class), + random_state=random_state, + ) balanced_df = pd.concat([minority_class, majority_class_undersampled]) # Shuffle the rows in the balanced DataFrame - balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True) + balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index( + drop=True + ) return balanced_df -def assemble_feature_df(h5_file_mapping, df, dataset='cc'): + +def assemble_feature_df(h5_file_mapping, df, dataset="cc"): df.reset_index(drop=True, inplace=True) # Initialize feature_df with columns for HDF5 file keys and 'label' file_keys = list(h5_file_mapping.keys()) - feature_df = pd.DataFrame(columns=file_keys + ['label']) + feature_df = 
pd.DataFrame(columns=file_keys + ["label"]) # Iterate over HDF5 files and load feature values for key, file_path in h5_file_mapping.items(): - with h5py.File(file_path, 'r') as h5_file: - gene_ids = h5_file['ids'][:] + with h5py.File(file_path, "r") as h5_file: + gene_ids = h5_file["ids"][:] gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} # Get gene indices for P1 and P2 - p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) - p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) + p1_indices = np.array( + [gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]] + ) + p2_indices = np.array( + [gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]] + ) f_dataset = h5_file[dataset] f_values = np.empty(len(df), dtype=float) valid_indices = (p1_indices != -1) & (p2_indices != -1) - linear_indices_func = lambda row_indices, col_indices, n: np.array(col_indices) - np.array(row_indices) + (2*n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - linear_indices = linear_indices_func(p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids)) + linear_indices_func = ( + lambda row_indices, col_indices, n: np.array(col_indices) + - np.array(row_indices) + + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 + ) + linear_indices = linear_indices_func( + p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + ) f_values[valid_indices] = f_dataset[:][linear_indices] f_values[~valid_indices] = np.nan @@ -232,35 +259,40 @@ def assemble_feature_df(h5_file_mapping, df, dataset='cc'): feature_df[key] = f_values # if the last column is 'label', assign it to feature_df - if df.columns[-1] == 'label': - feature_df['label'] = df[df.columns[-1]] + if df.columns[-1] == "label": + feature_df["label"] = df[df.columns[-1]] else: # delete the 'label' column from feature_df - del feature_df['label'] + del feature_df["label"] return feature_df -def extract_features(df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None): - if feature_type == 'mr': +def extract_features( + df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None +): + if feature_type == "mr": if not mr_dict: - raise ValueError('mr dict is empty') + raise ValueError("mr dict is empty") - feature_dict = cc_dict if feature_type == 'cc' else mr_dict + feature_dict = cc_dict if feature_type == "cc" else mr_dict feature_df = assemble_feature_df(feature_dict, df, feature_type) if ppi_feature is not None: ppi_dict = {key: set(value) for key, value in ppi_feature.items()} for ppi_source, ppi_tuples in ppi_dict.items(): feature_df[ppi_source] = df.apply( - lambda row: 1 if (row['P1'], row['P2']) in ppi_tuples else 0, axis=1) + lambda row: 1 if (row["P1"], row["P2"]) in ppi_tuples else 0, axis=1 + ) # TODO: add extra features if provided if extra_feature is not None: pass # move 'label' column to the end of the dataframe if it exists - if 'label' in feature_df.columns: - feature_df = feature_df[[col for col in feature_df.columns if col != 'label'] + ['label']] + if "label" in feature_df.columns: + feature_df = feature_df[ + [col for col in feature_df.columns if col != "label"] + ["label"] + ] return feature_df @@ -279,72 +311,94 @@ def get_ppi_feature(): A dictionary with PPI features, where the keys are the feature names and the values are lists of tuples representing the protein interactions. 
""" - feature_names = ['BioGRID', 'BioPlex', 'HI-union'] - urls = [network_info['url'][i] for i in range(len(network_info['name'])) - if network_info['name'][i] in feature_names] + feature_names = ["BioGRID", "BioPlex", "HI-union"] + urls = [ + network_info["url"][i] + for i in range(len(network_info["name"])) + if network_info["name"][i] in feature_names + ] ppi_features = {} # use pandas to read the file - for (i, url) in enumerate(urls): - data = pd.read_csv(url, sep='\t', header=None) + for i, url in enumerate(urls): + data = pd.read_csv(url, sep="\t", header=None) data = data.apply(lambda x: tuple(sorted(x)), axis=1) - ppi_name = f'{feature_names[i]}_PPI' + ppi_name = f"{feature_names[i]}_PPI" ppi_features[ppi_name] = data.tolist() return ppi_features def train_ml_model(data_df, ml_type, seed, n_jobs, feature_mapping, model_dir): - assert ml_type == 'xgboost', 'ML model must be xgboost' - models = train_model(data_df.iloc[:, :-1], data_df.iloc[:, -1], seed, n_jobs, feature_mapping, - model_dir) + assert ml_type == "xgboost", "ML model must be xgboost" + models = train_model( + data_df.iloc[:, :-1], + data_df.iloc[:, -1], + seed, + n_jobs, + feature_mapping, + model_dir, + ) return models def train_model(X, y, seed, n_jobs, feature_mapping, model_dir): model_params = { - 'n_estimators': [10, 20, 50, 100], - 'max_depth': [1, 2, 3, 4, 5], - 'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0] + "n_estimators": [10, 20, 50, 100], + "max_depth": [1, 2, 3, 4, 5], + "learning_rate": [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0], } models = {} for ft in feature_mapping: # use only mutual rank - log.info(f'Training model for {ft} ...') + log.info(f"Training model for {ft} ...") xgb_model = xgb.XGBClassifier(random_state=seed, n_jobs=n_jobs) cv = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True) - clf = GridSearchCV(xgb_model, model_params, scoring='roc_auc', cv=cv, - n_jobs=1, verbose=2) - if ft == 'ex': + clf = GridSearchCV( + xgb_model, model_params, scoring="roc_auc", cv=cv, n_jobs=1, verbose=2 + ) + if ft == "ex": # exclude ppi features - Xtrain = X.loc[:, ~X.columns.str.endswith('_PPI')] + Xtrain = X.loc[:, ~X.columns.str.endswith("_PPI")] else: Xtrain = X model = clf.fit(Xtrain, y) models[ft] = model - ml_model_file = model_dir / f'model_{ft}.pkl.gz' - with gzip.open(ml_model_file, 'wb') as fh: + ml_model_file = model_dir / f"model_{ft}.pkl.gz" + with gzip.open(ml_model_file, "wb") as fh: pickle.dump(model, fh) - log.info(f'Training model for {ft} ... done') + log.info(f"Training model for {ft} ... 
done") return models -def compute_llr(predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, - gs_test): +def compute_llr( + predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, gs_test +): # make sure max_num_edges is smaller than the number of non-NA values - assert max_num_edges < np.count_nonzero(~np.isnan(predicted_all_pairs.iloc[:, -1].values)), \ - f'max_num_edges should be smaller than the number of non-NA values' + assert max_num_edges < np.count_nonzero( + ~np.isnan(predicted_all_pairs.iloc[:, -1].values) + ), f"max_num_edges should be smaller than the number of non-NA values" - cur_col_name = 'prediction' + cur_col_name = "prediction" cur_results = predicted_all_pairs.nlargest(max_num_edges, cur_col_name) - selected_edges_all = cur_results[['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1) - - gs_test_pos_set = set(gs_test[gs_test['label'] == 1][['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1)) - gs_test_neg_set = set(gs_test[gs_test['label'] == 0][['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1)) + selected_edges_all = cur_results[["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + + gs_test_pos_set = set( + gs_test[gs_test["label"] == 1][["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + ) + gs_test_neg_set = set( + gs_test[gs_test["label"] == 0][["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + ) n_gs_test_pos_set = len(gs_test_pos_set) n_gs_test_neg_set = len(gs_test_neg_set) @@ -352,73 +406,86 @@ def compute_llr(predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges # llr_res_dict only save maximum of max_steps data points for downstream # analysis / plotting total = math.ceil((max_num_edges - start_edge_num) / step_size) + 1 - for k in tqdm(range(start_edge_num, max_num_edges+step_size, step_size), total=total, ascii=' >='): + for k in tqdm( + range(start_edge_num, max_num_edges + step_size, step_size), + total=total, + ascii=" >=", + ): selected_edges = set(selected_edges_all[:k]) all_nodes = set(itertools.chain.from_iterable(selected_edges)) common_pos_edges = selected_edges & gs_test_pos_set common_neg_edges = selected_edges & gs_test_neg_set try: - lr = len(common_pos_edges) / len(common_neg_edges) / (n_gs_test_pos_set / n_gs_test_neg_set) + lr = ( + len(common_pos_edges) + / len(common_neg_edges) + / (n_gs_test_pos_set / n_gs_test_neg_set) + ) except ZeroDivisionError: lr = 0 llr = np.log(lr) if lr > 0 else np.nan n_node = len(all_nodes) - result_dict['k'].append(k) - result_dict['n'].append(n_node) - result_dict['llr'].append(llr) + result_dict["k"].append(k) + result_dict["n"].append(n_node) + result_dict["llr"].append(llr) llr_res = pd.DataFrame(result_dict) if llr_res_file is not None: - llr_res.to_csv(llr_res_file, sep='\t', index=False) + llr_res.to_csv(llr_res_file, sep="\t", index=False) return llr_res def prepare_gs_data(**kwargs): - task = kwargs['task'] - cc_dict = kwargs['cc_dict'] - mr_dict = kwargs['mr_dict'] - gs_file = kwargs['gs_file'] + task = kwargs["task"] + cc_dict = kwargs["cc_dict"] + mr_dict = kwargs["mr_dict"] + gs_file = kwargs["gs_file"] gs_file_md5 = None - feature_type = kwargs['feature_type'] - extra_feature_file = kwargs['extra_feature_file'] - valid_id_list = kwargs['valid_id_list'] - test_size = kwargs['test_size'] - seed = kwargs['seed'] + feature_type = kwargs["feature_type"] + 
extra_feature_file = kwargs["extra_feature_file"] + valid_id_list = kwargs["valid_id_list"] + test_size = kwargs["test_size"] + seed = kwargs["seed"] if gs_file is None: - gs_file = urls['reactome_gold_standard'] - gs_file_md5 = urls['reactome_gold_standard_md5'] + gs_file = urls["reactome_gold_standard"] + gs_file_md5 = urls["reactome_gold_standard_md5"] - log.info('Preparing gold standard data') + log.info("Preparing gold standard data") gs_df = get_valid_gs_data(gs_file, valid_id_list, md5=gs_file_md5) gs_df_balanced = balance_classes(gs_df, random_state=seed) del gs_df - gs_train, gs_test = train_test_split(gs_df_balanced, - test_size=test_size, random_state=seed, - stratify=gs_df_balanced.iloc[:, -1]) - if task == 'protein_func': + gs_train, gs_test = train_test_split( + gs_df_balanced, + test_size=test_size, + random_state=seed, + stratify=gs_df_balanced.iloc[:, -1], + ) + if task == "protein_func": ppi_feature = get_ppi_feature() else: ppi_feature = None - gs_train_df = extract_features(gs_train, feature_type, cc_dict, ppi_feature, - extra_feature_file, mr_dict) - gs_test_df = extract_features(gs_test, feature_type, cc_dict, ppi_feature, - extra_feature_file, mr_dict) + gs_train_df = extract_features( + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + ) + gs_test_df = extract_features( + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + ) # store both the ids with gs_test_df for later use # add the first two column of gs_test to gs_test_df at the beginning gs_test_df = pd.concat([gs_test.iloc[:, :2], gs_test_df], axis=1) - log.info('Preparing gs data ... done') + log.info("Preparing gs data ... done") return gs_train_df, gs_test_df -def extract_dataset_feature(all_pairs, feature_file, feature_type='cc'): +def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): # convert all_pairs to a dataframe - df = pd.DataFrame(all_pairs, columns=['P1', 'P2']) - with h5py.File(feature_file, 'r') as h5_file: - gene_ids = h5_file['ids'][:] + df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) + with h5py.File(feature_file, "r") as h5_file: + gene_ids = h5_file["ids"][:] gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} # Get gene indices for P1 and P2 @@ -429,50 +496,71 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type='cc'): f_values = np.empty(len(df), dtype=float) valid_indices = (p1_indices != -1) & (p2_indices != -1) - linear_indices_func = lambda row_indices, col_indices, n: np.array(col_indices) - np.array(row_indices) + (2*n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - linear_indices = linear_indices_func(p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids)) + linear_indices_func = ( + lambda row_indices, col_indices, n: np.array(col_indices) + - np.array(row_indices) + + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 + ) + linear_indices = linear_indices_func( + p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + ) f_values[valid_indices] = f_dataset[:][linear_indices] f_values[~valid_indices] = np.nan # extracted feature is the 'prediction' column - df['prediction'] = f_values + df["prediction"] = f_values return df -def dataset_llr(all_ids, feature_dict, feature_type, gs_test, start_edge_num, - max_num_edge, step_size, llr_dataset_file): +def dataset_llr( + all_ids, + feature_dict, + feature_type, + gs_test, + start_edge_num, + max_num_edge, + step_size, + llr_dataset_file, +): llr_ds = pd.DataFrame() 
all_ids_sorted = sorted(all_ids) all_pairs = list(itertools.combinations(all_ids_sorted, 2)) all_ds_pred = None for dataset in feature_dict: - log.info(f'Calculating llr for {dataset} ...') + log.info(f"Calculating llr for {dataset} ...") feature_file = feature_dict[dataset] - predicted_all_pairs = extract_dataset_feature(all_pairs, feature_file, feature_type) + predicted_all_pairs = extract_dataset_feature( + all_pairs, feature_file, feature_type + ) if all_ds_pred is None: - all_ds_pred = predicted_all_pairs['prediction'].values + all_ds_pred = predicted_all_pairs["prediction"].values else: - all_ds_pred = np.vstack((all_ds_pred, predicted_all_pairs['prediction'].values)) - - cur_llr_res = compute_llr(predicted_all_pairs, None, start_edge_num, - max_num_edge, step_size, gs_test) - cur_llr_res['dataset'] = dataset + all_ds_pred = np.vstack( + (all_ds_pred, predicted_all_pairs["prediction"].values) + ) + + cur_llr_res = compute_llr( + predicted_all_pairs, None, start_edge_num, max_num_edge, step_size, gs_test + ) + cur_llr_res["dataset"] = dataset llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) - llr_ds.to_csv(llr_dataset_file, sep='\t', index=False) - log.info(f'Calculating llr for {dataset} ... done') + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") # calculate llr for all datasets based on the average prediction - log.info(f'Calculating llr for all datasets average ...') - all_ds_pred_df = pd.DataFrame(all_pairs, columns=['P1', 'P2']) + log.info(f"Calculating llr for all datasets average ...") + all_ds_pred_df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) all_ds_pred_avg = np.nanmean(all_ds_pred, axis=0) - all_ds_pred_df['prediction'] = all_ds_pred_avg - cur_llr_res = compute_llr(all_ds_pred_df, None, start_edge_num, max_num_edge, step_size, gs_test) - cur_llr_res['dataset'] = 'all_average' + all_ds_pred_df["prediction"] = all_ds_pred_avg + cur_llr_res = compute_llr( + all_ds_pred_df, None, start_edge_num, max_num_edge, step_size, gs_test + ) + cur_llr_res["dataset"] = "all_average" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) - log.info(f'Calculating llr for all datasets average ... done') - llr_ds.to_csv(llr_dataset_file, sep='\t', index=False) + log.info(f"Calculating llr for all datasets average ... 
done") + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) return llr_ds @@ -481,39 +569,43 @@ def get_cutoff(model_dict, gs_test, lr_cutoff): cutoff_p_dict = {} cutoff_llr_dict = {} for ft in model_dict: - log.info(f'Calculating cutoff prob for {ft} ...') + log.info(f"Calculating cutoff prob for {ft} ...") model = model_dict[ft] - if ft == 'ex': - gs_test_df = gs_test.loc[:, ~gs_test.columns.str.endswith('_PPI')] + if ft == "ex": + gs_test_df = gs_test.loc[:, ~gs_test.columns.str.endswith("_PPI")] else: gs_test_df = gs_test prob = model.predict_proba(gs_test_df.iloc[:, 2:-1]) prob = prob[:, 1] - pred_df = pd.DataFrame(prob, columns=['prob']) + pred_df = pd.DataFrame(prob, columns=["prob"]) pred_df = pd.concat([pred_df, gs_test_df.iloc[:, -1]], axis=1) - pred_df = pred_df.sort_values(by='prob', ascending=False) + pred_df = pred_df.sort_values(by="prob", ascending=False) - P = pred_df['label'].sum() + P = pred_df["label"].sum() N = len(pred_df) - P - cumulative_pp = np.cumsum(pred_df['label']) + cumulative_pp = np.cumsum(pred_df["label"]) cumulative_pn = np.arange(len(pred_df)) + 1 - cumulative_pp llr_values = np.log((cumulative_pp / cumulative_pn) / (P / N)) - pred_df['llr'] = llr_values + pred_df["llr"] = llr_values # find the first prob that has llr >= lr_cutoff cutoff = np.log(lr_cutoff) cutoff_prob = None for _, row in pred_df[::-1].iterrows(): - if not np.isinf(row['llr']) and row['llr'] >= cutoff: - cutoff_prob = row['prob'] - cutoff_llr = row['llr'] + if not np.isinf(row["llr"]) and row["llr"] >= cutoff: + cutoff_prob = row["prob"] + cutoff_llr = row["llr"] break # if cutoff_prob is None, it means that the lr_cutoff is too high # and we cannot find a cutoff prob that has llr >= lr_cutoff if cutoff_prob is None: - log.error(f'Cannot find cutoff prob for {ft}, lower lr_cutoff and try again') - import sys; sys.exit(1) + log.error( + f"Cannot find cutoff prob for {ft}, lower lr_cutoff and try again" + ) + import sys + + sys.exit(1) cutoff_p_dict[ft] = cutoff_prob cutoff_llr_dict[ft] = cutoff_llr @@ -523,34 +615,43 @@ def get_cutoff(model_dict, gs_test, lr_cutoff): def predict_network(predict_results_file, cutoff_p, output_file): for ft in predict_results_file: - log.info(f'Predicting network for {ft} ...') + log.info(f"Predicting network for {ft} ...") predicted_df = pd.read_parquet(predict_results_file[ft]) - filtered_df = predicted_df[predicted_df['prediction'] > cutoff_p[ft]] + filtered_df = predicted_df[predicted_df["prediction"] > cutoff_p[ft]] cur_file = output_file[ft] - filtered_df[['P1', 'P2']].to_csv(cur_file, sep='\t', index=False, header=None) + filtered_df[["P1", "P2"]].to_csv(cur_file, sep="\t", index=False, header=None) directory, file_name = os.path.split(cur_file) base_name, extension = os.path.splitext(file_name) - new_file_name = f'{base_name}_with_p{extension}' + new_file_name = f"{base_name}_with_p{extension}" new_file = os.path.join(directory, new_file_name) - filtered_df.to_csv(new_file, sep='\t', index=False) + filtered_df.to_csv(new_file, sep="\t", index=False) num_edges = len(filtered_df) - num_nodes = len(set(filtered_df['P1']) | set(filtered_df['P2'])) - log.info(f'Number of edges: {num_edges}') - log.info(f'Number of nodes: {num_nodes}') - log.info(f'Predicting network for {ft} ... 
done') - - -def predict_all_pairs(model_dict, all_ids, feature_type, ppi_feature, cc_dict, - mr_dict, extra_feature_file, prediction_dir, - output_file, n_jobs=1): + num_nodes = len(set(filtered_df["P1"]) | set(filtered_df["P2"])) + log.info(f"Number of edges: {num_edges}") + log.info(f"Number of nodes: {num_nodes}") + log.info(f"Predicting network for {ft} ... done") + + +def predict_all_pairs( + model_dict, + all_ids, + feature_type, + ppi_feature, + cc_dict, + mr_dict, + extra_feature_file, + prediction_dir, + output_file, + n_jobs=1, +): chunk_size = 1000000 - log.info('Genearating all pairs ...') + log.info("Genearating all pairs ...") all_ids = sorted(all_ids) all_pairs = list(itertools.combinations(all_ids, 2)) - log.info('Genearating all pairs ... done') + log.info("Genearating all pairs ... done") log.info(f'Number of valid ids {format(len(all_ids), ",")}') # remove all "chunk_*.parquet" files in prediction_dir if they exist - pattern = os.path.join(prediction_dir, 'chunk_*.parquet') + pattern = os.path.join(prediction_dir, "chunk_*.parquet") matching_files = glob.glob(pattern) for file in matching_files: os.remove(file) @@ -558,34 +659,46 @@ def predict_all_pairs(model_dict, all_ids, feature_type, ppi_feature, cc_dict, for ft in model_dict: log.info(f'Predicting all pairs ({format(len(all_pairs), ",")}) for {ft} ...') model = model_dict[ft] + def process_and_save_chunk(start_idx, chunk_id): - chunk = all_pairs[start_idx:start_idx + chunk_size] - chunk_df = pd.DataFrame(chunk, columns=['P1', 'P2']) - if ft == 'ex': + chunk = all_pairs[start_idx : start_idx + chunk_size] + chunk_df = pd.DataFrame(chunk, columns=["P1", "P2"]) + if ft == "ex": cur_ppi_feature = None else: cur_ppi_feature = ppi_feature - feature_df = extract_features(chunk_df, feature_type, cc_dict, cur_ppi_feature, - extra_feature_file, mr_dict) + feature_df = extract_features( + chunk_df, + feature_type, + cc_dict, + cur_ppi_feature, + extra_feature_file, + mr_dict, + ) predictions = model.predict_proba(feature_df) - prediction_df = pd.DataFrame(predictions[:, 1], columns=['prediction']) - prediction_df['P1'] = chunk_df['P1'] - prediction_df['P2'] = chunk_df['P2'] - prediction_df = prediction_df[['P1', 'P2', 'prediction']] - prediction_df['prediction'] = prediction_df['prediction'].astype('float32') + prediction_df = pd.DataFrame(predictions[:, 1], columns=["prediction"]) + prediction_df["P1"] = chunk_df["P1"] + prediction_df["P2"] = chunk_df["P2"] + prediction_df = prediction_df[["P1", "P2", "prediction"]] + prediction_df["prediction"] = prediction_df["prediction"].astype("float32") table = pa.Table.from_pandas(prediction_df) chunk_id = str(chunk_id).zfill(6) - output_file = f'{prediction_dir}/chunk_{chunk_id}.parquet' + output_file = f"{prediction_dir}/chunk_{chunk_id}.parquet" pq.write_table(table, output_file) with ThreadPoolExecutor(max_workers=n_jobs) as executor: - for chunk_id, chunk_start in enumerate(range(0, len(all_pairs), chunk_size)): + for chunk_id, chunk_start in enumerate( + range(0, len(all_pairs), chunk_size) + ): executor.submit(process_and_save_chunk, chunk_start, chunk_id) - pattern = os.path.join(prediction_dir, 'chunk_*.parquet') + pattern = os.path.join(prediction_dir, "chunk_*.parquet") matching_files = glob.glob(pattern) matching_files.sort() - pq.write_table(pa.concat_tables([pq.read_table(file) for file in matching_files]), output_file[ft]) + pq.write_table( + pa.concat_tables([pq.read_table(file) for file in matching_files]), + output_file[ft], + ) for file in matching_files: 
os.remove(file) From e53c78ff372ae8760a062cf5a12e5692b61b4207 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 12:37:11 -0500 Subject: [PATCH 02/54] add documentation for extra-feature file --- funmap/funmap.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 01aa93cb..00565c9e 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -26,7 +26,9 @@ log = setup_logger(__name__) -def get_valid_gs_data(gs_path: str, valid_gene_list: List[str], md5=None): +def get_valid_gs_data( + gs_path: str, valid_gene_list: List[str], md5=None +): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') if is_url_scheme(gs_path): gs_edge_df = read_csv_with_md5_check( @@ -271,6 +273,26 @@ def assemble_feature_df(h5_file_mapping, df, dataset="cc"): def extract_features( df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None ): + """ + extract_features - creates the final feature `pandas` dataframe used by xgboost + + Params: + df: pd.DataFrame + feature_type: str - Type of feature used. Currently supporting pearson correlation or mutual rank. Accepted values: 'cc' or 'mr'. + cc_dict: dict[str, str] - dictionary containing paths to the pre-calculated features. + ... + extra_feature: str - path for file containing extra features + + Extra Features: + + This is a TSV file with the following format: + + Gene A Gene B Feature X Feature Y + -------- -------- ----------- ----------- + ABC1 DEF2 0.12 34.5 + GHI3 JKLM4 -0.6 NA + … … … … + """ if feature_type == "mr": if not mr_dict: raise ValueError("mr dict is empty") From fa90597a13c32655963f883085498788f7268a59 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 14:58:41 -0500 Subject: [PATCH 03/54] Add support for adding extra_feature_file --- funmap/funmap.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 00565c9e..816d11d6 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -19,7 +19,12 @@ from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold import xgboost as xgb -from funmap.utils import get_data_dict, is_url_scheme, read_csv_with_md5_check +from funmap.utils import ( + get_data_dict, + is_url_scheme, + read_csv_with_md5_check, + check_extra_feature_file, +) from funmap.data_urls import network_info, misc_urls as urls from funmap.logger import setup_logger @@ -308,8 +313,30 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - pass - + if check_extra_feature_file(extra_feature): + log.info(f"Loading extra features from {extra_feature}") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + log.info( + f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" + ) + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[0] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + old_feature_count = feature_df.shape[0] + feature_df = pd.merge( + feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + ) + log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") + else: + log.warning( + "File 
format verification failed. Skipping adding extra feature." + ) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From bad4f6da97180d190b6a2c63c5d3a3a53a364243 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:19:12 -0500 Subject: [PATCH 04/54] remove extra_file check --- funmap/funmap.py | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 816d11d6..518d0f69 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -313,30 +313,25 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - if check_extra_feature_file(extra_feature): - log.info(f"Loading extra features from {extra_feature}") - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - log.info( - f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" - ) - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[0] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - old_feature_count = feature_df.shape[0] - feature_df = pd.merge( - feature_df, extra_feature_df, on=["P1", "P2"], how="outer" - ) - log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") - else: - log.warning( - "File format verification failed. Skipping adding extra feature." - ) + log.info(f"Loading extra features from {extra_feature}") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + log.info( + f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" + ) + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[0] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + old_feature_count = feature_df.shape[0] + feature_df = pd.merge( + feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + ) + log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From a9927df2e126b4f90cd3b7e83be288dbea337229 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:30:48 -0500 Subject: [PATCH 05/54] skip feature file check --- funmap/cli.py | 368 +++++++++++++++++++++++++++++--------------------- 1 file changed, 216 insertions(+), 152 deletions(-) diff --git a/funmap/cli.py b/funmap/cli.py index 8094bf1f..8cd7cf34 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -7,7 +7,7 @@ from pathlib import Path from funmap.funmap import compute_llr, predict_all_pairs, dataset_llr, predict_all_pairs from funmap.plotting import explore_data, plot_results, merge_and_delete -from funmap.funmap import train_ml_model, prepare_gs_data, get_cutoff, get_ppi_feature +from funmap.funmap import train_ml_model, prepare_gs_data, get_cutoff, get_ppi_feature from funmap.funmap import compute_features, predict_network from funmap.data_urls import misc_urls as urls from funmap.logger import setup_logging, setup_logger @@ -17,250 +17,313 @@ log = setup_logger(__name__) -@click.group(help='funmap command line interface') -@click.version_option(version=f'{__version__}') + 
+@click.group(help="funmap command line interface") +@click.version_option(version=f"{__version__}") def cli(): """ Command line interface for funmap. """ click.echo("====== funmap =======") -@cli.command(help='check the data quality') -@click.option('--config-file', '-c', required=True, type=click.Path(exists=True), - help='path to experiment configuration yaml file') -@click.option('--force-rerun', '-f', is_flag=True, default=False, - help='if set, will remove results from previous run first') + +@cli.command(help="check the data quality") +@click.option( + "--config-file", + "-c", + required=True, + type=click.Path(exists=True), + help="path to experiment configuration yaml file", +) +@click.option( + "--force-rerun", + "-f", + is_flag=True, + default=False, + help="if set, will remove results from previous run first", +) def qc(config_file, force_rerun): if force_rerun: while True: - confirmation = input("Do you want to remove results from previous run? (y/n): ") - if confirmation.lower() == 'y': - click.echo('Removing results from previous run') + confirmation = input( + "Do you want to remove results from previous run? (y/n): " + ) + if confirmation.lower() == "y": + click.echo("Removing results from previous run") cleanup_experiment(config_file) break - elif confirmation.lower() == 'n': - click.echo('Not removing results from previous run') + elif confirmation.lower() == "n": + click.echo("Not removing results from previous run") break else: click.echo("Invalid input. Please enter 'y' or 'n'.") setup_logging(config_file) - log.info(f'Running QC...') + log.info(f"Running QC...") cfg = setup_experiment(config_file) all_fig_names = [] - figure_dir = Path(cfg['results_dir']) / cfg['subdirs']['figure_dir'] - min_sample_count = cfg['min_sample_count'] + figure_dir = Path(cfg["results_dir"]) / cfg["subdirs"]["figure_dir"] + min_sample_count = cfg["min_sample_count"] fig_names = explore_data(cfg, min_sample_count, figure_dir) all_fig_names.extend(fig_names) - merge_and_delete(figure_dir, all_fig_names, 'qc.pdf') - log.info('figure qc.pdf saved to {}'.format(figure_dir)) - log.info('QC complete') + merge_and_delete(figure_dir, all_fig_names, "qc.pdf") + log.info("figure qc.pdf saved to {}".format(figure_dir)) + log.info("QC complete") -@cli.command(help='run funmap') -@click.option('--config-file', '-c', required=True, type=click.Path(exists=True), - help='path to experiment configuration yaml file') -@click.option('--force-rerun', '-f', is_flag=True, default=False, - help='if set, will remove results from previous run first') +@cli.command(help="run funmap") +@click.option( + "--config-file", + "-c", + required=True, + type=click.Path(exists=True), + help="path to experiment configuration yaml file", +) +@click.option( + "--force-rerun", + "-f", + is_flag=True, + default=False, + help="if set, will remove results from previous run first", +) def run(config_file, force_rerun): - click.echo(f'Running funmap...') + click.echo(f"Running funmap...") if force_rerun: while True: - confirmation = input("Do you want to remove results from previous run? (y/n): ") - if confirmation.lower() == 'y': - click.echo('Removing results from previous run') + confirmation = input( + "Do you want to remove results from previous run? 
(y/n): " + ) + if confirmation.lower() == "y": + click.echo("Removing results from previous run") cleanup_experiment(config_file) break - elif confirmation.lower() == 'n': - click.echo('Not removing results from previous run') + elif confirmation.lower() == "n": + click.echo("Not removing results from previous run") break else: click.echo("Invalid input. Please enter 'y' or 'n'.") setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg['extra_feature_file'] - if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): - return - gs_file = cfg['gs_file'] + extra_feature_file = cfg["extra_feature_file"] + # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): + # return + gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return - task = cfg['task'] - seed = cfg['seed'] + task = cfg["task"] + seed = cfg["seed"] np.random.seed(seed) - ml_type = cfg['ml_type'] - feature_type = cfg['feature_type'] + ml_type = cfg["ml_type"] + feature_type = cfg["feature_type"] # min_feature_count = cfg['min_feature_count'] - min_sample_count = cfg['min_sample_count'] + min_sample_count = cfg["min_sample_count"] # filter_before_prediction = cfg['filter_before_prediction'] - test_size = cfg['test_size'] + test_size = cfg["test_size"] # filter_after_prediction = cfg['filter_after_prediction'] # filter_criterion = cfg['filter_criterion'] # filter_threshold = cfg['filter_threshold'] # filter_blacklist = cfg['filter_blacklist'] - n_jobs = cfg['n_jobs'] - lr_cutoff = cfg['lr_cutoff'] - max_num_edges = cfg['max_num_edges'] - step_size = cfg['step_size'] - start_edge_num = cfg['start_edge_num'] + n_jobs = cfg["n_jobs"] + lr_cutoff = cfg["lr_cutoff"] + max_num_edges = cfg["max_num_edges"] + step_size = cfg["step_size"] + start_edge_num = cfg["start_edge_num"] - results_dir = Path(cfg['results_dir']) - saved_data_dir = results_dir / cfg['subdirs']['saved_data_dir'] - model_dir = results_dir / cfg['subdirs']['saved_model_dir'] - prediction_dir = results_dir / cfg['subdirs']['saved_predictions_dir'] - network_dir = results_dir / cfg['subdirs']['network_dir'] - figure_dir = results_dir / cfg['subdirs']['figure_dir'] + results_dir = Path(cfg["results_dir"]) + saved_data_dir = results_dir / cfg["subdirs"]["saved_data_dir"] + model_dir = results_dir / cfg["subdirs"]["saved_model_dir"] + prediction_dir = results_dir / cfg["subdirs"]["saved_predictions_dir"] + network_dir = results_dir / cfg["subdirs"]["network_dir"] + figure_dir = results_dir / cfg["subdirs"]["figure_dir"] - if cfg['task'] == 'protein_func': - feature_mapping = ['ex', 'ei'] + if cfg["task"] == "protein_func": + feature_mapping = ["ex", "ei"] else: - feature_mapping = ['ex'] + feature_mapping = ["ex"] # here the file stored a dictionary of ml models - ml_model_file = {feature: model_dir / f'model_{feature}.pkl.gz' - for feature in feature_mapping } - predicted_all_pairs_file = {feature: prediction_dir / f'predicted_all_pairs_{feature}.parquet' - for feature in feature_mapping } - llr_res_file = {feature: results_dir / f'llr_results_{feature}.tsv' - for feature in feature_mapping } - edge_list_file = {feature: network_dir/ f'funmap_{feature}.tsv' - for feature in feature_mapping } + ml_model_file = { + feature: model_dir / f"model_{feature}.pkl.gz" for feature in feature_mapping + } + predicted_all_pairs_file = { + feature: prediction_dir / f"predicted_all_pairs_{feature}.parquet" + for feature in feature_mapping + } 
+ llr_res_file = { + feature: results_dir / f"llr_results_{feature}.tsv" + for feature in feature_mapping + } + edge_list_file = { + feature: network_dir / f"funmap_{feature}.tsv" for feature in feature_mapping + } # gold standard data include specified feature (cc or mr) and ppi feature (if applicable) # and extra feature if applicable - gs_df_file = saved_data_dir / 'gold_standard_data.h5' + gs_df_file = saved_data_dir / "gold_standard_data.h5" # blacklist_file = urls['funmap_blacklist'] # llr obtained with each invividual dataset - llr_dataset_file = results_dir / 'llr_dataset.tsv' + llr_dataset_file = results_dir / "llr_dataset.tsv" gs_train = gs_test = None cutoff_p = cutoff_llr = None ml_model_dict = {} # compute and save cc, mr results - cc_dict, mr_dict, all_valid_ids = compute_features(cfg, feature_type, min_sample_count, - saved_data_dir) + cc_dict, mr_dict, all_valid_ids = compute_features( + cfg, feature_type, min_sample_count, saved_data_dir + ) gs_args = { - 'task': task, - 'saved_data_dir': saved_data_dir, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'feature_type': feature_type, - 'gs_file': gs_file, - 'extra_feature_file': extra_feature_file, - 'valid_id_list': all_valid_ids, - 'test_size': test_size, - 'seed': seed + "task": task, + "saved_data_dir": saved_data_dir, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "feature_type": feature_type, + "gs_file": gs_file, + "extra_feature_file": extra_feature_file, + "valid_id_list": all_valid_ids, + "test_size": test_size, + "seed": seed, } - all_edge_list_exist = all(os.path.exists(file_path) for file_path in edge_list_file.values()) + all_edge_list_exist = all( + os.path.exists(file_path) for file_path in edge_list_file.values() + ) if all_edge_list_exist: - log.info('Fumap network(s) already exists. Skipping model training and prediction.') + log.info( + "Fumap network(s) already exists. Skipping model training and prediction." + ) else: - all_model_exist = all(os.path.exists(file_path) for file_path in ml_model_file.values()) + all_model_exist = all( + os.path.exists(file_path) for file_path in ml_model_file.values() + ) if all_model_exist: - log.info(f'Trained model(s) exists. Loading model(s) ...') + log.info(f"Trained model(s) exists. Loading model(s) ...") ml_model_dict = {} # feature: ex or ei for feature in ml_model_file: - with gzip.open(ml_model_file[feature], 'rb') as fh: + with gzip.open(ml_model_file[feature], "rb") as fh: ml_model = pickle.load(fh) ml_model_dict[feature] = ml_model - log.info(f'Loading model(s) ... done') + log.info(f"Loading model(s) ... done") if not gs_df_file.exists(): - log.error(f'Trained models found but gold standard data file {gs_df_file} ' - f'does not exist.') + log.error( + f"Trained models found but gold standard data file {gs_df_file} " + f"does not exist." 
+ ) return - with pd.HDFStore(gs_df_file, mode='r') as store: - gs_train = store['train'] - gs_test = store['test'] + with pd.HDFStore(gs_df_file, mode="r") as store: + gs_train = store["train"] + gs_test = store["test"] else: gs_train, gs_test = prepare_gs_data(**gs_args) - with pd.HDFStore(gs_df_file, mode='w') as store: - store.put('train', gs_train) - store.put('test', gs_test) - ml_model_dict = train_ml_model(gs_train, ml_type, seed, n_jobs, feature_mapping, - model_dir) + with pd.HDFStore(gs_df_file, mode="w") as store: + store.put("train", gs_train) + store.put("test", gs_test) + ml_model_dict = train_ml_model( + gs_train, ml_type, seed, n_jobs, feature_mapping, model_dir + ) - all_predicted_all_pairs_exist = all(os.path.exists(file_path) for file_path in - predicted_all_pairs_file.values()) + all_predicted_all_pairs_exist = all( + os.path.exists(file_path) for file_path in predicted_all_pairs_file.values() + ) if all_predicted_all_pairs_exist: - log.info('Predicted all pairs already exists. Skipping prediction.') + log.info("Predicted all pairs already exists. Skipping prediction.") else: - log.info('Predicting all pairs ...') - if task == 'protein_func': + log.info("Predicting all pairs ...") + if task == "protein_func": ppi_feature = get_ppi_feature() else: ppi_feature = None pred_all_pairs_args = { - 'model_dict': ml_model_dict, - 'all_ids': all_valid_ids, - 'feature_type': feature_type, - 'ppi_feature': ppi_feature, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'extra_feature_file': extra_feature_file, - 'prediction_dir': prediction_dir, - 'output_file': predicted_all_pairs_file, - 'n_jobs': n_jobs + "model_dict": ml_model_dict, + "all_ids": all_valid_ids, + "feature_type": feature_type, + "ppi_feature": ppi_feature, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "extra_feature_file": extra_feature_file, + "prediction_dir": prediction_dir, + "output_file": predicted_all_pairs_file, + "n_jobs": n_jobs, } predict_all_pairs(**pred_all_pairs_args) - log.info('Predicting all pairs ... done') + log.info("Predicting all pairs ... 
done") cutoff_p, cutoff_llr = get_cutoff(ml_model_dict, gs_test, lr_cutoff) - log.info(f'cutoff probability: {cutoff_p}') - log.info(f'cutoff llr: {cutoff_llr}') + log.info(f"cutoff probability: {cutoff_p}") + log.info(f"cutoff llr: {cutoff_llr}") predict_network(predicted_all_pairs_file, cutoff_p, edge_list_file) if not gs_df_file.exists(): gs_train, gs_test = prepare_gs_data(**gs_args) - with pd.HDFStore(gs_df_file, mode='w') as store: - store.put('train', gs_train) - store.put('test', gs_test) + with pd.HDFStore(gs_df_file, mode="w") as store: + store.put("train", gs_train) + store.put("test", gs_test) else: if gs_test is None: - with pd.HDFStore(gs_df_file, mode='r') as store: - gs_train = store['train'] - gs_test = store['test'] + with pd.HDFStore(gs_df_file, mode="r") as store: + gs_train = store["train"] + gs_test = store["test"] - all_llr_res_exist = all(os.path.exists(file_path) for file_path in llr_res_file.values()) - all_edge_list_exist = all(os.path.exists(file_path) for file_path in edge_list_file.values()) + all_llr_res_exist = all( + os.path.exists(file_path) for file_path in llr_res_file.values() + ) + all_edge_list_exist = all( + os.path.exists(file_path) for file_path in edge_list_file.values() + ) if all_llr_res_exist and all_edge_list_exist: - log.info('LLR results already exist.') + log.info("LLR results already exist.") else: for ft in feature_mapping: - log.info(f'Computing LLR for {ft} ...') + log.info(f"Computing LLR for {ft} ...") if not predicted_all_pairs_file[ft].exists(): - log.error(f'Predicted all pairs file {predicted_all_pairs_file[ft]} does not exist.') + log.error( + f"Predicted all pairs file {predicted_all_pairs_file[ft]} does not exist." + ) return predicted_all_pairs = pd.read_parquet(predicted_all_pairs_file[ft]) # also save the llr results to file - compute_llr(predicted_all_pairs, llr_res_file[ft], start_edge_num, max_num_edges, step_size, - gs_test) - log.info(f'Computing LLR for {ft} ... done') + compute_llr( + predicted_all_pairs, + llr_res_file[ft], + start_edge_num, + max_num_edges, + step_size, + gs_test, + ) + log.info(f"Computing LLR for {ft} ... done") validation_res = {} for ft in feature_mapping: validation_res[ft] = { - 'llr_res_path': llr_res_file[ft], - 'edge_list_path': edge_list_file[ft] + "llr_res_path": llr_res_file[ft], + "edge_list_path": edge_list_file[ft], } if not llr_dataset_file.exists(): - log.info('Computing LLR for each dataset ...') + log.info("Computing LLR for each dataset ...") # feature_dict = cc_dict if feature_type == 'cc' else mr_dict # use CC features for individual dataset - llr_ds = dataset_llr(all_valid_ids, cc_dict, 'cc', gs_test, start_edge_num, - max_num_edges, step_size, llr_dataset_file) - log.info('Done.') + llr_ds = dataset_llr( + all_valid_ids, + cc_dict, + "cc", + gs_test, + start_edge_num, + max_num_edges, + step_size, + llr_dataset_file, + ) + log.info("Done.") else: - llr_ds = pd.read_csv(llr_dataset_file, sep='\t') + llr_ds = pd.read_csv(llr_dataset_file, sep="\t") if not ml_model_dict: - log.info(f'Trained model(s) exists. Loading model(s) ...') + log.info(f"Trained model(s) exists. Loading model(s) ...") for feature in ml_model_file: - with gzip.open(ml_model_file[feature], 'rb') as fh: + with gzip.open(ml_model_file[feature], "rb") as fh: ml_model = pickle.load(fh) ml_model_dict[feature] = ml_model - log.info(f'Loading model(s) ... done') + log.info(f"Loading model(s) ... 
done") all_fig_names = [] if cutoff_llr is None: @@ -268,30 +331,31 @@ def run(config_file, force_rerun): gs_dict = {} gs_dict[feature_type.upper()] = gs_train - if task == 'protein_func' and feature_type.upper() == 'MR' and 'rp_pairs' in cfg: + if task == "protein_func" and feature_type.upper() == "MR" and "rp_pairs" in cfg: # extract gs data for CC and MR for plotting gs_args = { - 'task': task, - 'saved_data_dir': saved_data_dir, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'feature_type': 'cc', - 'gs_file': gs_file, - # no extra feature for plotting - 'extra_feature_file': None, - 'valid_id_list': all_valid_ids, - 'test_size': test_size, - 'seed': seed - } + "task": task, + "saved_data_dir": saved_data_dir, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "feature_type": "cc", + "gs_file": gs_file, + # no extra feature for plotting + "extra_feature_file": None, + "valid_id_list": all_valid_ids, + "test_size": test_size, + "seed": seed, + } gs_train, gs_test = prepare_gs_data(**gs_args) - gs_dict['CC'] = gs_train + gs_dict["CC"] = gs_train - fig_names = plot_results(cfg, validation_res, llr_ds, gs_dict, cutoff_llr, - figure_dir) + fig_names = plot_results( + cfg, validation_res, llr_ds, gs_dict, cutoff_llr, figure_dir + ) all_fig_names.extend(fig_names) - merge_and_delete(figure_dir, all_fig_names, 'results.pdf') + merge_and_delete(figure_dir, all_fig_names, "results.pdf") -if __name__ == '__main__': +if __name__ == "__main__": cli() From 67ca6c53e1e25967cd0cc5d1934ba832bb19ef7f Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:49:47 -0500 Subject: [PATCH 06/54] upgrade version number to 0.1.21 fix column names for extra feature df --- funmap/funmap.py | 2 +- setup.cfg | 2 +- setup.py | 64 ++++++++++++++++++++++++------------------------ 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 518d0f69..02467aae 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -319,7 +319,7 @@ def extract_features( f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" ) extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[0] = "P2" + extra_feature_df.columns.values[1] = "P2" extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( lambda row: sorted([row["P1"], row["P2"]]) if row["P1"] > row["P2"] diff --git a/setup.cfg b/setup.cfg index 63a95df9..c12570fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.20 +current_version = 0.1.21 commit = True tag = True diff --git a/setup.py b/setup.py index 48b00773..af504f72 100644 --- a/setup.py +++ b/setup.py @@ -5,48 +5,48 @@ from setuptools import setup, find_packages requirements = [ - 'pyyaml==6.0.1', - 'xgboost==2.0.0', - 'numpy==1.24.4', - 'scipy==1.10.1', - 'pyarrow==13.0.0', - 'pandas==2.0.3', - 'joblib==1.3.2', - 'matplotlib==3.7.3', - 'seaborn==0.13.0', - 'scikit-learn==1.3.2', - 'imbalanced-learn==0.11.0', - 'tqdm==4.66.1', - 'PyPDF2==3.0.1', - 'matplotlib_venn==0.11.9', - 'networkx==3.1', - 'powerlaw==1.5', - 'click==8.1.7', - 'h5py==3.10.0', - 'tables==3.8.0' - ] + "pyyaml==6.0.1", + "xgboost==2.0.0", + "numpy==1.24.4", + "scipy==1.10.1", + "pyarrow==13.0.0", + "pandas==2.0.3", + "joblib==1.3.2", + "matplotlib==3.7.3", + "seaborn==0.13.0", + "scikit-learn==1.3.2", + "imbalanced-learn==0.11.0", + "tqdm==4.66.1", + "PyPDF2==3.0.1", + "matplotlib_venn==0.11.9", + "networkx==3.1", + "powerlaw==1.5", + "click==8.1.7", + "h5py==3.10.0", + "tables==3.8.0", +] 
-test_requirements = [ ] +test_requirements = [] setup( author="Zhiao Shi", - author_email='zhiao.shi@gmail.com', - python_requires='>=3.7', + author_email="zhiao.shi@gmail.com", + python_requires=">=3.7", description="generate gene co-function networks using omics data", entry_points={ - 'console_scripts': [ - 'funmap=funmap.cli:cli', + "console_scripts": [ + "funmap=funmap.cli:cli", ], }, install_requires=requirements, - license='MIT license', + license="MIT license", include_package_data=True, - keywords=['funmap', 'bioinformatics', 'biological-network'], - name='funmap', - packages=find_packages(include=['funmap', 'funmap.*']), - test_suite='tests', + keywords=["funmap", "bioinformatics", "biological-network"], + name="funmap", + packages=find_packages(include=["funmap", "funmap.*"]), + test_suite="tests", tests_require=test_requirements, - url='https://github.com/bzhanglab/funmap', - version='0.1.20', + url="https://github.com/bzhanglab/funmap", + version="0.1.21", zip_safe=False, ) From 05b316dafd5744b46981eb21ae9933b5c97cd8ed Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 09:42:57 -0500 Subject: [PATCH 07/54] diagnostic printing --- funmap/funmap.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 02467aae..4d7bd36a 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,10 +327,13 @@ def extract_features( axis=1, result_type="expand", ) + log.info(str(feature_df.head())) + log.info("Extra feature:\n" + str(extra_feature_df.head())) old_feature_count = feature_df.shape[0] feature_df = pd.merge( feature_df, extra_feature_df, on=["P1", "P2"], how="outer" ) + log.info("Merged Data Frame:\n" + str(feature_df.head())) log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: From 4cad093a963b8a3df7d3052b788a1e49858359e9 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:26:35 -0500 Subject: [PATCH 08/54] Add extra_feature_file genes to all_valid_ids More diagnostic logging --- funmap/funmap.py | 25 ++++++++++++++++++++++++- funmap/utils.py | 15 +++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 4d7bd36a..d07f3369 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -31,7 +31,7 @@ log = setup_logger(__name__) -def get_valid_gs_data( +def get_valid_gs_dat( gs_path: str, valid_gene_list: List[str], md5=None ): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') @@ -330,9 +330,32 @@ def extract_features( log.info(str(feature_df.head())) log.info("Extra feature:\n" + str(extra_feature_df.head())) old_feature_count = feature_df.shape[0] + + # Get gene indices for P1 and P2 + # df.reset_index(drop=True, inplace=True) + log.info("df head") + log.info(str(df.head())) + # gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} + # p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) + # p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) + # + # f_dataset = h5_file[dataset] + # f_values = np.empty(len(df), dtype=float) + # valid_indices = (p1_indices != -1) & (p2_indices != -1) + # + # linear_indices_func = ( + # lambda row_indices, col_indices, n: np.array(col_indices) + # - np.array(row_indices) + # + (2 * n - np.array(row_indices) + 1) 
* np.array(row_indices) // 2 + # ) + # linear_indices = linear_indices_func( + # p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + # ) feature_df = pd.merge( feature_df, extra_feature_df, on=["P1", "P2"], how="outer" ) + f_values[valid_indices] = f_dataset[:][linear_indices] + f_values[~valid_indices] = np.nan log.info("Merged Data Frame:\n" + str(feature_df.head())) log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists diff --git a/funmap/utils.py b/funmap/utils.py index 1f769274..b3e4c595 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -67,7 +67,7 @@ def normalize_filename(filename): return cleaned_filename -def get_data_dict(config, min_sample_count=15): + def get_data_dict(config, min_sample_count=15): """ Returns a dictionary of data from the provided data configuration, filtered to only include genes that are coding and have at least `min_sample_count` samples. @@ -121,7 +121,18 @@ def get_data_dict(config, min_sample_count=15): # valid_count = np.sum(is_valid) valid_p = cur_data.columns[is_valid].values all_valid_ids = all_valid_ids.union(set(valid_p)) - + if config["extra_feature_file"] is not None: + log.info("Importing extra feature file") + # TODO: Import extra feature file + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + if config['filter_noncoding_genes']: + col1 = extra_feature_df.columns.values[0] + col2 = extra_feature_df.columns.values[1] + coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] + coding_genes = list(set(coding['gene_name'].to_list())) + extra_feature_df = extra_feature_df[extra_feature_df[col1].isin(coding_genes) & extra_feature_df[col2].isin(coding_genes)] + genes = set(extra_feature_df.iloc[:, :2].values.flatten().tolist()) + all_valid_ids = all_valid_ids.union(genes) all_valid_ids = list(all_valid_ids) all_valid_ids.sort() log.info(f'total number of valid ids: {len(all_valid_ids)}') From ed5bf9f5cae41438b825c27dc7f6ab1a8863aab6 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:30:34 -0500 Subject: [PATCH 09/54] fix indent issue --- funmap/utils.py | 379 ++++++++++++++++++++++++++---------------------- 1 file changed, 208 insertions(+), 171 deletions(-) diff --git a/funmap/utils.py b/funmap/utils.py index b3e4c595..9bebb567 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -17,12 +17,15 @@ def is_url_scheme(path): parsed = urlparse(path) - if parsed.scheme == 'file': + if parsed.scheme == "file": return False - return parsed.scheme != '' + return parsed.scheme != "" -def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file.csv', **kwargs): + +def read_csv_with_md5_check( + url, expected_md5=None, local_path="downloaded_file.csv", **kwargs +): try: response = urllib.request.urlopen(url) content = response.read() @@ -30,11 +33,13 @@ def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file. if expected_md5: md5_hash = hashlib.md5(content).hexdigest() if md5_hash != expected_md5: - log.error('gold standard file: MD5 hash mismatch, file may be corrupted.') + log.error( + "gold standard file: MD5 hash mismatch, file may be corrupted." + ) raise ValueError("MD5 hash mismatch, file may be corrupted.") # Save the content to a local file - with open(local_path, 'wb') as f: + with open(local_path, "wb") as f: f.write(content) df = pd.read_csv(local_path, **kwargs) @@ -44,13 +49,13 @@ def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file. 
return None -def check_gs_files_exist(file_dict, key='CC'): - if key.upper() == 'CC': - paths = file_dict.get('CC') +def check_gs_files_exist(file_dict, key="CC"): + if key.upper() == "CC": + paths = file_dict.get("CC") if paths is not None and os.path.exists(paths): return True - elif key.upper() == 'MR': - paths = file_dict.get('MR') + elif key.upper() == "MR": + paths = file_dict.get("MR") if paths is not None and all(os.path.exists(p) for p in paths): return True else: @@ -61,13 +66,13 @@ def check_gs_files_exist(file_dict, key='CC'): def normalize_filename(filename): # Remove any characters that are not allowed in filenames - cleaned_filename = re.sub(r'[^\w\s.-]', '', filename) + cleaned_filename = re.sub(r"[^\w\s.-]", "", filename) # Replace spaces with underscores - cleaned_filename = cleaned_filename.replace(' ', '_') + cleaned_filename = cleaned_filename.replace(" ", "_") return cleaned_filename - def get_data_dict(config, min_sample_count=15): +def get_data_dict(config, min_sample_count=15): """ Returns a dictionary of data from the provided data configuration, filtered to only include genes that are coding and have at least `min_sample_count` samples. @@ -79,41 +84,42 @@ def get_data_dict(config, min_sample_count=15): the data from the corresponding file. """ - data_file = config['data_path'] + data_file = config["data_path"] data_dict = {} - if 'filter_noncoding_genes' in config and config['filter_noncoding_genes']: - mapping = pd.read_csv(urls['mapping_file'], sep='\t') + if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: + mapping = pd.read_csv(urls["mapping_file"], sep="\t") # extract the data file from the tar.gz file - tmp_dir = 'tmp_data' + tmp_dir = "tmp_data" if not os.path.exists(tmp_dir): os.mkdir(tmp_dir) - os.system(f'tar -xzf {data_file} --strip-components=1 -C {tmp_dir}') + os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") # gene ids are gene symbols - for dt in config['data_files']: + for dt in config["data_files"]: log.info(f"processing ... {dt['name']}") - cur_feature = dt['name'] + cur_feature = dt["name"] # cur_file = get_obj_from_tgz(data_file, dt['path']) # extract the data file from the tar.gz file # cur_data= get_obj_from_tgz(data_file, dt['path']) - cur_data = pd.read_csv(os.path.join(tmp_dir, dt['path']), sep='\t', index_col=0, - header=0) + cur_data = pd.read_csv( + os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 + ) if cur_data.shape[1] < min_sample_count: log.info(f"... {dt['name']} ... 
not enough samples, skipped") continue cur_data = cur_data.T # exclude cohort with sample number < min_sample_count # remove noncoding genes first - if config['filter_noncoding_genes']: - coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] - coding_genes = list(set(coding['gene_name'].to_list())) + if config["filter_noncoding_genes"]: + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] # duplicated columns, for now select the last column - cur_data = cur_data.loc[:,~cur_data.columns.duplicated(keep='last')] + cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] data_dict[cur_feature] = cur_data shutil.rmtree(tmp_dir) - log.info('filtering out non valid ids ...') + log.info("filtering out non valid ids ...") all_valid_ids = set() for i in data_dict: cur_data = data_dict[i] @@ -125,17 +131,20 @@ def get_data_dict(config, min_sample_count=15): log.info("Importing extra feature file") # TODO: Import extra feature file extra_feature_df = pd.read_csv(extra_feature, sep="\t") - if config['filter_noncoding_genes']: + if config["filter_noncoding_genes"]: col1 = extra_feature_df.columns.values[0] col2 = extra_feature_df.columns.values[1] - coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] - coding_genes = list(set(coding['gene_name'].to_list())) - extra_feature_df = extra_feature_df[extra_feature_df[col1].isin(coding_genes) & extra_feature_df[col2].isin(coding_genes)] + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) + extra_feature_df = extra_feature_df[ + extra_feature_df[col1].isin(coding_genes) + & extra_feature_df[col2].isin(coding_genes) + ] genes = set(extra_feature_df.iloc[:, :2].values.flatten().tolist()) all_valid_ids = all_valid_ids.union(genes) all_valid_ids = list(all_valid_ids) all_valid_ids.sort() - log.info(f'total number of valid ids: {len(all_valid_ids)}') + log.info(f"total number of valid ids: {len(all_valid_ids)}") # filter out columns that are not in all_valid_ids for i in data_dict: @@ -143,10 +152,10 @@ def get_data_dict(config, min_sample_count=15): selected_columns = cur_data.columns.intersection(all_valid_ids) cur_data = cur_data[selected_columns] # it is possible the entire column is nan, remove it - cur_data = cur_data.dropna(axis=1, how='all') + cur_data = cur_data.dropna(axis=1, how="all") data_dict[i] = cur_data - log.info(f'{i} -- ') - log.info(f' samples x ids: {cur_data.shape}') + log.info(f"{i} -- ") + log.info(f" samples x ids: {cur_data.shape}") return data_dict, all_valid_ids @@ -187,11 +196,14 @@ def get_node_edge(edge_list): # remove duplidated rows in edge_list edge_list = edge_list.drop_duplicates() n_edge = len(edge_list) - nodes = set(edge_list.iloc[:,0].to_list()) | set(edge_list.iloc[:,1].to_list()) - return(dict(n_node = len(nodes), n_edge = n_edge, - edge_per_node = n_edge / len(nodes), - nodes = list(nodes), - edges = edge_list)) + nodes = set(edge_list.iloc[:, 0].to_list()) | set(edge_list.iloc[:, 1].to_list()) + return dict( + n_node=len(nodes), + n_edge=n_edge, + edge_per_node=n_edge / len(nodes), + nodes=list(nodes), + edges=edge_list, + ) def get_node_edge_overlap(network_info): @@ -208,199 +220,206 @@ def get_node_edge_overlap(network_info): overlap : dict A dictionary with the node and edge overlap between networks. 
""" - networks = pd.DataFrame(columns = ['name', 'type', 'n_node', - 'n_edge', 'edge_per_node', - 'nodes', 'edges']) + networks = pd.DataFrame( + columns=["name", "type", "n_node", "n_edge", "edge_per_node", "nodes", "edges"] + ) for _, row in network_info.iterrows(): - network_name = row['name'] - network_type = row['type'] - network_el = row['el'] + network_name = row["name"] + network_type = row["type"] + network_el = row["el"] res = get_node_edge(network_el) - cur_df = pd.DataFrame({'name': [network_name], - 'type': [network_type], - 'n_node': [int(res['n_node'])], - 'n_edge': [int(res['n_edge'])], - 'edge_per_node': [res['edge_per_node']], - 'nodes': [res['nodes']], - 'edges': [res['edges']]}) + cur_df = pd.DataFrame( + { + "name": [network_name], + "type": [network_type], + "n_node": [int(res["n_node"])], + "n_edge": [int(res["n_edge"])], + "edge_per_node": [res["edge_per_node"]], + "nodes": [res["nodes"]], + "edges": [res["edges"]], + } + ) networks = pd.concat([networks, cur_df], ignore_index=True) # overlap of nodes and edges overlap = {} # node overlap - target = 'FunMap' + target = "FunMap" cur_res = {} - target_node_set = set(networks.loc[networks['name'] == target, - 'nodes'].tolist()[0]) + target_node_set = set(networks.loc[networks["name"] == target, "nodes"].tolist()[0]) target_size = len(target_node_set) for _, row in networks.iterrows(): - if row['name'] == target: + if row["name"] == target: continue - cur_node_set = set(row['nodes']) + cur_node_set = set(row["nodes"]) cur_size = len(cur_node_set) overlap_size = len(target_node_set & cur_node_set) - cur_res[row['name']] = tuple([ - target_size - overlap_size, - cur_size - overlap_size, - overlap_size]) + cur_res[row["name"]] = tuple( + [target_size - overlap_size, cur_size - overlap_size, overlap_size] + ) - overlap['node'] = cur_res + overlap["node"] = cur_res # edge overlap cur_res = {} - target_edge_df = networks.loc[networks['name'] == target, 'edges'].tolist()[0] - target_edge_set = set(tuple(sorted(x)) for x in zip(target_edge_df.pop(0), - target_edge_df.pop(1))) + target_edge_df = networks.loc[networks["name"] == target, "edges"].tolist()[0] + target_edge_set = set( + tuple(sorted(x)) for x in zip(target_edge_df.pop(0), target_edge_df.pop(1)) + ) target_size = len(target_edge_set) for _, row in networks.iterrows(): - if row['name'] == target: + if row["name"] == target: continue - edge_df = row['edges'] + edge_df = row["edges"] edges = [tuple(sorted(x)) for x in zip(edge_df.pop(0), edge_df.pop(1))] cur_edge_set = set(edges) cur_size = len(cur_edge_set) overlap_size = len(target_edge_set & cur_edge_set) - cur_res[row['name']] = tuple([ - target_size - overlap_size, - cur_size - overlap_size, - overlap_size]) + cur_res[row["name"]] = tuple( + [target_size - overlap_size, cur_size - overlap_size, overlap_size] + ) - overlap['edge'] = cur_res + overlap["edge"] = cur_res return overlap + def cleanup_experiment(config_file): cfg = get_config(config_file) - results_dir = Path(cfg['results_dir']) + results_dir = Path(cfg["results_dir"]) shutil.rmtree(results_dir, ignore_errors=True) + def setup_experiment(config_file): cfg = get_config(config_file) - results_dir = Path(cfg['results_dir']) + results_dir = Path(cfg["results_dir"]) # create folders - folder_dict = cfg['subdirs'] - folders = [results_dir / Path(folder_dict[folder_name]) for folder_name in folder_dict] + folder_dict = cfg["subdirs"] + folders = [ + results_dir / Path(folder_dict[folder_name]) for folder_name in folder_dict + ] for folder in folders: 
folder.mkdir(parents=True, exist_ok=True) # save configuration to results folder - with open(str( results_dir / 'config.yml'), 'w') as fh: + with open(str(results_dir / "config.yml"), "w") as fh: yaml.dump(cfg, fh, sort_keys=False) return cfg + def get_config(cfg_file: Path): cfg = { - 'task': 'protein_func', - 'results_dir': 'results', + "task": "protein_func", + "results_dir": "results", # the following directories are relative to the results_dir - 'subdirs': { - 'saved_model_dir': 'saved_models', - 'saved_data_dir': 'saved_data', - 'saved_predictions_dir': 'saved_predictions', - 'figure_dir': 'figures', - 'network_dir': 'networks', + "subdirs": { + "saved_model_dir": "saved_models", + "saved_data_dir": "saved_data", + "saved_predictions_dir": "saved_predictions", + "figure_dir": "figures", + "network_dir": "networks", }, - 'seed': 42, - 'feature_type': 'cc', - 'test_size': 0.2, - 'ml_type': 'xgboost', - 'gs_file': None, - 'extra_feature_file': None, + "seed": 42, + "feature_type": "cc", + "test_size": 0.2, + "ml_type": "xgboost", + "gs_file": None, + "extra_feature_file": None, # 'filter_before_prediction': True, # 'min_feature_count': 1, - 'min_sample_count': 20, - 'filter_noncoding_genes': False, + "min_sample_count": 20, + "filter_noncoding_genes": False, # 'filter_after_prediction': True, # 'filter_criterion': 'max', # 'filter_threshold': 0.95, # 'filter_blacklist': False, - 'n_jobs': os.cpu_count(), - 'start_edge_num': 1000, - 'max_num_edges': 250000, - 'step_size': 1000, - 'lr_cutoff': 50, + "n_jobs": os.cpu_count(), + "start_edge_num": 1000, + "max_num_edges": 250000, + "step_size": 1000, + "lr_cutoff": 50, } - with open(cfg_file, 'r') as fh: + with open(cfg_file, "r") as fh: cfg_dict = yaml.load(fh, Loader=yaml.FullLoader) # use can change the following parameters in the config file - if 'task' in cfg_dict: - cfg['task'] = cfg_dict['task'] - assert cfg['task'] in ['protein_func', 'kinase_func'] + if "task" in cfg_dict: + cfg["task"] = cfg_dict["task"] + assert cfg["task"] in ["protein_func", "kinase_func"] - if 'seed' in cfg_dict: - cfg['seed'] = cfg_dict['seed'] + if "seed" in cfg_dict: + cfg["seed"] = cfg_dict["seed"] - if 'feature_type' in cfg_dict: - cfg['feature_type'] = cfg_dict['feature_type'] - assert cfg['feature_type'] in ['cc', 'mr'] + if "feature_type" in cfg_dict: + cfg["feature_type"] = cfg_dict["feature_type"] + assert cfg["feature_type"] in ["cc", "mr"] - if 'extra_feature_file' in cfg_dict: - cfg['extra_feature_file'] = cfg_dict['extra_feature_file'] + if "extra_feature_file" in cfg_dict: + cfg["extra_feature_file"] = cfg_dict["extra_feature_file"] - if 'gs_file' in cfg_dict: - cfg['gs_file'] = cfg_dict['gs_file'] + if "gs_file" in cfg_dict: + cfg["gs_file"] = cfg_dict["gs_file"] - if 'min_sample_count' in cfg_dict: - cfg['min_sample_count'] = cfg_dict['min_sample_count'] + if "min_sample_count" in cfg_dict: + cfg["min_sample_count"] = cfg_dict["min_sample_count"] - if 'n_jobs' in cfg_dict: - cfg['n_jobs'] = cfg_dict['n_jobs'] + if "n_jobs" in cfg_dict: + cfg["n_jobs"] = cfg_dict["n_jobs"] - if 'start_edge_num' in cfg_dict: - cfg['start_edge_num'] = cfg_dict['start_edge_num'] + if "start_edge_num" in cfg_dict: + cfg["start_edge_num"] = cfg_dict["start_edge_num"] - if 'max_num_edges' in cfg_dict: - cfg['max_num_edges'] = cfg_dict['max_num_edges'] + if "max_num_edges" in cfg_dict: + cfg["max_num_edges"] = cfg_dict["max_num_edges"] - if 'step_size' in cfg_dict: - cfg['step_size'] = cfg_dict['step_size'] + if "step_size" in cfg_dict: + cfg["step_size"] = 
cfg_dict["step_size"] - if 'lr_cutoff' in cfg_dict: - cfg['lr_cutoff'] = cfg_dict['lr_cutoff'] + if "lr_cutoff" in cfg_dict: + cfg["lr_cutoff"] = cfg_dict["lr_cutoff"] - if 'name' in cfg_dict: - cfg['name'] = cfg_dict['name'] + if "name" in cfg_dict: + cfg["name"] = cfg_dict["name"] else: - raise ValueError('name not specified in config file') + raise ValueError("name not specified in config file") - if 'data_path' in cfg_dict: - cfg['data_path'] = cfg_dict['data_path'] + if "data_path" in cfg_dict: + cfg["data_path"] = cfg_dict["data_path"] else: - raise ValueError('data_path not specified in config file') + raise ValueError("data_path not specified in config file") - if cfg['task'] == 'protein_func': - if 'filter_noncoding_genes' in cfg_dict: - cfg['filter_noncoding_genes'] = cfg_dict['filter_noncoding_genes'] + if cfg["task"] == "protein_func": + if "filter_noncoding_genes" in cfg_dict: + cfg["filter_noncoding_genes"] = cfg_dict["filter_noncoding_genes"] else: # ignore filter_noncoding_genes for kinase_func - cfg['filter_noncoding_genes'] = False - log.info('ignoring filter_noncoding_genes for kinase_func') + cfg["filter_noncoding_genes"] = False + log.info("ignoring filter_noncoding_genes for kinase_func") - if 'results_dir' in cfg_dict: - cfg['results_dir'] = cfg_dict['results_dir'] + '/' + cfg_dict['name'] + if "results_dir" in cfg_dict: + cfg["results_dir"] = cfg_dict["results_dir"] + "/" + cfg_dict["name"] - if 'data_files' not in cfg_dict: - raise ValueError('data_files not specified in config file') + if "data_files" not in cfg_dict: + raise ValueError("data_files not specified in config file") # Check all files listed under data_files are also in the tar.gz file - data_files = cfg_dict['data_files'] + data_files = cfg_dict["data_files"] # List all the files in the tar.gz file - with tarfile.open(cfg['data_path'], "r:gz") as tar: + with tarfile.open(cfg["data_path"], "r:gz") as tar: tar_files = {Path(file).name for file in tar.getnames()} # Check if all files in data_files are in tar_files - if not all(file['path'] in tar_files for file in data_files): - print('Files listed under data_files are not in the tar.gz file!') - raise ValueError('Files listed under data_files are not in the tar.gz file!') + if not all(file["path"] in tar_files for file in data_files): + print("Files listed under data_files are not in the tar.gz file!") + raise ValueError("Files listed under data_files are not in the tar.gz file!") - cfg['data_files'] = cfg_dict['data_files'] + cfg["data_files"] = cfg_dict["data_files"] - if 'rp_pairs' in cfg_dict: - cfg['rp_pairs'] = cfg_dict['rp_pairs'] + if "rp_pairs" in cfg_dict: + cfg["rp_pairs"] = cfg_dict["rp_pairs"] return cfg @@ -412,9 +431,9 @@ def check_gold_standard_file(file_path, min_count=10000): """ try: - with open(file_path, 'r', newline='') as tsv_file: + with open(file_path, "r", newline="") as tsv_file: dialect = csv.Sniffer().sniff(tsv_file.read(2048)) - if dialect.delimiter != '\t': + if dialect.delimiter != "\t": print("Error: Incorrect TSV format. 
TSV files should be tab-separated.") return False except csv.Error as e: @@ -423,12 +442,16 @@ def check_gold_standard_file(file_path, min_count=10000): # Check data format and Class values class_values = [] - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.reader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.reader(tsv_file, delimiter="\t") next(reader) # Skip header - for row_num, row in enumerate(reader, start=2): # Add row_num for better error reporting + for row_num, row in enumerate( + reader, start=2 + ): # Add row_num for better error reporting if len(row) != 3: # Assuming each row should have 3 columns - log.error(f'Invalid row format in row {row_num}. Each row should have 3 columns.') + log.error( + f"Invalid row format in row {row_num}. Each row should have 3 columns." + ) return False class_value = row[2].strip() if not class_value.isdigit() or int(class_value) not in (0, 1): @@ -442,14 +465,16 @@ def check_gold_standard_file(file_path, min_count=10000): lesser_count = min(count_0, count_1) if lesser_count < min_count: - log.error(f"The lesser of 0 and 1 occurrences ({lesser_count}) does not meet the threshold. " - f"Expected at least {min_count}.") + log.error( + f"The lesser of 0 and 1 occurrences ({lesser_count}) does not meet the threshold. " + f"Expected at least {min_count}." + ) return False return True -def check_extra_feature_file(file_path, missing_value='NA'): +def check_extra_feature_file(file_path, missing_value="NA"): """ Notes ----- @@ -465,9 +490,9 @@ def check_extra_feature_file(file_path, missing_value='NA'): The TSV file should be tab-separated. """ try: - with open(file_path, 'r', newline='') as tsv_file: + with open(file_path, "r", newline="") as tsv_file: dialect = csv.Sniffer().sniff(tsv_file.read(1024)) - if dialect.delimiter != '\t': + if dialect.delimiter != "\t": log.error("Incorrect TSV format. TSV files should be tab-separated.") return False except csv.Error: @@ -475,8 +500,8 @@ def check_extra_feature_file(file_path, missing_value='NA'): return False # Check header and number of columns - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.reader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.reader(tsv_file, delimiter="\t") header = next(reader, None) if header is None: log.error("The TSV file must have a header row.") @@ -488,12 +513,16 @@ def check_extra_feature_file(file_path, missing_value='NA'): return False # Check data type consistency for additional columns - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.DictReader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.DictReader(tsv_file, delimiter="\t") column_data_types = {} - for row_num, row in enumerate(reader, start=2): # Add row_num for better error reporting + for row_num, row in enumerate( + reader, start=2 + ): # Add row_num for better error reporting for column_name, value in row.items(): - if column_name not in header[:2]: # Skip the first two columns (Protein_1 and Protein_2) + if ( + column_name not in header[:2] + ): # Skip the first two columns (Protein_1 and Protein_2) if value == missing_value: continue try: @@ -501,18 +530,26 @@ def check_extra_feature_file(file_path, missing_value='NA'): if float_value.is_integer(): value = int(float_value) except ValueError: - log.error("Invalid data type in row %d, column '%s'. 
" - "The value '%s' should be either an integer, a float, or '%s' (missing value).", - row_num, column_name, value, missing_value) + log.error( + "Invalid data type in row %d, column '%s'. " + "The value '%s' should be either an integer, a float, or '%s' (missing value).", + row_num, + column_name, + value, + missing_value, + ) return False # Store the data type of each column (float or integer) - data_type = float if '.' in value else int + data_type = float if "." in value else int if column_name not in column_data_types: column_data_types[column_name] = data_type elif column_data_types[column_name] != data_type: - log.error("Inconsistent data type in column '%s'. " - "Expected a consistent data type (integer, float, or '%s') for all rows.", - column_name, missing_value) + log.error( + "Inconsistent data type in column '%s'. " + "Expected a consistent data type (integer, float, or '%s') for all rows.", + column_name, + missing_value, + ) return False return True From b22c47efd5deefba4e241c47595e292901d37171 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:32:06 -0500 Subject: [PATCH 10/54] fix extra_file path --- funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/utils.py b/funmap/utils.py index 9bebb567..cc00a09c 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -130,7 +130,7 @@ def get_data_dict(config, min_sample_count=15): if config["extra_feature_file"] is not None: log.info("Importing extra feature file") # TODO: Import extra feature file - extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df = pd.read_csv(config["extra_feature_file"], sep="\t") if config["filter_noncoding_genes"]: col1 = extra_feature_df.columns.values[0] col2 = extra_feature_df.columns.values[1] From 262d580e59957ee2b4552191ec3b615edae23fe1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:53:33 -0500 Subject: [PATCH 11/54] fix function names problem --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index d07f3369..f775d241 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -31,7 +31,7 @@ log = setup_logger(__name__) -def get_valid_gs_dat( +def get_valid_gs_data( gs_path: str, valid_gene_list: List[str], md5=None ): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') From dbc14796b24c01b86c1986ba7621680740a4640d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 12:11:39 -0500 Subject: [PATCH 12/54] Use df for merging extra features --- funmap/funmap.py | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index f775d241..88d2750d 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,37 +327,11 @@ def extract_features( axis=1, result_type="expand", ) - log.info(str(feature_df.head())) - log.info("Extra feature:\n" + str(extra_feature_df.head())) - old_feature_count = feature_df.shape[0] - - # Get gene indices for P1 and P2 - # df.reset_index(drop=True, inplace=True) - log.info("df head") - log.info(str(df.head())) - # gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} - # p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) - # p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) - # - # f_dataset = h5_file[dataset] - # f_values = 
np.empty(len(df), dtype=float) - # valid_indices = (p1_indices != -1) & (p2_indices != -1) - # - # linear_indices_func = ( - # lambda row_indices, col_indices, n: np.array(col_indices) - # - np.array(row_indices) - # + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - # ) - # linear_indices = linear_indices_func( - # p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) - # ) - feature_df = pd.merge( - feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + merged_df = pd.merge( + df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) - f_values[valid_indices] = f_dataset[:][linear_indices] - f_values[~valid_indices] = np.nan - log.info("Merged Data Frame:\n" + str(feature_df.head())) - log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") + merged_df = merged_df.drop(columns=["P1", "P2", "label"]) + feature_df = pd.concat([feature_df, merged_df], axis=1) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From 1fc6f15b5e8d8cbb733074cc119315a34b350f68 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 12:14:24 -0500 Subject: [PATCH 13/54] don't drop missing column --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 88d2750d..0fc6da97 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -330,7 +330,7 @@ def extract_features( merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) - merged_df = merged_df.drop(columns=["P1", "P2", "label"]) + merged_df = merged_df.drop(columns=["P1", "P2"]) feature_df = pd.concat([feature_df, merged_df], axis=1) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: From cfdf84d8a7128c3459f73ef8249ec7e358457ed4 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 13:18:52 -0500 Subject: [PATCH 14/54] diagnostic logging more diagnostics log final dataframe show nan rows track df size --- funmap/funmap.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 0fc6da97..2159feba 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,17 +327,38 @@ def extract_features( axis=1, result_type="expand", ) + df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) merged_df = merged_df.drop(columns=["P1", "P2"]) + log.info("Merged_df") + log.info(str(merged_df.head())) + non_nan_values_exist = merged_df.notna().any().any() + + if non_nan_values_exist: + print("There are non-NaN values in the merged_df.") + else: + print("All values in merged_df are NaN.") + + log.info("before merge") + log.info(str(feature_df.head())) + log.info(feature_df.size) feature_df = pd.concat([feature_df, merged_df], axis=1) + log.info("After merge") + log.info(str(feature_df.head())) + log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ [col for col in feature_df.columns if col != "label"] + ["label"] ] + nan_rows = feature_df[feature_df["label"].isna()] + print("NAN ROWS:") + print(nan_rows) + log.info("Final DF") + log.info(str(feature_df.head())) return feature_df From 62ba8b975208be4e09438c2d34baa83cb9d180be Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 14:48:30 -0500 Subject: [PATCH 15/54] Remove duplicate gene 
pairs in extra_features row logging log fixes --- funmap/funmap.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 2159feba..3360a6f5 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,6 +327,12 @@ def extract_features( axis=1, result_type="expand", ) + extra_feature_df = extra_feature_df[ + extra_feature_df.duplicated(subset=["P1", "P2"], keep="first") + ] + log.info( + f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + ) df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" @@ -346,6 +352,9 @@ def extract_features( log.info(feature_df.size) feature_df = pd.concat([feature_df, merged_df], axis=1) log.info("After merge") + log.info( + f"Row count for merge_df: {merge_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + ) log.info(str(feature_df.head())) log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists From 5691803951b8183e1c28654e3bde6c688031f615 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:05:15 -0500 Subject: [PATCH 16/54] use merge instead of concat --- funmap/funmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 3360a6f5..d6bc04ac 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -350,10 +350,10 @@ def extract_features( log.info("before merge") log.info(str(feature_df.head())) log.info(feature_df.size) - feature_df = pd.concat([feature_df, merged_df], axis=1) + feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) log.info("After merge") log.info( - f"Row count for merge_df: {merge_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + f"Row count for merge_df: {merged_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" ) log.info(str(feature_df.head())) log.info(feature_df.size) From 28b76b3221f2d922318fb9ac6274cf562169cb2f Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:23:33 -0500 Subject: [PATCH 17/54] drop duplicates more explicitly --- funmap/funmap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index d6bc04ac..9a5e9c6c 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,9 +327,9 @@ def extract_features( axis=1, result_type="expand", ) - extra_feature_df = extra_feature_df[ - extra_feature_df.duplicated(subset=["P1", "P2"], keep="first") - ] + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) log.info( f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" ) From e075c7519a2e6a81b0940b249002fcb8a0c9866b Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:31:36 -0500 Subject: [PATCH 18/54] remove extra logs --- funmap/funmap.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 9a5e9c6c..418aa8a3 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -315,9 +315,6 @@ def extract_features( if extra_feature is not None: log.info(f"Loading extra features from {extra_feature}") extra_feature_df = pd.read_csv(extra_feature, sep="\t") - log.info( - f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" - ) extra_feature_df.columns.values[0] = "P1" extra_feature_df.columns.values[1] = "P2" extra_feature_df[["P1", "P2"]] = 
extra_feature_df.apply( @@ -330,44 +327,17 @@ def extract_features( extra_feature_df = extra_feature_df.drop_duplicates( subset=["P1", "P2"], keep="last" ) - log.info( - f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" - ) df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) merged_df = merged_df.drop(columns=["P1", "P2"]) - log.info("Merged_df") - log.info(str(merged_df.head())) - non_nan_values_exist = merged_df.notna().any().any() - - if non_nan_values_exist: - print("There are non-NaN values in the merged_df.") - else: - print("All values in merged_df are NaN.") - - log.info("before merge") - log.info(str(feature_df.head())) - log.info(feature_df.size) feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) - log.info("After merge") - log.info( - f"Row count for merge_df: {merged_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" - ) - log.info(str(feature_df.head())) - log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ [col for col in feature_df.columns if col != "label"] + ["label"] ] - nan_rows = feature_df[feature_df["label"].isna()] - print("NAN ROWS:") - print(nan_rows) - - log.info("Final DF") - log.info(str(feature_df.head())) return feature_df From 49441acfda95bb14061a2c688c65427b55e6ebfc Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 16:11:42 -0500 Subject: [PATCH 19/54] Add LLR calculation and plotting for extra features --- funmap/funmap.py | 42 +++ funmap/plotting.py | 789 +++++++++++++++++++++++++++------------------ 2 files changed, 519 insertions(+), 312 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 418aa8a3..90076c9b 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -557,6 +557,12 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): return df +def extract_extra_features(all_pairs, ef_df): + # convert all_pairs to a dataframe + df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) + pd.merge(df, ef_df, on=["P1", "P2"], how="left") + + def dataset_llr( all_ids, feature_dict, @@ -566,6 +572,7 @@ def dataset_llr( max_num_edge, step_size, llr_dataset_file, + extra_feature, ): llr_ds = pd.DataFrame() all_ids_sorted = sorted(all_ids) @@ -605,6 +612,41 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info(f"Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + if extra_feature is not None: + log.info("Calculating LLR for extra features") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) + extra_feature_df = extract_extra_features( + all_pairs, extra_feature_df + ) # filter out unused pairs + features = extra_feature_df.columns.values[2:] + for f in features: + subset_df = extra_feature_df[["P1", "P2", f]] + subset_df.columns.values[-1] = "prediction" + log.info(f"Calculating llr for extra feature {f} ...") + cur_llr_res = compute_llr( + subset_df, + None, + start_edge_num, + max_num_edge, + step_size, + gs_test, + ) + cur_llr_res["dataset"] = f + "_EXTRAFEAT" + llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") return llr_ds diff --git a/funmap/plotting.py b/funmap/plotting.py index 3a46a970..6a01b74b 100644 --- a/funmap/plotting.py +++ b/funmap/plotting.py @@ -21,6 +21,7 @@ log = setup_logger(__name__) + def edge_number(x, pos): """ Formatter function to format the x-axis tick labels @@ -38,64 +39,84 @@ def edge_number(x, pos): The formatted string of the value. """ if x >= 1e6: - s = '{:1.1f}M'.format(x*1e-6) + s = "{:1.1f}M".format(x * 1e-6) elif x == 0: - s = '0' + s = "0" else: - s = '{:1.0f}K'.format(x*1e-3) + s = "{:1.0f}K".format(x * 1e-3) return s -def plot_llr_comparison(cfg, validation_results, llr_ds, output_file='llr_comparison.pdf'): - name_type_dict = {item['name']: item['type'] for item in cfg['data_files']} - name_type_dict['all_average'] = 'other' - datasets = sorted(llr_ds['dataset'].unique().tolist()) +def plot_llr_comparison( + cfg, validation_results, llr_ds, output_file="llr_comparison.pdf" +): + name_type_dict = {item["name"]: item["type"] for item in cfg["data_files"]} + name_type_dict["all_average"] = "other" + datasets = sorted(llr_ds["dataset"].unique().tolist()) fig, ax = plt.subplots(figsize=(20, 16)) for ds in datasets: start = -1 - cur_df = llr_ds[llr_ds['dataset'] == ds] - if name_type_dict[ds].upper() == 'RNA': - ltype = '--' - elif name_type_dict[ds].upper() == 'PROTEIN': - ltype = ':' + cur_df = llr_ds[llr_ds["dataset"] == ds] + if name_type_dict[ds].upper() == "RNA": + ltype = "--" + elif name_type_dict[ds].upper() == "PROTEIN": + ltype = ":" + elif ds.endswith("_EXTRAFEAT"): + ds = ds.replace("_EXTRAFEAT", "") + ltype = "-." 
else: - ltype = '-' - ax.plot(cur_df['k'], cur_df['llr'], linestyle=ltype, label=ds) + ltype = "-" + ax.plot(cur_df["k"], cur_df["llr"], linestyle=ltype, label=ds) if start == -1: - start = cur_df['k'].iloc[0] + start = cur_df["k"].iloc[0] # plot llr_res with the same start point for ft in validation_results: - llr_res = pd.read_csv(validation_results[ft]['llr_res_path'], sep='\t') - llr_res = llr_res[llr_res['k'] >= start] - ax.plot(llr_res['k'], llr_res['llr'], label=f'funmap_{ft}', linewidth=3) - - line_styles = [Line2D([0], [0], linestyle='--', color='black', label='RNA'), - Line2D([0], [0], linestyle=':', color='black', label='Protein'), - Line2D([0], [0], linestyle='-', color='black', label='Other')] - color_legend = ax.legend(handles=[Patch(color=line.get_color(), label=line.get_label()) for line in ax.lines], - bbox_to_anchor=(1.05, 1), title='Data type', fontsize=16, - bbox_transform=ax.transAxes) - linestyle_legend = ax.legend(handles=line_styles, - bbox_to_anchor=(1.05, 0.2), title='Model', fontsize=16, - bbox_transform=ax.transAxes) + llr_res = pd.read_csv(validation_results[ft]["llr_res_path"], sep="\t") + llr_res = llr_res[llr_res["k"] >= start] + ax.plot(llr_res["k"], llr_res["llr"], label=f"funmap_{ft}", linewidth=3) + + line_styles = [ + Line2D([0], [0], linestyle="--", color="black", label="RNA"), + Line2D([0], [0], linestyle=":", color="black", label="Protein"), + Line2D([0], [0], linestyle="-.", color="black", label="Extra Feature"), + Line2D([0], [0], linestyle="-", color="black", label="Other"), + ] + color_legend = ax.legend( + handles=[ + Patch(color=line.get_color(), label=line.get_label()) for line in ax.lines + ], + bbox_to_anchor=(1.05, 1), + title="Data type", + fontsize=16, + bbox_transform=ax.transAxes, + ) + linestyle_legend = ax.legend( + handles=line_styles, + bbox_to_anchor=(1.05, 0.2), + title="Model", + fontsize=16, + bbox_transform=ax.transAxes, + ) ax.add_artist(color_legend) ax.add_artist(linestyle_legend) ax.xaxis.set_major_formatter(edge_number) - ax.set_xlabel('number of pairs', fontsize=16) - ax.set_ylabel('log likelihood ratio', fontsize=16) - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax.set_xlabel("number of pairs", fontsize=16) + ax.set_ylabel("log likelihood ratio", fontsize=16) + ax.yaxis.grid(color="gainsboro", linestyle="dotted") plt.tight_layout() plt.box(on=None) - plt.savefig(output_file, bbox_inches='tight', bbox_extra_artists=[color_legend, linestyle_legend]) + plt.savefig( + output_file, + bbox_inches="tight", + bbox_extra_artists=[color_legend, linestyle_legend], + ) plt.close(fig) return output_file -def explore_data(data_config: Path, - min_sample_count: int, - output_dir: Path): +def explore_data(data_config: Path, min_sample_count: int, output_dir: Path): """ Generate plots to explore and visualize data @@ -122,9 +143,9 @@ def explore_data(data_config: Path, max_col_to_plot = 100 - log.info('Generating plots to explore and visualize data ...') + log.info("Generating plots to explore and visualize data ...") for ds in data_dict: - log.info(f'... {ds}') + log.info(f"... 
{ds}") data_df = data_dict[ds] fig, ax = plt.subplots(1, 2, figsize=(20, 5)) cur_data = data_df.T @@ -132,32 +153,34 @@ def explore_data(data_config: Path, if cur_data.shape[1] > max_col_to_plot: cur_data = cur_data.sample(max_col_to_plot, axis=1) ax[0].boxplot(cur_data) - ax[0].set_ylabel('expression') + ax[0].set_ylabel("expression") if data_df.shape[0] > max_col_to_plot: - ax[0].set_xlabel(f'random selected {max_col_to_plot} samples (total n={data_df.shape[0]})') + ax[0].set_xlabel( + f"random selected {max_col_to_plot} samples (total n={data_df.shape[0]})" + ) ax[0].set_xticklabels([]) ax[0].set_xticks([]) else: - ax[0].set_xlabel('sample') - ax[0].set_xticklabels(cur_data.columns, rotation=45, ha='right') + ax[0].set_xlabel("sample") + ax[0].set_xticklabels(cur_data.columns, rotation=45, ha="right") # density plot for each sample in each dataset for i in range(data_df.shape[0]): sns.kdeplot(data_df.iloc[i, :], linewidth=1, ax=ax[1]) - locator=MaxNLocator(60) + locator = MaxNLocator(60) ax[1].xaxis.set_major_locator(locator) - ax[1].set_xlabel('values') - ax[1].set_ylabel('density') + ax[1].set_xlabel("values") + ax[1].set_ylabel("density") ticks_loc = ax[1].get_xticks().tolist() ax[1].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha='right') + ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha="right") # set title for the figu - fig.suptitle(f'{ds}', fontsize=16) + fig.suptitle(f"{ds}", fontsize=16) fig.tight_layout() - cur_file_name = f'{ds}_sample_plot.pdf' + cur_file_name = f"{ds}_sample_plot.pdf" fig_names.append(cur_file_name) - log.info(f'Saving figure {cur_file_name} ...') + log.info(f"Saving figure {cur_file_name} ...") fig.savefig(output_dir / cur_file_name) plt.close(fig) data_keys.append(ds) @@ -166,52 +189,58 @@ def explore_data(data_config: Path, fig, ax = plt.subplots(figsize=(10, 5)) ax.boxplot(data) ax.set_xticklabels(data_keys, rotation=45) - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax.yaxis.grid(color="gainsboro", linestyle="dotted") - ax.set_xlabel('dataset') - ax.set_ylabel('median expression') + ax.set_xlabel("dataset") + ax.set_ylabel("median expression") plt.box(on=None) fig.tight_layout() - file_name = 'data_box_plot.pdf' + file_name = "data_box_plot.pdf" fig_names.append(file_name) - log.info(f'Saving figure {file_name} ...') + log.info(f"Saving figure {file_name} ...") fig.savefig(output_dir / file_name) plt.close(fig) # boxplot of the number of samples and genes sample_count = pd.DataFrame( - {'count':[data_dict[ds].shape[0] for ds in data_dict], - 'dataset': [ds for ds in data_dict]}) - gene_count = pd.DataFrame({ - 'count':[data_dict[ds].shape[1] for ds in data_dict], - 'dataset': [ds for ds in data_dict]}) + { + "count": [data_dict[ds].shape[0] for ds in data_dict], + "dataset": [ds for ds in data_dict], + } + ) + gene_count = pd.DataFrame( + { + "count": [data_dict[ds].shape[1] for ds in data_dict], + "dataset": [ds for ds in data_dict], + } + ) fig, ax = plt.subplots(1, 2, figsize=(10, 5)) - bars0 = ax[0].barh(sample_count['dataset'], sample_count['count'], color='#774FA0') - bars1 = ax[1].barh(gene_count['dataset'], gene_count['count'], color='#7DC462') + bars0 = ax[0].barh(sample_count["dataset"], sample_count["count"], color="#774FA0") + bars1 = ax[1].barh(gene_count["dataset"], gene_count["count"], color="#7DC462") - ax[0].spines['top'].set_visible(False) - ax[0].spines['left'].set_visible(False) - ax[0].spines['right'].set_visible(False) - 
ax[0].tick_params(axis='both', which='major', labelsize=12) - ax[0].bar_label(bars0, label_type='edge', fontsize=10) - ax[0].set_xlabel('number of samples') + ax[0].spines["top"].set_visible(False) + ax[0].spines["left"].set_visible(False) + ax[0].spines["right"].set_visible(False) + ax[0].tick_params(axis="both", which="major", labelsize=12) + ax[0].bar_label(bars0, label_type="edge", fontsize=10) + ax[0].set_xlabel("number of samples") - ax[1].spines['top'].set_visible(False) - ax[1].spines['left'].set_visible(False) - ax[1].spines['right'].set_visible(False) + ax[1].spines["top"].set_visible(False) + ax[1].spines["left"].set_visible(False) + ax[1].spines["right"].set_visible(False) ax[1].set_yticklabels([]) - ax[1].tick_params(axis='x', which='major', labelsize=12) - ax[1].tick_params(axis='y', which='both', left=False, right=False, labelleft=False) + ax[1].tick_params(axis="x", which="major", labelsize=12) + ax[1].tick_params(axis="y", which="both", left=False, right=False, labelleft=False) - ax[1].bar_label(bars1, label_type='edge', fontsize=10) - ax[1].set_xlabel('number of genes') + ax[1].bar_label(bars1, label_type="edge", fontsize=10) + ax[1].set_xlabel("number of genes") fig.tight_layout() - file_name = 'sample_gene_count.pdf' + file_name = "sample_gene_count.pdf" fig.savefig(output_dir / file_name) - log.info(f'Saving figure {file_name} ...') + log.info(f"Saving figure {file_name} ...") plt.close(fig) fig_names.append(file_name) return fig_names @@ -219,50 +248,66 @@ def explore_data(data_config: Path, def plot_results(cfg, validation_results, llr_ds, gs_dict, cutoff_llr, figure_dir): fig_names = [] - file_name = 'llr_compare_dataset.pdf' - plot_llr_comparison(cfg, validation_results, llr_ds, output_file=figure_dir / file_name) + file_name = "llr_compare_dataset.pdf" + plot_llr_comparison( + cfg, validation_results, llr_ds, output_file=figure_dir / file_name + ) fig_names.append(file_name) - if 'rp_pairs' in cfg: - file_names = plot_pair_llr(gs_dict, cfg['feature_type'], output_dir=figure_dir, - rp_pairs=cfg['rp_pairs']) + if "rp_pairs" in cfg: + file_names = plot_pair_llr( + gs_dict, + cfg["feature_type"], + output_dir=figure_dir, + rp_pairs=cfg["rp_pairs"], + ) fig_names.extend(file_names) - if cfg['task'] == 'protein_func': - file_name = 'llr_compare_networks.pdf' + if cfg["task"] == "protein_func": + file_name = "llr_compare_networks.pdf" # if gs_file is not specified, plot the llr comparison between networks # because some the numbers are pre-computed based on default gs_file - if cfg['gs_file'] is None: - plot_llr_compare_networks(validation_results, cfg['lr_cutoff'], cutoff_llr, - output_file=figure_dir / file_name) + if cfg["gs_file"] is None: + plot_llr_compare_networks( + validation_results, + cfg["lr_cutoff"], + cutoff_llr, + output_file=figure_dir / file_name, + ) fig_names.append(file_name) # the information about other networks is fixed for now other_network_info = { - 'name': ['BioGRID', 'BioPlex', 'HI-union', 'STRING'], - 'type': ['BioGRID', 'BioPlex', 'HI', 'STRING'], - 'url': ['https://figshare.com/ndownloader/files/39125054', - 'https://figshare.com/ndownloader/files/39125051', - 'https://figshare.com/ndownloader/files/39125093', - 'https://figshare.com/ndownloader/files/39125090' - ] + "name": ["BioGRID", "BioPlex", "HI-union", "STRING"], + "type": ["BioGRID", "BioPlex", "HI", "STRING"], + "url": [ + "https://figshare.com/ndownloader/files/39125054", + "https://figshare.com/ndownloader/files/39125051", + "https://figshare.com/ndownloader/files/39125093", + 
"https://figshare.com/ndownloader/files/39125090", + ], } # convert the info to a data frame where the url is read as a dataframe network_info = pd.DataFrame(other_network_info) - network_info['el'] = network_info['url'].apply(lambda x: pd.read_csv(x, - sep='\t', header=None)) - network_info = network_info.drop(columns=['url']) + network_info["el"] = network_info["url"].apply( + lambda x: pd.read_csv(x, sep="\t", header=None) + ) + network_info = network_info.drop(columns=["url"]) # for each funmap, create a dataframe for ft in validation_results: - edge_file_path = validation_results[ft]['edge_list_path'] - funmap_el = pd.read_csv(edge_file_path, sep='\t', header=None) - funmap_df = pd.DataFrame({'name': ['FunMap'], 'type': ['FunMap'], 'el': [funmap_el]}) + edge_file_path = validation_results[ft]["edge_list_path"] + funmap_el = pd.read_csv(edge_file_path, sep="\t", header=None) + funmap_df = pd.DataFrame( + {"name": ["FunMap"], "type": ["FunMap"], "el": [funmap_el]} + ) all_network_info = pd.concat([network_info, funmap_df], ignore_index=True) overlap_info = get_node_edge_overlap(all_network_info) - node_color, edge_color = '#7DC462', '#774FA0' - for (type, color) in zip(['node', 'edge'], [node_color, edge_color]): - fig_name = plot_overlap_venn(f'funmap_{ft}', overlap_info[type], type, color, figure_dir) + node_color, edge_color = "#7DC462", "#774FA0" + for type, color in zip(["node", "edge"], [node_color, edge_color]): + fig_name = plot_overlap_venn( + f"funmap_{ft}", overlap_info[type], type, color, figure_dir + ) fig_names.append(fig_name) fig_name = plot_network_stats(all_network_info, ft, figure_dir) @@ -297,34 +342,58 @@ def plot_1d_llr(ax, feature_df, feature_name, feature_type, data_type, n_bins): df = feature_df.loc[:, [feature_name]] cur_df = df.dropna() cur_df_vals = cur_df.values.reshape(-1) - clr = '#bcbddc' - data_range = {'CC': (-1, 1), 'MR': (0, 1)} - if data_type == 'PRO': - ax.hist(cur_df_vals, bins=n_bins, range=data_range[feature_type], color=clr, - orientation='horizontal', - density=True) - ax.text(0.95, 0.95, data_type, - verticalalignment='top', horizontalalignment='right', + clr = "#bcbddc" + data_range = {"CC": (-1, 1), "MR": (0, 1)} + if data_type == "PRO": + ax.hist( + cur_df_vals, + bins=n_bins, + range=data_range[feature_type], + color=clr, + orientation="horizontal", + density=True, + ) + ax.text( + 0.95, + 0.95, + data_type, + verticalalignment="top", + horizontalalignment="right", transform=ax.transAxes, rotation=-90, - color='black', fontsize=16) + color="black", + fontsize=16, + ) ax.set_ylim(data_range[feature_type]) - ax.set_xlim(0,2.5) + ax.set_xlim(0, 2.5) else: - ax.hist(cur_df_vals, bins=n_bins, range=data_range[feature_type], color=clr, - density=True) - ax.text(0.02, 0.9, data_type, - verticalalignment='top', horizontalalignment='left', + ax.hist( + cur_df_vals, + bins=n_bins, + range=data_range[feature_type], + color=clr, + density=True, + ) + ax.text( + 0.02, + 0.9, + data_type, + verticalalignment="top", + horizontalalignment="left", transform=ax.transAxes, - color='black', fontsize=16) - ax.set_ylabel('density', fontsize=16) + color="black", + fontsize=16, + ) + ax.set_ylabel("density", fontsize=16) ax.set_xlim(data_range[feature_type]) - ax.set_ylim(0,2.5) - ax.tick_params(axis='x', labelsize=16) - ax.tick_params(axis='y', labelsize=16) + ax.set_ylim(0, 2.5) + ax.tick_params(axis="x", labelsize=16) + ax.tick_params(axis="y", labelsize=16) -def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_feature, n_bins): +def 
plot_2d_llr( + ax, feature_df, feature_type, pair_name, rna_feature, pro_feature, n_bins +): """ Plots a 2D log likelihood ratio between two features in a scatter plot. @@ -351,48 +420,54 @@ def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_featur The mesh plot of the log likelihood ratio. """ - data_types = ['RNA', 'PRO'] - label_mapping = {'PRO': 'Protein', 'RNA': 'mRNA'} - feature_label_mapping = {'CC': 'correlation coefficient', - 'MR': 'mutual rank'} + data_types = ["RNA", "PRO"] + label_mapping = {"PRO": "Protein", "RNA": "mRNA"} + feature_label_mapping = {"CC": "correlation coefficient", "MR": "mutual rank"} cnt = {} cnt_pos_neg = {} max_density = -1 - data_range = {'CC': (-1, 1), 'MR': (0, 1)} + data_range = {"CC": (-1, 1), "MR": (0, 1)} for label in [0, 1]: - df = feature_df.loc[:, ['label', rna_feature, pro_feature]] + df = feature_df.loc[:, ["label", rna_feature, pro_feature]] df = df.dropna() - cur_df = df.loc[df['label'] == label, [rna_feature, pro_feature]] - hist_density, _, _ = np.histogram2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]]), - density=True) + cur_df = df.loc[df["label"] == label, [rna_feature, pro_feature]] + hist_density, _, _ = np.histogram2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + density=True, + ) max_density = max(max_density, np.max(hist_density)) for label in [0, 1]: - df = feature_df.loc[:, ['label', rna_feature, pro_feature]] + df = feature_df.loc[:, ["label", rna_feature, pro_feature]] df = df.dropna() - cur_df = df.loc[df['label'] == label, [rna_feature, pro_feature]] + cur_df = df.loc[df["label"] == label, [rna_feature, pro_feature]] cnt[label] = cur_df.shape[0] - hist, _, _ = np.histogram2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]])) + hist, _, _ = np.histogram2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + ) cnt_pos_neg[label] = hist - hh = ax.hist2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, - bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]]), - vmin=0, - vmax=max_density, - density=True) - - llr_vals = ((cnt_pos_neg[1] + 1)/(cnt_pos_neg[0] + cnt[0]/cnt[1]))/(cnt[1]/cnt[0]) - if feature_type == 'MR': + hh = ax.hist2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + vmin=0, + vmax=max_density, + density=True, + ) + + llr_vals = ((cnt_pos_neg[1] + 1) / (cnt_pos_neg[0] + cnt[0] / cnt[1])) / ( + cnt[1] / cnt[0] + ) + if feature_type == "MR": vmin = np.percentile(llr_vals, 5) vmax = np.percentile(llr_vals, 95) symmetric_max = max(abs(vmin), abs(vmax)) @@ -402,50 +477,74 @@ def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_featur vmin = -4 vmax = 4 cmap = plt.cm.RdBu_r - fig = ax.pcolormesh(hh[1], hh[2], np.transpose(np.log(llr_vals)), vmin=vmin, vmax=vmax, - cmap=cmap) - ax.text(0.02, 0.01, pair_name, - verticalalignment='bottom', horizontalalignment='left', + fig = ax.pcolormesh( + hh[1], hh[2], np.transpose(np.log(llr_vals)), vmin=vmin, vmax=vmax, cmap=cmap + ) + ax.text( + 0.02, + 0.01, + pair_name, + verticalalignment="bottom", + 
horizontalalignment="left", transform=ax.transAxes, - color='gray', fontsize=24, fontweight='bold') - ax.set_xlabel(f'{label_mapping[data_types[0]]}\n{feature_label_mapping[feature_type]}', fontsize=16) - ax.set_ylabel(f'{label_mapping[data_types[1]]}\n{feature_label_mapping[feature_type]}', fontsize=16) - if feature_type == 'CC': + color="gray", + fontsize=24, + fontweight="bold", + ) + ax.set_xlabel( + f"{label_mapping[data_types[0]]}\n{feature_label_mapping[feature_type]}", + fontsize=16, + ) + ax.set_ylabel( + f"{label_mapping[data_types[1]]}\n{feature_label_mapping[feature_type]}", + fontsize=16, + ) + if feature_type == "CC": ax.set_xticks([-1, -0.5, 0, 0.5, 1]) ax.set_yticks([-1, -0.5, 0, 0.5, 1]) else: ax.set_xticks([0, 0.25, 0.5, 0.75, 1]) ax.set_yticks([0, 0.25, 0.5, 0.75, 1]) - ax.tick_params(axis='x', labelsize=16) - ax.tick_params(axis='y', labelsize=16) + ax.tick_params(axis="x", labelsize=16) + ax.tick_params(axis="y", labelsize=16) return fig def plot_pair_llr(gs_dict, feature_type, output_dir, rp_pairs): n_bins = 20 file_names = [] - feature_type = ['CC', 'MR'] + feature_type = ["CC", "MR"] # plot for each rna-protein pair using CC or MR as feature for rp_pair in rp_pairs: for ft in feature_type: ft = ft.upper() feature_df = gs_dict[ft] - fig, ax = plt.subplots(2, 2, figsize=(10, 10), - gridspec_kw={'width_ratios': [4, 1], - 'height_ratios': [1, 4]}) - rna_feature = rp_pair['rna'] - pro_feature = rp_pair['protein'] - plot_1d_llr(ax[0,0], feature_df, rna_feature, ft, 'RNA', n_bins) - ax[0, 0].xaxis.set_ticks_position('none') + fig, ax = plt.subplots( + 2, + 2, + figsize=(10, 10), + gridspec_kw={"width_ratios": [4, 1], "height_ratios": [1, 4]}, + ) + rna_feature = rp_pair["rna"] + pro_feature = rp_pair["protein"] + plot_1d_llr(ax[0, 0], feature_df, rna_feature, ft, "RNA", n_bins) + ax[0, 0].xaxis.set_ticks_position("none") ax[0, 0].set_xticklabels([]) - ax[0,1].axis('off') + ax[0, 1].axis("off") - heatmap2d = plot_2d_llr(ax[1,0], feature_df, ft, rp_pair['name'], - rna_feature, pro_feature, n_bins) + heatmap2d = plot_2d_llr( + ax[1, 0], + feature_df, + ft, + rp_pair["name"], + rna_feature, + pro_feature, + n_bins, + ) - plot_1d_llr(ax[1, 1], feature_df, pro_feature, ft, 'PRO', n_bins) - ax[1, 1].yaxis.set_ticks_position('none') + plot_1d_llr(ax[1, 1], feature_df, pro_feature, ft, "PRO", n_bins) + ax[1, 1].yaxis.set_ticks_position("none") ax[1, 1].set_yticklabels([]) # add colorbar to the right of the plot @@ -456,7 +555,7 @@ def plot_pair_llr(gs_dict, feature_type, output_dir, rp_pairs): plt.box(on=None) file_name = f"{rp_pair['name']}_rna_pro_{ft}_llr.pdf" file_names.append(file_name) - plt.savefig(output_dir / file_name, bbox_inches='tight') + plt.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_names @@ -466,98 +565,143 @@ def plot_llr_compare_networks(validaton_results, cutoff, cutoff_llr, output_file all_networks = [] for ft in validaton_results: - edge_list = pd.read_csv(validaton_results[ft]['edge_list_path'], sep='\t', header=None) + edge_list = pd.read_csv( + validaton_results[ft]["edge_list_path"], sep="\t", header=None + ) n_edge = len(edge_list) n_node = len(set(edge_list.iloc[:, 0]) | set(edge_list.iloc[:, 1])) - all_networks.extend([ - (f'FunMap_{ft}', 'FunMap', n_node, n_edge, cutoff_llr[ft], np.exp(cutoff_llr[ft])) - ]) + all_networks.extend( + [ + ( + f"FunMap_{ft}", + "FunMap", + n_node, + n_edge, + cutoff_llr[ft], + np.exp(cutoff_llr[ft]), + ) + ] + ) # these are pre-computed values all_networks.extend( - [ - # name, type, n, 
e, llr, lr - ('HuRI', 'HI', 8272, 52548, 2.3139014130648827, 10.11), - ('HI-union', 'HI', 9094, 64006, 2.298975841813893, 9.96), - ('ProHD', 'ProHD', 2680, 61580, 4.039348296, 56.78), - # this is combined_score_700 - ('STRING', 'STRING', 16351, 240314, 5.229377563059293, 186.676572955849), - ('BioGRID', 'BioGRID', 17259, 654490, 2.6524642147396182, 14.18896024552041), - ('BioPlex', 'BioPlex', 13854, 154428, 3.3329858940888375, 28.021887299660047) - ] + [ + # name, type, n, e, llr, lr + ("HuRI", "HI", 8272, 52548, 2.3139014130648827, 10.11), + ("HI-union", "HI", 9094, 64006, 2.298975841813893, 9.96), + ("ProHD", "ProHD", 2680, 61580, 4.039348296, 56.78), + # this is combined_score_700 + ("STRING", "STRING", 16351, 240314, 5.229377563059293, 186.676572955849), + ( + "BioGRID", + "BioGRID", + 17259, + 654490, + 2.6524642147396182, + 14.18896024552041, + ), + ( + "BioPlex", + "BioPlex", + 13854, + 154428, + 3.3329858940888375, + 28.021887299660047, + ), + ] ) log.info(all_networks) - cols = ['name', 'group', 'n', 'e', 'llr', 'lr'] + cols = ["name", "group", "n", "e", "llr", "lr"] network_data = pd.DataFrame(all_networks, columns=cols) - x = np.array(network_data['n']) - y = np.array(network_data['llr']) - e = np.array(network_data['e']) - z = network_data['name'] + x = np.array(network_data["n"]) + y = np.array(network_data["llr"]) + e = np.array(network_data["e"]) + z = network_data["name"] fig, ax = plt.subplots(figsize=(10, 10)) ax.set_axisbelow(True) - ax.xaxis.grid(color = 'gainsboro', linestyle = 'dotted') - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') - ax.get_ygridlines()[4].set_color('salmon') - ax.get_yticklabels()[4].set_color('red') + ax.xaxis.grid(color="gainsboro", linestyle="dotted") + ax.yaxis.grid(color="gainsboro", linestyle="dotted") + ax.get_ygridlines()[4].set_color("salmon") + ax.get_yticklabels()[4].set_color("red") ax2 = ax.twinx() # we have 6 groups, so we need 6 colors - mycmap = matplotlib.colors.ListedColormap(['#de2d26', '#8B6CAF', '#0D95D0', - '#69A953', '#F1C36B', '#DC6C43']) + mycmap = matplotlib.colors.ListedColormap( + ["#de2d26", "#8B6CAF", "#0D95D0", "#69A953", "#F1C36B", "#DC6C43"] + ) # group 0 is FunMap, group 1 is HI, group 2 is ProHD, group 3 is STRING, # group 4 is BioGRID, group 5 is BioPlex # the length of gro - color_group = [0] * len(validaton_results) + [1] * 2 + [2] * 1 + [3] * 1 + [4] * 1 + [5] * 1 - scatter = ax.scatter(x, y, c=color_group, cmap=mycmap, - s=e/1000) + color_group = ( + [0] * len(validaton_results) + [1] * 2 + [2] * 1 + [3] * 1 + [4] * 1 + [5] * 1 + ) + scatter = ax.scatter(x, y, c=color_group, cmap=mycmap, s=e / 1000) ax.set_ylim(2, 6) ax2.set_ylim(np.exp(2.0), np.exp(6)) - ax.set_xlabel('number of genes') + ax.set_xlabel("number of genes") ax.set_yticks([2.0, 2.5, 3, 3.5, np.log(cutoff), 4, 4.5, 5, 5.5, 6]) - ax.set_ylabel('log likelihood ratio') - ax.spines['top'].set_visible(False) - ax.spines['left'].set_visible(False) - ax.spines['right'].set_visible(False) - ax2.set_ylabel('likelihood ratio') - ax2.set_yscale('log', base=np.e) - ax2.set_yticks([np.exp(2), np.exp(2.5), np.exp(3), np.exp(3.5), - cutoff, np.exp(4), np.exp(4.5), np.exp(5), - np.exp(5.5), np.exp(6)]) - ax.tick_params(axis='x', labelsize=12) - ax.tick_params(axis='y', labelsize=12) - ax2.tick_params(axis='y', labelsize=12) - ax2.get_yticklabels()[4].set_color('red') + ax.set_ylabel("log likelihood ratio") + ax.spines["top"].set_visible(False) + ax.spines["left"].set_visible(False) + ax.spines["right"].set_visible(False) + ax2.set_ylabel("likelihood 
ratio") + ax2.set_yscale("log", base=np.e) + ax2.set_yticks( + [ + np.exp(2), + np.exp(2.5), + np.exp(3), + np.exp(3.5), + cutoff, + np.exp(4), + np.exp(4.5), + np.exp(5), + np.exp(5.5), + np.exp(6), + ] + ) + ax.tick_params(axis="x", labelsize=12) + ax.tick_params(axis="y", labelsize=12) + ax2.tick_params(axis="y", labelsize=12) + ax2.get_yticklabels()[4].set_color("red") ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - ax2.spines['top'].set_visible(False) - ax2.spines['left'].set_visible(False) - ax2.spines['right'].set_visible(False) + ax2.spines["top"].set_visible(False) + ax2.spines["left"].set_visible(False) + ax2.spines["right"].set_visible(False) - handles1, labels1 = scatter.legend_elements(prop='sizes', num=4, alpha=1, - fmt='{x:.0f} K') - legend1 = ax.legend(handles1, labels1, - loc='upper left', labelspacing=1.8, borderpad=1.0, - title='number of pairs', frameon=True) + handles1, labels1 = scatter.legend_elements( + prop="sizes", num=4, alpha=1, fmt="{x:.0f} K" + ) + legend1 = ax.legend( + handles1, + labels1, + loc="upper left", + labelspacing=1.8, + borderpad=1.0, + title="number of pairs", + frameon=True, + ) ax.add_artist(legend1) leg = ax.get_legend() for i in range(len(leg.legendHandles)): - leg.legendHandles[i].set_color('gray') + leg.legendHandles[i].set_color("gray") ax.xaxis.set_major_formatter(edge_number) for i, txt in enumerate(z): - if txt == 'STRING': - ax.annotate(txt, (x[i]-500, y[i]+0.18), color='gray', fontsize=10) - elif txt == 'HI-union': - ax.annotate(txt, (x[i]-100, y[i]+0.1), color='gray', fontsize=10) - elif txt == 'BioGRID': - ax.annotate(txt, (x[i]-800, y[i]-0.25), color='gray', fontsize=10) + if txt == "STRING": + ax.annotate(txt, (x[i] - 500, y[i] + 0.18), color="gray", fontsize=10) + elif txt == "HI-union": + ax.annotate(txt, (x[i] - 100, y[i] + 0.1), color="gray", fontsize=10) + elif txt == "BioGRID": + ax.annotate(txt, (x[i] - 800, y[i] - 0.25), color="gray", fontsize=10) else: - ax.annotate(txt, (x[i]-100, y[i]-0.2), color='gray', fontsize=10) + ax.annotate(txt, (x[i] - 100, y[i] - 0.2), color="gray", fontsize=10) fig.tight_layout() - fig.savefig(output_file, bbox_inches='tight') + fig.savefig(output_file, bbox_inches="tight") plt.close(fig) @@ -593,25 +737,29 @@ def plot_overlap_venn(network_name, overlap, node_or_edge, color, output_dir): max_area = max(map(sum, data)) def set_venn_scale(ax, true_area, reference_area=max_area): - s = np.sqrt(float(reference_area)/true_area) + s = np.sqrt(float(reference_area) / true_area) ax.set_xlim(-s, s) ax.set_ylim(-s, s) all_axes = [] n_plot = len(overlap) - fig, ax = plt.subplots(1, n_plot, figsize=(5*n_plot, 5)) + fig, ax = plt.subplots(1, n_plot, figsize=(5 * n_plot, 5)) for i, nw in enumerate(overlap): cur_ax = ax[i] all_axes.append(cur_ax) - labels = ('FunMap', nw) - out = venn2(overlap[nw], - set_labels=labels, alpha=1.0, - ax=cur_ax, set_colors=[color, 'white']) - venn2_circles(overlap[nw], ax=cur_ax, linestyle='solid', - color='gray', - linewidth=1) + labels = ("FunMap", nw) + out = venn2( + overlap[nw], + set_labels=labels, + alpha=1.0, + ax=cur_ax, + set_colors=[color, "white"], + ) + venn2_circles( + overlap[nw], ax=cur_ax, linestyle="solid", color="gray", linewidth=1 + ) if out.set_labels: for text in out.set_labels: text.set_fontsize(12) @@ -620,14 +768,14 @@ def set_venn_scale(ax, true_area, reference_area=max_area): text.set_fontsize(10) # add title to the figure - name = 'genes' if node_or_edge == 'node' else 'edges' - fig.suptitle(f'Overlap of {name} 
({network_name})', fontsize=16) + name = "genes" if node_or_edge == "node" else "edges" + fig.suptitle(f"Overlap of {name} ({network_name})", fontsize=16) for a, d in zip(flatten(ax), data): - set_venn_scale(a, sum(d)*1.5) + set_venn_scale(a, sum(d) * 1.5) - file_name = f'{network_name}_overlap_{node_or_edge}.pdf' - fig.savefig(output_dir / file_name, bbox_inches='tight') + file_name = f"{network_name}_overlap_{node_or_edge}.pdf" + fig.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_name @@ -638,17 +786,17 @@ def plot_network_stats(network_info, feature_type, output_dir): average_shortest_path = {} # these are pre-calculated since they take a long time to compute and # the network is fixed - average_shortest_path ={ - 'BioGRID': 2.74, - 'BioPlex': 3.60, - 'HI-union': 3.70, - 'STRING': 3.95 + average_shortest_path = { + "BioGRID": 2.74, + "BioPlex": 3.60, + "HI-union": 3.70, + "STRING": 3.95, } # if you want to recompute the average shortest path length, # add the network name to this list - network_list = ['FunMap'] + network_list = ["FunMap"] for n in network_list: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_density = nx.density(cur_network) density[n] = cur_density @@ -657,94 +805,111 @@ def plot_network_stats(network_info, feature_type, output_dir): cur_average_shortest_path = nx.average_shortest_path_length(cur_cc) average_shortest_path[n] = cur_average_shortest_path cur_degrees = [val for (_, val) in cur_network.degree()] - if n == 'FunMap': # only fit for FunMap + if n == "FunMap": # only fit for FunMap with warnings.catch_warnings(): warnings.simplefilter("ignore") - fit = powerlaw.Fit(cur_degrees, discrete=True, xmax=250, estimate_discrete=False) - powerlaw.plot_pdf(cur_degrees, linear_bins=True, linestyle='None', marker='o', - markerfacecolor='None', color='#de2d26', - linewidth=3, ax=ax[0]) + fit = powerlaw.Fit( + cur_degrees, discrete=True, xmax=250, estimate_discrete=False + ) + powerlaw.plot_pdf( + cur_degrees, + linear_bins=True, + linestyle="None", + marker="o", + markerfacecolor="None", + color="#de2d26", + linewidth=3, + ax=ax[0], + ) # not plotting the power law fit # fit.power_law.plot_pdf(linestyle='--',color='black', ax=ax[0]) # all the networks in network_info minus FunMap - other_networks = list(set(network_info['name'].tolist()) - set(['FunMap'])) + other_networks = list(set(network_info["name"].tolist()) - set(["FunMap"])) for n in other_networks: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_density = nx.density(cur_network) density[n] = cur_density - ax[0].set_xlabel('degree') - ax[0].set_ylabel('p(x)') - ax[0].spines['top'].set_visible(False) - ax[0].spines['right'].set_visible(False) - ax[0].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[0].set_xlabel("degree") + ax[0].set_ylabel("p(x)") + ax[0].spines["top"].set_visible(False) + ax[0].spines["right"].set_visible(False) + ax[0].yaxis.grid(color="gainsboro", linestyle="dotted") ax[0].set_axisbelow(True) # global average clustering coefficient # these are pre-calculated since they take a long time to compute and # the network is fixed - avg_cc = { - 'BioGRID': 0.125, - 'BioPlex': 0.103, - 'HI-union': 0.06, - 'STRING': 
0.335 - } + avg_cc = {"BioGRID": 0.125, "BioPlex": 0.103, "HI-union": 0.06, "STRING": 0.335} for n in network_list: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_cc = nx.average_clustering(cur_network) avg_cc[n] = cur_cc - network_list = network_info['name'].tolist() - ax[1].bar(network_list, [avg_cc[i] for i in network_list], width=0.5, align='center', - color='#E4C89A') - ax[1].spines['left'].set_position(('outward', 8)) - ax[1].spines['bottom'].set_position(('outward', 5)) - ax[1].spines['top'].set_visible(False) - ax[1].spines['left'].set_visible(False) - ax[1].spines['right'].set_visible(False) - ax[1].set_ylabel('Average clustering coefficient') + network_list = network_info["name"].tolist() + ax[1].bar( + network_list, + [avg_cc[i] for i in network_list], + width=0.5, + align="center", + color="#E4C89A", + ) + ax[1].spines["left"].set_position(("outward", 8)) + ax[1].spines["bottom"].set_position(("outward", 5)) + ax[1].spines["top"].set_visible(False) + ax[1].spines["left"].set_visible(False) + ax[1].spines["right"].set_visible(False) + ax[1].set_ylabel("Average clustering coefficient") ticks_loc = ax[1].get_xticks() ax[1].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[1].set_xticklabels(network_list, rotation=45, ha='right') - ax[1].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[1].set_xticklabels(network_list, rotation=45, ha="right") + ax[1].yaxis.grid(color="gainsboro", linestyle="dotted") ax[1].set_axisbelow(True) - ax[2].bar(network_list, [density[i] for i in network_list], width=0.5, align='center', - color = '#D8B2C6' - ) - ax[2].spines['left'].set_position(('outward', 8)) - ax[2].spines['bottom'].set_position(('outward', 5)) - ax[2].spines['top'].set_visible(False) - ax[2].spines['left'].set_visible(False) - ax[2].spines['right'].set_visible(False) - ax[2].set_ylabel('Density') + ax[2].bar( + network_list, + [density[i] for i in network_list], + width=0.5, + align="center", + color="#D8B2C6", + ) + ax[2].spines["left"].set_position(("outward", 8)) + ax[2].spines["bottom"].set_position(("outward", 5)) + ax[2].spines["top"].set_visible(False) + ax[2].spines["left"].set_visible(False) + ax[2].spines["right"].set_visible(False) + ax[2].set_ylabel("Density") ticks_loc = ax[2].get_xticks() ax[2].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[2].set_xticklabels(network_list, rotation=45, ha='right') - ax[2].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[2].set_xticklabels(network_list, rotation=45, ha="right") + ax[2].yaxis.grid(color="gainsboro", linestyle="dotted") ax[2].set_axisbelow(True) - ax[3].bar(network_list, [average_shortest_path[i] for i in network_list], width=0.5, align='center', - color = '#B6D8A6' - ) - ax[3].spines['left'].set_position(('outward', 8)) - ax[3].spines['bottom'].set_position(('outward', 5)) - ax[3].spines['top'].set_visible(False) - ax[3].spines['left'].set_visible(False) - ax[3].spines['right'].set_visible(False) - ax[3].set_ylabel('Average shortest path length') + ax[3].bar( + network_list, + [average_shortest_path[i] for i in network_list], + width=0.5, + align="center", + color="#B6D8A6", + ) + ax[3].spines["left"].set_position(("outward", 8)) + ax[3].spines["bottom"].set_position(("outward", 5)) + ax[3].spines["top"].set_visible(False) + ax[3].spines["left"].set_visible(False) + 
ax[3].spines["right"].set_visible(False) + ax[3].set_ylabel("Average shortest path length") ticks_loc = ax[3].get_xticks() ax[3].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[3].set_xticklabels(network_list, rotation=45, ha='right') - ax[3].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[3].set_xticklabels(network_list, rotation=45, ha="right") + ax[3].yaxis.grid(color="gainsboro", linestyle="dotted") ax[3].set_axisbelow(True) - fig.suptitle(f'Network properties of Funmap ({feature_type})', fontsize=16) - file_name = f'funmap_{feature_type}_network_properties.pdf' - fig.savefig(output_dir / file_name, bbox_inches='tight') + fig.suptitle(f"Network properties of Funmap ({feature_type})", fontsize=16) + file_name = f"funmap_{feature_type}_network_properties.pdf" + fig.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_name @@ -778,12 +943,12 @@ def merge_and_delete(fig_dir, file_list, output_file): pdf_writer.add_outline_item(os.path.splitext(file)[0], total_page_num) total_page_num = total_page_num + cur_page_num - with open(fig_dir / output_file, 'wb') as fh: + with open(fig_dir / output_file, "wb") as fh: pdf_writer.write(fh) - log.info('figures have been merged.') + log.info("figures have been merged.") for filename in file_list: try: os.remove(fig_dir / filename) except: - log.error(f'{filename} could not be deleted.') + log.error(f"{filename} could not be deleted.") From 74448dad2c1e9a0689aa8f3caafccb8e63317ddc Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 16:26:30 -0500 Subject: [PATCH 20/54] Remove excess logging and fix llr parameters --- funmap/cli.py | 1 + funmap/funmap.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/cli.py b/funmap/cli.py index 8cd7cf34..31bd0336 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -312,6 +312,7 @@ def run(config_file, force_rerun): max_num_edges, step_size, llr_dataset_file, + extra_feature_file, ) log.info("Done.") else: diff --git a/funmap/funmap.py b/funmap/funmap.py index 90076c9b..73616190 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -313,7 +313,6 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - log.info(f"Loading extra features from {extra_feature}") extra_feature_df = pd.read_csv(extra_feature, sep="\t") extra_feature_df.columns.values[0] = "P1" extra_feature_df.columns.values[1] = "P2" From b885a86c7619f21a811ee629ae423abc0b01c4b4 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 17:08:43 -0500 Subject: [PATCH 21/54] fix return value --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 73616190..86cd393b 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -559,7 +559,7 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): def extract_extra_features(all_pairs, ef_df): # convert all_pairs to a dataframe df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) - pd.merge(df, ef_df, on=["P1", "P2"], how="left") + return pd.merge(df, ef_df, on=["P1", "P2"], how="left") def dataset_llr( From 077e066199c078cc8c853015948edd283b67d546 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 09:50:10 -0500 Subject: [PATCH 22/54] skip edge count check for extra features --- funmap/funmap.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 86cd393b..1ef957ba 100644 --- 
a/funmap/funmap.py +++ b/funmap/funmap.py @@ -419,12 +419,18 @@ def train_model(X, y, seed, n_jobs, feature_mapping, model_dir): def compute_llr( - predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, gs_test + predicted_all_pairs, + llr_res_file, + start_edge_num, + max_num_edges, + step_size, + gs_test, + is_extra_feat=False, ): # make sure max_num_edges is smaller than the number of non-NA values - assert max_num_edges < np.count_nonzero( + assert is_extra_feat or max_num_edges < np.count_nonzero( ~np.isnan(predicted_all_pairs.iloc[:, -1].values) - ), f"max_num_edges should be smaller than the number of non-NA values" + ), "max_num_edges should be smaller than the number of non-NA values" cur_col_name = "prediction" cur_results = predicted_all_pairs.nlargest(max_num_edges, cur_col_name) @@ -635,12 +641,7 @@ def dataset_llr( subset_df.columns.values[-1] = "prediction" log.info(f"Calculating llr for extra feature {f} ...") cur_llr_res = compute_llr( - subset_df, - None, - start_edge_num, - max_num_edge, - step_size, - gs_test, + subset_df, None, start_edge_num, max_num_edge, step_size, gs_test, True ) cur_llr_res["dataset"] = f + "_EXTRAFEAT" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) From 157f64c0cdd45de3009954b577dbe84e13869c24 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 10:17:10 -0500 Subject: [PATCH 23/54] check if dataset is extra feature before checking type --- funmap/plotting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/funmap/plotting.py b/funmap/plotting.py index 6a01b74b..57019304 100644 --- a/funmap/plotting.py +++ b/funmap/plotting.py @@ -58,13 +58,13 @@ def plot_llr_comparison( for ds in datasets: start = -1 cur_df = llr_ds[llr_ds["dataset"] == ds] - if name_type_dict[ds].upper() == "RNA": + if ds.endswith("_EXTRAFEAT"): + ds = ds.replace("_EXTRAFEAT", "") + ltype = "-." + elif name_type_dict[ds].upper() == "RNA": ltype = "--" elif name_type_dict[ds].upper() == "PROTEIN": ltype = ":" - elif ds.endswith("_EXTRAFEAT"): - ds = ds.replace("_EXTRAFEAT", "") - ltype = "-." else: ltype = "-" ax.plot(cur_df["k"], cur_df["llr"], linestyle=ltype, label=ds) From 998a418e2a767ec5011149d4ba0a0b446ff35687 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 17:17:23 -0500 Subject: [PATCH 24/54] Add back LLR calculation for extra features --- funmap/funmap.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 6434a802..2a4952b7 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -644,7 +644,36 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info("Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - + if extra_feature is not None: + log.info("Calculating LLR for extra features") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) + extra_feature_df = extract_extra_features( + all_pairs, extra_feature_df + ) # filter out unused pairs + features = extra_feature_df.columns.values[2:] + for f in features: + subset_df = extra_feature_df[["P1", "P2", f]] + subset_df.columns.values[-1] = "prediction" + log.info(f"Calculating llr for extra feature {f} ...") + cur_llr_res = compute_llr( + subset_df, None, start_edge_num, max_num_edge, step_size, gs_test, True + ) + cur_llr_res["dataset"] = f + "_EXTRAFEAT" + llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") return llr_ds From f6bb066d64d6576c6d86b3b63dc5c78ba76f8b3d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 25 Sep 2024 14:10:58 -0500 Subject: [PATCH 25/54] Process extra feature file once instead of multiple times --- funmap/cli.py | 14 ++++++++++---- funmap/funmap.py | 45 +++++++++------------------------------------ funmap/utils.py | 28 +++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/funmap/cli.py b/funmap/cli.py index 0a7e7f40..3560eeb4 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -26,6 +26,7 @@ check_extra_feature_file, check_gold_standard_file, cleanup_experiment, + process_extra_feature, setup_experiment, ) @@ -120,6 +121,11 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) extra_feature_file = cfg["extra_feature_file"] + if extra_feature_file is not None: + log.info("Loading extra feature file into dataframe") + extra_feature_df = process_extra_feature(extra_feature_file) + else: + extra_feature_df = None # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): # return gs_file = cfg["gs_file"] @@ -192,7 +198,7 @@ def run(config_file, force_rerun): "mr_dict": mr_dict, "feature_type": feature_type, "gs_file": gs_file, - "extra_feature_file": extra_feature_file, + "extra_feature_df": extra_feature_df, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, @@ -255,7 +261,7 @@ def run(config_file, force_rerun): "ppi_feature": ppi_feature, "cc_dict": cc_dict, "mr_dict": mr_dict, - "extra_feature_file": extra_feature_file, + "extra_feature_df": extra_feature_df, "prediction_dir": prediction_dir, "output_file": predicted_all_pairs_file, "n_jobs": n_jobs, @@ -326,7 +332,7 @@ def run(config_file, force_rerun): max_num_edges, step_size, llr_dataset_file, - extra_feature_file, + extra_feature_df, ) log.info("Done.") else: @@ -356,7 +362,7 @@ def run(config_file, force_rerun): "feature_type": "cc", "gs_file": gs_file, # no extra feature for plotting - "extra_feature_file": None, + "extra_feature_df": None, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, diff --git a/funmap/funmap.py b/funmap/funmap.py index 2a4952b7..b0e405f7 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -23,7 
+23,6 @@ from funmap.data_urls import network_info from funmap.logger import setup_logger from funmap.utils import ( - check_extra_feature_file, get_data_dict, is_url_scheme, read_csv_with_md5_check, @@ -290,7 +289,7 @@ def get_1d_indices(i_array, j_array, n): def extract_features( - df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None + df, feature_type, cc_dict, ppi_feature=None, extra_feature_df=None, mr_dict=None ): """ extract_features - creates the final feature `pandas` dataframe used by xgboost @@ -326,20 +325,7 @@ def extract_features( ) # TODO: add extra features if provided - if extra_feature is not None: - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[1] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - extra_feature_df = extra_feature_df.drop_duplicates( - subset=["P1", "P2"], keep="last" - ) + if extra_feature_df is not None: df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" @@ -506,7 +492,7 @@ def prepare_gs_data(**kwargs): gs_file = kwargs["gs_file"] gs_file_md5 = None feature_type = kwargs["feature_type"] - extra_feature_file = kwargs["extra_feature_file"] + extra_feature_df = kwargs["extra_feature_df"] valid_id_list = kwargs["valid_id_list"] test_size = kwargs["test_size"] seed = kwargs["seed"] @@ -530,10 +516,10 @@ def prepare_gs_data(**kwargs): else: ppi_feature = None gs_train_df = extract_features( - gs_train, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict ) gs_test_df = extract_features( - gs_test, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict ) # store both the ids with gs_test_df for later use @@ -599,7 +585,7 @@ def dataset_llr( max_num_edge, step_size, llr_dataset_file, - extra_feature, + extra_feature_df, ): llr_ds = pd.DataFrame() all_ids_sorted = sorted(all_ids) @@ -644,21 +630,8 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info("Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - if extra_feature is not None: + if extra_feature_df is not None: log.info("Calculating LLR for extra features") - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[1] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - extra_feature_df = extra_feature_df.drop_duplicates( - subset=["P1", "P2"], keep="last" - ) extra_feature_df = extract_extra_features( all_pairs, extra_feature_df ) # filter out unused pairs @@ -751,7 +724,7 @@ def predict_all_pairs( ppi_feature, cc_dict, mr_dict, - extra_feature_file, + extra_feature_df, prediction_dir, output_file, n_jobs=1, @@ -784,7 +757,7 @@ def process_and_save_chunk(start_idx, chunk_id): feature_type, cc_dict, cur_ppi_feature, - extra_feature_file, + extra_feature_df, mr_dict, ) predictions = model.predict_proba(feature_df) diff --git a/funmap/utils.py b/funmap/utils.py index cc00a09c..81bb68cb 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -1,14 +1,16 @@ import csv -import yaml +import hashlib import os -import tarfile import re -import hashlib +import shutil +import tarfile import urllib -from urllib.parse import urlparse from pathlib import Path +from urllib.parse import urlparse + import pandas as pd -import shutil +import yaml + from funmap.data_urls import misc_urls as urls from funmap.logger import setup_logger @@ -553,3 +555,19 @@ def check_extra_feature_file(file_path, missing_value="NA"): return False return True + + +def process_extra_feature(extra_feature_file) -> pd.DataFrame: + extra_feature_df = pd.read_csv(extra_feature_file, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) From 2b1f63038467be6a1b82a1cc2fb98ae95556aab3 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 10:51:52 -0500 Subject: [PATCH 26/54] Migrate project structure to use maturin for rust file import --- .github/workflows/CI.yml | 181 +++++++++++++++++++++++++ .gitignore | 11 ++ Cargo.lock | 171 +++++++++++++++++++++++ Cargo.toml | 12 ++ pyproject.toml | 47 +++++++ pyproject_maturin.toml | 16 +++ {funmap => python/funmap}/__init__.py | 0 {funmap => python/funmap}/cli.py | 7 +- {funmap => python/funmap}/data_urls.py | 0 {funmap => python/funmap}/funmap.py | 0 {funmap => python/funmap}/logger.py | 0 {funmap => python/funmap}/plotting.py | 0 {funmap => python/funmap}/saving.py | 0 {funmap => python/funmap}/utils.py | 0 requirements-dev.lock | 116 ++++++++++++++++ requirements.lock | 115 ++++++++++++++++ src/lib.rs | 14 ++ 17 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/CI.yml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 pyproject.toml create mode 100644 pyproject_maturin.toml rename {funmap => python/funmap}/__init__.py (100%) rename {funmap => python/funmap}/cli.py (99%) rename {funmap => python/funmap}/data_urls.py (100%) rename {funmap => python/funmap}/funmap.py (100%) rename {funmap => python/funmap}/logger.py (100%) rename {funmap 
=> python/funmap}/plotting.py (100%) rename {funmap => python/funmap}/saving.py (100%) rename {funmap => python/funmap}/utils.py (100%) create mode 100644 requirements-dev.lock create mode 100644 requirements.lock create mode 100644 src/lib.rs diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 00000000..ad3ec43f --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,181 @@ +# This file is autogenerated by maturin v1.7.4 +# To update, run +# +# maturin generate-ci github +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + - runner: ubuntu-latest + target: s390x + - runner: ubuntu-latest + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-12 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: 
wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: "startsWith(github.ref, 'refs/tags/')" + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.gitignore b/.gitignore index 7ea9ea8f..eba216d4 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,14 @@ ENV/ .DS_Store results *old + +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..ded191fc --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,171 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "funmap_lib" +version = "0.1.0" +dependencies = [ + "pyo3", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "libc" +version = "0.2.159" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "portable-atomic" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d30538d42559de6b034bc76fd6dd4c38961b1ee5c6c56e3808c50128fdbc22ce" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + 
"portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..ecac49bc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "funmap_lib" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "funmap" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = "0.22.0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..d78d7f95 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +[project] +name = "funmap" +version = "0.2.0" +description = "generate gene co-function networks using omics data" +authors = [{ name = "Zhiao Shi", email = "zhiao.shi@gmail.com" }] +dependencies = [ + "PyPDF2==3.0.1", + "click==8.1.7", + "h5py==3.10.0", + "imbalanced-learn==0.11.0", + "joblib==1.3.2", + "matplotlib==3.7.3", + "matplotlib_venn==0.11.9", + "networkx==3.1", + "numpy==1.24.4", + "pandas==2.0.3", + "powerlaw==1.5", + "pyarrow==13.0.0", + "pyyaml==6.0.1", + "scikit-learn==1.3.2", + "scipy==1.10.1", + "seaborn==0.13.0", + "tables==3.8.0", + "tqdm==4.66.1", + "xgboost==2.0.0", +] +readme = "README.md" +license = { text = "MIT license" } +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: 
Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] + +[build-system] +requires = ["maturin>=1,<2"] +build-backend = "maturin" + +[tool.maturin] +features = ["pyo3/extension-module"] +python-source = "python" +module-name = "funmap._lib" + +[tool.rye] +managed = true +dev-dependencies = ["maturin>=1.7.4"] diff --git a/pyproject_maturin.toml b/pyproject_maturin.toml new file mode 100644 index 00000000..bb2585da --- /dev/null +++ b/pyproject_maturin.toml @@ -0,0 +1,16 @@ +[build-system] +requires = ["maturin>=1.7,<2.0"] +build-backend = "maturin" + +[project] +name = "funmap" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] +[tool.maturin] +features = ["pyo3/extension-module"] +python-source = "funmap" diff --git a/funmap/__init__.py b/python/funmap/__init__.py similarity index 100% rename from funmap/__init__.py rename to python/funmap/__init__.py diff --git a/funmap/cli.py b/python/funmap/cli.py similarity index 99% rename from funmap/cli.py rename to python/funmap/cli.py index 3560eeb4..09640f4f 100644 --- a/funmap/cli.py +++ b/python/funmap/cli.py @@ -6,7 +6,7 @@ import click import numpy as np import pandas as pd - +from funmap import _lib as funmap_lib from funmap import __version__ from funmap.data_urls import misc_urls as urls from funmap.funmap import ( @@ -86,6 +86,11 @@ def qc(config_file, force_rerun): log.info("QC complete") +@cli.command() +def rust(): + print(f"sum: {funmap_lib.sum_as_string(5, 10)}") + + @cli.command(help="run funmap") @click.option( "--config-file", diff --git a/funmap/data_urls.py b/python/funmap/data_urls.py similarity index 100% rename from funmap/data_urls.py rename to python/funmap/data_urls.py diff --git a/funmap/funmap.py b/python/funmap/funmap.py similarity index 100% rename from funmap/funmap.py rename to python/funmap/funmap.py diff --git a/funmap/logger.py b/python/funmap/logger.py similarity index 100% rename from funmap/logger.py rename to python/funmap/logger.py diff --git a/funmap/plotting.py b/python/funmap/plotting.py similarity index 100% rename from funmap/plotting.py rename to python/funmap/plotting.py diff --git a/funmap/saving.py b/python/funmap/saving.py similarity index 100% rename from funmap/saving.py rename to python/funmap/saving.py diff --git a/funmap/utils.py b/python/funmap/utils.py similarity index 100% rename from funmap/utils.py rename to python/funmap/utils.py diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 00000000..d3de5995 --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,116 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+blosc2==2.0.0 + # via tables +click==8.1.7 + # via funmap +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +cython==3.0.11 + # via tables +fonttools==4.54.1 + # via matplotlib +h5py==3.10.0 + # via funmap +imbalanced-learn==0.11.0 + # via funmap +joblib==1.3.2 + # via funmap + # via imbalanced-learn + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +matplotlib==3.7.3 + # via funmap + # via matplotlib-venn + # via powerlaw + # via seaborn +matplotlib-venn==0.11.9 + # via funmap +maturin==1.7.4 +mpmath==1.3.0 + # via powerlaw +msgpack==1.1.0 + # via blosc2 +networkx==3.1 + # via funmap +numexpr==2.10.1 + # via tables +numpy==1.24.4 + # via contourpy + # via funmap + # via h5py + # via imbalanced-learn + # via matplotlib + # via matplotlib-venn + # via numexpr + # via pandas + # via powerlaw + # via pyarrow + # via scikit-learn + # via scipy + # via seaborn + # via tables + # via xgboost +packaging==24.1 + # via matplotlib + # via tables +pandas==2.0.3 + # via funmap + # via seaborn +pillow==10.4.0 + # via matplotlib +powerlaw==1.5 + # via funmap +py-cpuinfo==9.0.0 + # via tables +pyarrow==13.0.0 + # via funmap +pyparsing==3.1.4 + # via matplotlib +pypdf2==3.0.1 + # via funmap +python-dateutil==2.9.0.post0 + # via matplotlib + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.1 + # via funmap +scikit-learn==1.3.2 + # via funmap + # via imbalanced-learn +scipy==1.10.1 + # via funmap + # via imbalanced-learn + # via matplotlib-venn + # via powerlaw + # via scikit-learn + # via xgboost +seaborn==0.13.0 + # via funmap +six==1.16.0 + # via python-dateutil +tables==3.8.0 + # via funmap +threadpoolctl==3.5.0 + # via imbalanced-learn + # via scikit-learn +tqdm==4.66.1 + # via funmap +tzdata==2024.2 + # via pandas +xgboost==2.0.0 + # via funmap diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 00000000..4ca03409 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,115 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+blosc2==2.0.0 + # via tables +click==8.1.7 + # via funmap +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +cython==3.0.11 + # via tables +fonttools==4.54.1 + # via matplotlib +h5py==3.10.0 + # via funmap +imbalanced-learn==0.11.0 + # via funmap +joblib==1.3.2 + # via funmap + # via imbalanced-learn + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +matplotlib==3.7.3 + # via funmap + # via matplotlib-venn + # via powerlaw + # via seaborn +matplotlib-venn==0.11.9 + # via funmap +mpmath==1.3.0 + # via powerlaw +msgpack==1.1.0 + # via blosc2 +networkx==3.1 + # via funmap +numexpr==2.10.1 + # via tables +numpy==1.24.4 + # via contourpy + # via funmap + # via h5py + # via imbalanced-learn + # via matplotlib + # via matplotlib-venn + # via numexpr + # via pandas + # via powerlaw + # via pyarrow + # via scikit-learn + # via scipy + # via seaborn + # via tables + # via xgboost +packaging==24.1 + # via matplotlib + # via tables +pandas==2.0.3 + # via funmap + # via seaborn +pillow==10.4.0 + # via matplotlib +powerlaw==1.5 + # via funmap +py-cpuinfo==9.0.0 + # via tables +pyarrow==13.0.0 + # via funmap +pyparsing==3.1.4 + # via matplotlib +pypdf2==3.0.1 + # via funmap +python-dateutil==2.9.0.post0 + # via matplotlib + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.1 + # via funmap +scikit-learn==1.3.2 + # via funmap + # via imbalanced-learn +scipy==1.10.1 + # via funmap + # via imbalanced-learn + # via matplotlib-venn + # via powerlaw + # via scikit-learn + # via xgboost +seaborn==0.13.0 + # via funmap +six==1.16.0 + # via python-dateutil +tables==3.8.0 + # via funmap +threadpoolctl==3.5.0 + # via imbalanced-learn + # via scikit-learn +tqdm==4.66.1 + # via funmap +tzdata==2024.2 + # via pandas +xgboost==2.0.0 + # via funmap diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..38f97a17 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,14 @@ +use pyo3::prelude::*; + +/// Formats the sum of two numbers as string. +#[pyfunction] +fn sum_as_string(a: usize, b: usize) -> PyResult { + Ok((a + b).to_string()) +} + +/// A Python module implemented in Rust. 
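+/// Exposed to Python as `funmap._lib` (the `module-name` configured for maturin in
+/// pyproject.toml); from Python, `from funmap import _lib` then
+/// `_lib.sum_as_string(5, 10)` returns the string `"15"`, as exercised by the
+/// `funmap rust` CLI command added in cli.py.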
+#[pymodule] +fn _lib(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; + Ok(()) +} From 1a0db667ae8f9cf3c21c3ddab5692dda6b95b89c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 12:55:17 -0500 Subject: [PATCH 27/54] Allow funmap to be run as cli --- pyproject.toml | 2 ++ pyproject_maturin.toml | 16 ---------------- src/lib.rs | 3 ++- 3 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 pyproject_maturin.toml diff --git a/pyproject.toml b/pyproject.toml index d78d7f95..2f5df509 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,8 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +[project.scripts] +funmap = "funmap.cli:cli" [build-system] requires = ["maturin>=1,<2"] diff --git a/pyproject_maturin.toml b/pyproject_maturin.toml deleted file mode 100644 index bb2585da..00000000 --- a/pyproject_maturin.toml +++ /dev/null @@ -1,16 +0,0 @@ -[build-system] -requires = ["maturin>=1.7,<2.0"] -build-backend = "maturin" - -[project] -name = "funmap" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -dynamic = ["version"] -[tool.maturin] -features = ["pyo3/extension-module"] -python-source = "funmap" diff --git a/src/lib.rs b/src/lib.rs index 38f97a17..69bb5572 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,8 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { /// A Python module implemented in Rust. #[pymodule] -fn _lib(m: &Bound<'_, PyModule>) -> PyResult<()> { +#[pyo3(name = "_lib")] +fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; Ok(()) } From 9e534788ccd3e3ae6fac594d1b744138ba4e1987 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 16:54:53 -0500 Subject: [PATCH 28/54] formatting changes --- python/funmap/data_urls.py | 23 ++++++++++++----------- python/funmap/logger.py | 9 ++++++--- python/funmap/saving.py | 5 ++++- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/funmap/data_urls.py b/python/funmap/data_urls.py index 222ce369..e0c6915c 100644 --- a/python/funmap/data_urls.py +++ b/python/funmap/data_urls.py @@ -1,17 +1,18 @@ misc_urls = { - 'reactome_gold_standard': 'https://figshare.com/ndownloader/files/38647601', - 'reactome_gold_standard_md5': '671942763b6a7c32506cba1ed9900fe6', - 'funmap_blacklist': 'https://figshare.com/ndownloader/files/39033977', - 'mapping_file': 'https://figshare.com/ndownloader/files/39033971' + "reactome_gold_standard": "https://figshare.com/ndownloader/files/38647601", + "reactome_gold_standard_md5": "671942763b6a7c32506cba1ed9900fe6", + "funmap_blacklist": "https://figshare.com/ndownloader/files/39033977", + "mapping_file": "https://figshare.com/ndownloader/files/39033971", } # the information about other networks is fixed for now network_info = { - 'name': ['BioGRID', 'BioPlex', 'HI-union', 'STRING'], - 'type': ['BioGRID', 'BioPlex', 'HI', 'STRING'], - 'url': ['https://figshare.com/ndownloader/files/39125054', - 'https://figshare.com/ndownloader/files/39125051', - 'https://figshare.com/ndownloader/files/39125093', - 'https://figshare.com/ndownloader/files/39125090' - ] + "name": ["BioGRID", "BioPlex", "HI-union", "STRING"], + "type": ["BioGRID", "BioPlex", "HI", "STRING"], + "url": [ + 
"https://figshare.com/ndownloader/files/39125054", + "https://figshare.com/ndownloader/files/39125051", + "https://figshare.com/ndownloader/files/39125093", + "https://figshare.com/ndownloader/files/39125090", + ], } diff --git a/python/funmap/logger.py b/python/funmap/logger.py index ef53b84f..d994cdf9 100644 --- a/python/funmap/logger.py +++ b/python/funmap/logger.py @@ -7,6 +7,7 @@ LOG_LEVEL = logging.INFO + def setup_logging(run_config, log_config="logging.yml") -> None: """ Setup ``logging.config`` @@ -44,7 +45,7 @@ def setup_logging(run_config, log_config="logging.yml") -> None: handler_critical.setLevel(logging.CRITICAL) # Create formatters (if needed) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") handler_debug.setFormatter(formatter) handler_info.setFormatter(formatter) handler_warning.setFormatter(formatter) @@ -69,7 +70,9 @@ def setup_logging(run_config, log_config="logging.yml") -> None: logger.addHandler(console_handler) # Log a warning message to the console - logger.warning(f'"{log_config}" not found. Using basicConfig with custom log files.') + logger.warning( + f'"{log_config}" not found. Using basicConfig with custom log files.' + ) return with open(log_config, "rt") as f: @@ -85,6 +88,6 @@ def setup_logging(run_config, log_config="logging.yml") -> None: def setup_logger(name): - log = logging.getLogger(f'funmap.{name}') + log = logging.getLogger(f"funmap.{name}") log.setLevel(LOG_LEVEL) return log diff --git a/python/funmap/saving.py b/python/funmap/saving.py index a92eac8c..fbbff116 100644 --- a/python/funmap/saving.py +++ b/python/funmap/saving.py @@ -3,6 +3,7 @@ LOG_DIR = "logs" + def ensure_exists(p: Path) -> Path: """ Helper to ensure a directory exists. @@ -11,14 +12,16 @@ def ensure_exists(p: Path) -> Path: p.mkdir(parents=True, exist_ok=True) return p + def arch_path(config_file) -> Path: """ Construct a path based on the name of a configuration file eg. 'saved/EfficientNet' """ config = yaml.safe_load(open(config_file)) - p = Path(config['results_dir']) / config['name'] + p = Path(config["results_dir"]) / config["name"] return ensure_exists(p) + def log_path(config_file) -> Path: p = arch_path(config_file) / LOG_DIR return ensure_exists(p) From bc18d7281e78fcb54a9bb94ef1d63f7ae39e58d9 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 16:56:50 -0500 Subject: [PATCH 29/54] Add only_extra_features key to config --- python/funmap/cli.py | 2 +- python/funmap/funmap.py | 99 +++++++++++++++++----------------- python/funmap/utils.py | 117 ++++++++++++++++++++++------------------ 3 files changed, 113 insertions(+), 105 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 09640f4f..7635dcfe 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -222,7 +222,7 @@ def run(config_file, force_rerun): os.path.exists(file_path) for file_path in ml_model_file.values() ) if all_model_exist: - log.info(f"Trained model(s) exists. Loading model(s) ...") + log.info("Trained model(s) exists. 
Loading model(s) ...") ml_model_dict = {} # feature: ex or ei for feature in ml_model_file: diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index b0e405f7..78e0cf3c 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -147,53 +147,59 @@ def compute_features(cfg, feature_type, min_sample_count, output_dir): """Compute the pearson correlation coefficient for each edge in the list of edges and for each""" data_dict, all_valid_ids = get_data_dict(cfg, min_sample_count) cc_dict = {} - for i in data_dict: - cc_file = os.path.join(output_dir, f"cc_{i}.h5") - cc_dict[i] = cc_file - mr_dict = {} - for i in data_dict: - mr_file = os.path.join(output_dir, f"mr_{i}.h5") - mr_dict[i] = mr_file - - all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) - if all_cc_exist: - log.info("All cc files exist. Skipping feature computation.") - if feature_type == "cc": - return cc_dict, mr_dict, all_valid_ids + if not cfg["only_extra_features"]: + for i in data_dict: + cc_file = os.path.join(output_dir, f"cc_{i}.h5") + cc_dict[i] = cc_file + + mr_dict = {} + for i in data_dict: + mr_file = os.path.join(output_dir, f"mr_{i}.h5") + mr_dict[i] = mr_file + + all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) + if all_cc_exist: + log.info("All cc files exist. Skipping feature computation.") + if feature_type == "cc": + return cc_dict, mr_dict, all_valid_ids - if feature_type == "mr": - all_mr_exist = all(os.path.exists(file_path) for file_path in mr_dict.values()) - if all_cc_exist and all_mr_exist: - log.info("All mr files exist. Skipping feature computation.") - return cc_dict, mr_dict, all_valid_ids - - log.debug(f"Computing {feature_type} features") - for i in data_dict: - cc_file = cc_dict[i] - if os.path.exists(cc_file): - continue - log.info(f"Computing pearson correlation coefficient matrix for {i}") - x = data_dict[i].values.astype(np.float32) - df = pd.DataFrame(x) - corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) - arr = corr_matrix.values - upper_indices = np.triu_indices(arr.shape[0]) - with h5py.File(cc_dict[i], "w") as hf: - # only store the upper triangle part - hf.create_dataset("cc", data=arr[upper_indices]) - hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) - cc_dict[i] = cc_file - - # compute pairwise mutual rank features if feature_type == "mr": - log.info(f"Computing mutual rank matrix for {i}") - arr_mr = pairwise_mutual_rank(arr) - upper_indices = np.triu_indices(arr_mr.shape[0]) - with h5py.File(mr_dict[i], "w") as hf: + all_mr_exist = all( + os.path.exists(file_path) for file_path in mr_dict.values() + ) + if all_cc_exist and all_mr_exist: + log.info("All mr files exist. 
Skipping feature computation.") + return cc_dict, mr_dict, all_valid_ids + + log.debug(f"Computing {feature_type} features") + for i in data_dict: + cc_file = cc_dict[i] + if os.path.exists(cc_file): + continue + log.info(f"Computing pearson correlation coefficient matrix for {i}") + x = data_dict[i].values.astype(np.float32) + df = pd.DataFrame(x) + corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) + arr = corr_matrix.values + upper_indices = np.triu_indices(arr.shape[0]) + with h5py.File(cc_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset("cc", data=arr[upper_indices]) hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) + cc_dict[i] = cc_file + + # compute pairwise mutual rank features + if feature_type == "mr": + log.info(f"Computing mutual rank matrix for {i}") + arr_mr = pairwise_mutual_rank(arr) + upper_indices = np.triu_indices(arr_mr.shape[0]) + with h5py.File(mr_dict[i], "w") as hf: + # only store the upper triangle part + hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset( + "ids", data=data_dict[i].columns.values.astype("S") + ) return cc_dict, mr_dict, all_valid_ids @@ -232,15 +238,6 @@ def assemble_feature_df(h5_file_mapping, df, dataset="cc"): file_keys = list(h5_file_mapping.keys()) feature_df = pd.DataFrame(columns=file_keys + ["label"]) - def get_1d_indices(i_array, j_array, n): - # Ensure i and j are within bounds - mask = (i_array < n) & (j_array < n) - i = np.minimum(i_array[mask], j_array[mask]) - j = np.maximum(i_array[mask], j_array[mask]) - - # Calculate 1D indices - return i * n - i * (i - 1) // 2 + (j - i) - def get_1d_indices(i_array, j_array, n): # Ensure i and j are within bounds mask = (i_array < n) & (j_array < n) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 81bb68cb..533d52ca 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -86,49 +86,50 @@ def get_data_dict(config, min_sample_count=15): the data from the corresponding file. """ - data_file = config["data_path"] + all_valid_ids = set() data_dict = {} - if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: - mapping = pd.read_csv(urls["mapping_file"], sep="\t") - # extract the data file from the tar.gz file - tmp_dir = "tmp_data" - if not os.path.exists(tmp_dir): - os.mkdir(tmp_dir) - os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") - # gene ids are gene symbols - for dt in config["data_files"]: - log.info(f"processing ... {dt['name']}") - cur_feature = dt["name"] - # cur_file = get_obj_from_tgz(data_file, dt['path']) + if not config["only_extra_features"]: + data_file = config["data_path"] + if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: + mapping = pd.read_csv(urls["mapping_file"], sep="\t") # extract the data file from the tar.gz file - # cur_data= get_obj_from_tgz(data_file, dt['path']) - cur_data = pd.read_csv( - os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 - ) - if cur_data.shape[1] < min_sample_count: - log.info(f"... {dt['name']} ... 
not enough samples, skipped") - continue - cur_data = cur_data.T - # exclude cohort with sample number < min_sample_count - # remove noncoding genes first - if config["filter_noncoding_genes"]: - coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] - coding_genes = list(set(coding["gene_name"].to_list())) - cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] - # duplicated columns, for now select the last column - cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] - data_dict[cur_feature] = cur_data - - shutil.rmtree(tmp_dir) - - log.info("filtering out non valid ids ...") - all_valid_ids = set() - for i in data_dict: - cur_data = data_dict[i] - is_valid = cur_data.notna().sum() >= min_sample_count - # valid_count = np.sum(is_valid) - valid_p = cur_data.columns[is_valid].values - all_valid_ids = all_valid_ids.union(set(valid_p)) + tmp_dir = "tmp_data" + if not os.path.exists(tmp_dir): + os.mkdir(tmp_dir) + os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") + # gene ids are gene symbols + for dt in config["data_files"]: + log.info(f"processing ... {dt['name']}") + cur_feature = dt["name"] + # cur_file = get_obj_from_tgz(data_file, dt['path']) + # extract the data file from the tar.gz file + # cur_data= get_obj_from_tgz(data_file, dt['path']) + cur_data = pd.read_csv( + os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 + ) + if cur_data.shape[1] < min_sample_count: + log.info(f"... {dt['name']} ... not enough samples, skipped") + continue + cur_data = cur_data.T + # exclude cohort with sample number < min_sample_count + # remove noncoding genes first + if config["filter_noncoding_genes"]: + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) + cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] + # duplicated columns, for now select the last column + cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] + data_dict[cur_feature] = cur_data + + shutil.rmtree(tmp_dir) + + log.info("filtering out non valid ids ...") + for i in data_dict: + cur_data = data_dict[i] + is_valid = cur_data.notna().sum() >= min_sample_count + # valid_count = np.sum(is_valid) + valid_p = cur_data.columns[is_valid].values + all_valid_ids = all_valid_ids.union(set(valid_p)) if config["extra_feature_file"] is not None: log.info("Importing extra feature file") # TODO: Import extra feature file @@ -342,6 +343,7 @@ def get_config(cfg_file: Path): "max_num_edges": 250000, "step_size": 1000, "lr_cutoff": 50, + "only_extra_features": False, } with open(cfg_file, "r") as fh: @@ -352,6 +354,12 @@ def get_config(cfg_file: Path): cfg["task"] = cfg_dict["task"] assert cfg["task"] in ["protein_func", "kinase_func"] + if "only_extra_features" in cfg_dict: + cfg["only_extra_features"] = cfg_dict["only_extra_features"] + if cfg["only_extra_features"] and "extra_feature_file" not in cfg_dict: + msg = "Extra feature file is not defined when only_extra_features = True" + print(msg) + raise ValueError(msg) if "seed" in cfg_dict: cfg["seed"] = cfg_dict["seed"] @@ -404,21 +412,24 @@ def get_config(cfg_file: Path): if "results_dir" in cfg_dict: cfg["results_dir"] = cfg_dict["results_dir"] + "/" + cfg_dict["name"] - if "data_files" not in cfg_dict: + if "data_files" not in cfg_dict and not cfg.get("only_extra_features", False): raise ValueError("data_files not specified in config file") # Check all files listed under data_files are also in the tar.gz 
file - data_files = cfg_dict["data_files"] - # List all the files in the tar.gz file - with tarfile.open(cfg["data_path"], "r:gz") as tar: - tar_files = {Path(file).name for file in tar.getnames()} - - # Check if all files in data_files are in tar_files - if not all(file["path"] in tar_files for file in data_files): - print("Files listed under data_files are not in the tar.gz file!") - raise ValueError("Files listed under data_files are not in the tar.gz file!") - - cfg["data_files"] = cfg_dict["data_files"] + if not cfg["only_extra_features"]: + data_files = cfg_dict["data_files"] + # List all the files in the tar.gz file + with tarfile.open(cfg["data_path"], "r:gz") as tar: + tar_files = {Path(file).name for file in tar.getnames()} + + # Check if all files in data_files are in tar_files + if not all(file["path"] in tar_files for file in data_files): + print("Files listed under data_files are not in the tar.gz file!") + raise ValueError( + "Files listed under data_files are not in the tar.gz file!" + ) + + cfg["data_files"] = cfg_dict["data_files"] if "rp_pairs" in cfg_dict: cfg["rp_pairs"] = cfg_dict["rp_pairs"] From 000ae50261b19ae8283de052cf24cf8436283c1e Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 30 Sep 2024 09:39:09 -0500 Subject: [PATCH 30/54] update to use latest packages --- pyproject.toml | 43 ++++++++++++++++++++++--------------------- requirements-dev.lock | 43 ++++++++++++++++++++++++------------------- requirements.lock | 43 ++++++++++++++++++++++++------------------- 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f5df509..043ed966 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,27 +3,6 @@ name = "funmap" version = "0.2.0" description = "generate gene co-function networks using omics data" authors = [{ name = "Zhiao Shi", email = "zhiao.shi@gmail.com" }] -dependencies = [ - "PyPDF2==3.0.1", - "click==8.1.7", - "h5py==3.10.0", - "imbalanced-learn==0.11.0", - "joblib==1.3.2", - "matplotlib==3.7.3", - "matplotlib_venn==0.11.9", - "networkx==3.1", - "numpy==1.24.4", - "pandas==2.0.3", - "powerlaw==1.5", - "pyarrow==13.0.0", - "pyyaml==6.0.1", - "scikit-learn==1.3.2", - "scipy==1.10.1", - "seaborn==0.13.0", - "tables==3.8.0", - "tqdm==4.66.1", - "xgboost==2.0.0", -] readme = "README.md" license = { text = "MIT license" } requires-python = ">=3.8" @@ -32,6 +11,28 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +dependencies = [ + "h5py>=3.12.1", + "numpy>=2.1.1", + "seaborn>=0.13.2", + "tqdm>=4.66.5", + "xgboost>=2.1.1", + "pyyaml>=6.0.2", + "scipy>=1.14.1", + "pyarrow>=17.0.0", + "pandas>=2.2.3", + "joblib>=1.4.2", + "matplotlib>=3.9.2", + "scikit-learn>=1.5.2", + "imbalanced-learn>=0.12.3", + "pypdf2>=3.0.1", + "matplotlib-venn>=1.1.1", + "networkx>=3.3", + "powerlaw>=1.5", + "click>=8.1.7", + "tables>=3.10.1", +] + [project.scripts] funmap = "funmap.cli:cli" diff --git a/requirements-dev.lock b/requirements-dev.lock index d3de5995..8ff4123e 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -10,7 +10,7 @@ # universal: false -e file:. 
-blosc2==2.0.0 +blosc2==2.7.1 # via tables click==8.1.7 # via funmap @@ -18,37 +18,39 @@ contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 - # via tables fonttools==4.54.1 # via matplotlib -h5py==3.10.0 +h5py==3.12.1 # via funmap -imbalanced-learn==0.11.0 +imbalanced-learn==0.12.3 # via funmap -joblib==1.3.2 +joblib==1.4.2 # via funmap # via imbalanced-learn # via scikit-learn kiwisolver==1.4.7 # via matplotlib -matplotlib==3.7.3 +matplotlib==3.9.2 # via funmap # via matplotlib-venn # via powerlaw # via seaborn -matplotlib-venn==0.11.9 +matplotlib-venn==1.1.1 # via funmap maturin==1.7.4 mpmath==1.3.0 # via powerlaw msgpack==1.1.0 # via blosc2 -networkx==3.1 +ndindex==1.9.2 + # via blosc2 +networkx==3.3 # via funmap numexpr==2.10.1 + # via blosc2 # via tables -numpy==1.24.4 +numpy==2.1.1 + # via blosc2 # via contourpy # via funmap # via h5py @@ -67,7 +69,7 @@ numpy==1.24.4 packaging==24.1 # via matplotlib # via tables -pandas==2.0.3 +pandas==2.2.3 # via funmap # via seaborn pillow==10.4.0 @@ -75,8 +77,9 @@ pillow==10.4.0 powerlaw==1.5 # via funmap py-cpuinfo==9.0.0 + # via blosc2 # via tables -pyarrow==13.0.0 +pyarrow==17.0.0 # via funmap pyparsing==3.1.4 # via matplotlib @@ -87,30 +90,32 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via funmap -scikit-learn==1.3.2 +scikit-learn==1.5.2 # via funmap # via imbalanced-learn -scipy==1.10.1 +scipy==1.14.1 # via funmap # via imbalanced-learn # via matplotlib-venn # via powerlaw # via scikit-learn # via xgboost -seaborn==0.13.0 +seaborn==0.13.2 # via funmap six==1.16.0 # via python-dateutil -tables==3.8.0 +tables==3.10.1 # via funmap threadpoolctl==3.5.0 # via imbalanced-learn # via scikit-learn -tqdm==4.66.1 +tqdm==4.66.5 # via funmap +typing-extensions==4.12.2 + # via tables tzdata==2024.2 # via pandas -xgboost==2.0.0 +xgboost==2.1.1 # via funmap diff --git a/requirements.lock b/requirements.lock index 4ca03409..43e3816b 100644 --- a/requirements.lock +++ b/requirements.lock @@ -10,7 +10,7 @@ # universal: false -e file:. 
-blosc2==2.0.0 +blosc2==2.7.1 # via tables click==8.1.7 # via funmap @@ -18,36 +18,38 @@ contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 - # via tables fonttools==4.54.1 # via matplotlib -h5py==3.10.0 +h5py==3.12.1 # via funmap -imbalanced-learn==0.11.0 +imbalanced-learn==0.12.3 # via funmap -joblib==1.3.2 +joblib==1.4.2 # via funmap # via imbalanced-learn # via scikit-learn kiwisolver==1.4.7 # via matplotlib -matplotlib==3.7.3 +matplotlib==3.9.2 # via funmap # via matplotlib-venn # via powerlaw # via seaborn -matplotlib-venn==0.11.9 +matplotlib-venn==1.1.1 # via funmap mpmath==1.3.0 # via powerlaw msgpack==1.1.0 # via blosc2 -networkx==3.1 +ndindex==1.9.2 + # via blosc2 +networkx==3.3 # via funmap numexpr==2.10.1 + # via blosc2 # via tables -numpy==1.24.4 +numpy==2.1.1 + # via blosc2 # via contourpy # via funmap # via h5py @@ -66,7 +68,7 @@ numpy==1.24.4 packaging==24.1 # via matplotlib # via tables -pandas==2.0.3 +pandas==2.2.3 # via funmap # via seaborn pillow==10.4.0 @@ -74,8 +76,9 @@ pillow==10.4.0 powerlaw==1.5 # via funmap py-cpuinfo==9.0.0 + # via blosc2 # via tables -pyarrow==13.0.0 +pyarrow==17.0.0 # via funmap pyparsing==3.1.4 # via matplotlib @@ -86,30 +89,32 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via funmap -scikit-learn==1.3.2 +scikit-learn==1.5.2 # via funmap # via imbalanced-learn -scipy==1.10.1 +scipy==1.14.1 # via funmap # via imbalanced-learn # via matplotlib-venn # via powerlaw # via scikit-learn # via xgboost -seaborn==0.13.0 +seaborn==0.13.2 # via funmap six==1.16.0 # via python-dateutil -tables==3.8.0 +tables==3.10.1 # via funmap threadpoolctl==3.5.0 # via imbalanced-learn # via scikit-learn -tqdm==4.66.1 +tqdm==4.66.5 # via funmap +typing-extensions==4.12.2 + # via tables tzdata==2024.2 # via pandas -xgboost==2.0.0 +xgboost==2.1.1 # via funmap From 29e8c14f7a31a15c52318074ad91f23d78645b94 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 30 Sep 2024 09:57:07 -0500 Subject: [PATCH 31/54] fix processing of extra_feature_df --- python/funmap/cli.py | 7 ++----- python/funmap/utils.py | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 7635dcfe..2fec1579 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -126,13 +126,10 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) extra_feature_file = cfg["extra_feature_file"] + extra_feature_df = None if extra_feature_file is not None: log.info("Loading extra feature file into dataframe") extra_feature_df = process_extra_feature(extra_feature_file) - else: - extra_feature_df = None - # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): - # return gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return @@ -367,7 +364,7 @@ def run(config_file, force_rerun): "feature_type": "cc", "gs_file": gs_file, # no extra feature for plotting - "extra_feature_df": None, + "extra_feature_df": extra_feature_df, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 533d52ca..7293ff30 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -582,3 +582,4 @@ def process_extra_feature(extra_feature_file) -> pd.DataFrame: extra_feature_df = extra_feature_df.drop_duplicates( subset=["P1", "P2"], keep="last" ) + return extra_feature_df From 
46c5fa84c9222f6882cb959872cbae30c0894c11 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 15:04:52 -0500 Subject: [PATCH 32/54] Add documentation and fix messages --- python/funmap/cli.py | 2 +- python/funmap/funmap.py | 2 +- src/lib.rs | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 2fec1579..498b6bde 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -152,7 +152,7 @@ def run(config_file, force_rerun): max_num_edges = cfg["max_num_edges"] step_size = cfg["step_size"] start_edge_num = cfg["start_edge_num"] - + only_extra_features = cfg["only_extra_features"] results_dir = Path(cfg["results_dir"]) saved_data_dir = results_dir / cfg["subdirs"]["saved_data_dir"] model_dir = results_dir / cfg["subdirs"]["saved_model_dir"] diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 78e0cf3c..50d5a37a 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -643,7 +643,7 @@ def dataset_llr( cur_llr_res["dataset"] = f + "_EXTRAFEAT" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - log.info(f"Calculating llr for {dataset} ... done") + log.info(f"Calculating llr for {f} ... done") return llr_ds diff --git a/src/lib.rs b/src/lib.rs index 69bb5572..a829e04d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,23 @@ use pyo3::prelude::*; +/// funmap_lib +/// Supports the addition of extra features in the format of gene pairs +/// +/// Files can reach up to 10 GB, so Rust is used to speed up and optimize the merging process. +/// +/// # Step 1: Identify all unique genes +/// +/// This step identifies all the unique genes found in both the data and extra-feature files. +/// +/// This allows for a universal order to genes that will be referenced in other steps +/// +/// Input: All data files and extra feature files +/// Output: .pkl file containing the unique genes found in alphabetical order (A before B) +/// +/// # Step 2: Align extra features +/// +/// WARN: May need to re-align the regular data files +/// For each extra feature file, re-index the rows according to the order of genes in Step 1. /// Formats the sum of two numbers as string. #[pyfunction] fn sum_as_string(a: usize, b: usize) -> PyResult { @@ -8,7 +26,7 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { /// A Python module implemented in Rust. #[pymodule] -#[pyo3(name = "_lib")] +#[pyo3(name = "_lib")] // module name is _lib. (imports as funmap._lib). This is to hide these functions from regular users. fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; Ok(()) From d34fc2c9966dd8b0b1770951ee0c4aabeeb65596 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 16:17:56 -0500 Subject: [PATCH 33/54] add unique gene identification --- Cargo.lock | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ded191fc..075d2231 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -18,9 +31,21 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" name = "funmap_lib" version = "0.1.0" dependencies = [ + "ahash", "pyo3", ] +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "heck" version = "0.5.0" @@ -169,3 +194,35 @@ name = "unindent" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index ecac49bc..b5dc393a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,4 +9,5 @@ name = "funmap" crate-type = ["cdylib"] [dependencies] +ahash = "0.8.11" pyo3 = "0.22.0" diff --git a/src/lib.rs b/src/lib.rs index a829e04d..06a84168 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,12 @@ +use ahash::{HashSet, HashSetExt}; use pyo3::prelude::*; +use std::{ + fs::File, + io::{BufRead, BufReader}, +}; /// funmap_lib +/// /// Supports the addition of extra features in the format of gene pairs /// /// Files can reach up to 10 GB, so Rust is used to speed up and optimize the merging process. @@ -17,7 +23,60 @@ use pyo3::prelude::*; /// # Step 2: Align extra features /// /// WARN: May need to re-align the regular data files +/// /// For each extra feature file, re-index the rows according to the order of genes in Step 1. 
+#[pyfunction] +fn process_files( + expression_paths: Vec, + extra_feature_paths: Vec, +) -> PyResult { + // Step 1: Identify all unique genes + // Across both expression and extra_feature_paths + // Create final unique_gene pkl file + + let mut uniq_gene = HashSet::new(); + + // Read expression data where first column is gene information; + for file_path in expression_paths.iter() { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 1 { + uniq_gene.insert(row[0].to_string()); // add gene to set + } + } + } + + // Read extra feature data, where first and second column are genes + for file_path in extra_feature_paths.iter() { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 2 { + uniq_gene.insert(row[0].to_string()); // add first gene to set + uniq_gene.insert(row[1].to_string()); // add second gene to set + } + } + } + Ok(true) +} + /// Formats the sum of two numbers as string. #[pyfunction] fn sum_as_string(a: usize, b: usize) -> PyResult { @@ -29,5 +88,6 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { #[pyo3(name = "_lib")] // module name is _lib. (imports as funmap._lib). This is to hide these functions from regular users. fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; + m.add_function(wrap_pyfunction!(process_files, m)?)?; Ok(()) } From 47dcd5e10c61a67f0a89d61986b915e7e1eb6411 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 16:55:16 -0500 Subject: [PATCH 34/54] add format error checking --- src/lib.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 06a84168..06fe4056 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use ahash::{HashSet, HashSetExt}; -use pyo3::prelude::*; +use pyo3::{exceptions::PyValueError, prelude::*}; use std::{ fs::File, io::{BufRead, BufReader}, @@ -29,6 +29,7 @@ use std::{ fn process_files( expression_paths: Vec, extra_feature_paths: Vec, + output_folder: String, ) -> PyResult { // Step 1: Identify all unique genes // Across both expression and extra_feature_paths @@ -46,6 +47,13 @@ fn process_files( if has_header { // skip header has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 2 { + return Err(PyValueError::new_err(format!( + "Expression data file at {} does not have enough columns.", + file_path + ))); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -65,6 +73,13 @@ fn process_files( if has_header { // skip header has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 3 { + return Err(PyValueError::new_err(format!( + "Extra feature file at {} does not have enough columns.", + file_path + ))); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -74,6 +89,9 @@ fn process_files( } } } + + // Save to pickle + Ok(true) } From ca2d1f615fe8eb5d13b8fdf49479f442e31d8aad Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 2 Oct 2024 11:13:23 -0500 Subject: [PATCH 35/54] save uniq_gene to pkl file in 
output_folder --- Cargo.lock | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 13 +++++++++- 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 075d2231..6150eee1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,6 +21,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cfg-if" version = "1.0.0" @@ -33,6 +39,7 @@ version = "0.1.0" dependencies = [ "ahash", "pyo3", + "serde-pickle", ] [[package]] @@ -58,6 +65,12 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +[[package]] +name = "iter-read" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4" + [[package]] name = "libc" version = "0.2.159" @@ -73,6 +86,34 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -166,6 +207,39 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "serde" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-pickle" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c762ad136a26407c6a80825813600ceeab5e613660d93d79a41f0ec877171e71" +dependencies = [ + "byteorder", + "iter-read", + "num-bigint", + "num-traits", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "syn" version = "2.0.77" diff --git a/Cargo.toml b/Cargo.toml index b5dc393a..80d83754 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,4 @@ crate-type = ["cdylib"] [dependencies] ahash = "0.8.11" pyo3 = "0.22.0" +serde-pickle = "1.1.1" diff --git a/src/lib.rs b/src/lib.rs index 06fe4056..9c4ad424 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,13 @@ use ahash::{HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; +use serde_pickle::SerOptions; use std::{ fs::File, io::{BufRead, BufReader}, + path::Path, }; -/// funmap_lib +/// process_files /// /// Supports the addition of extra 
features in the format of gene pairs /// @@ -91,7 +93,16 @@ fn process_files( } // Save to pickle + // TODO: Look at other file formats + let mut uniq_gene: Vec = uniq_gene.iter().cloned().collect(); + + // Sort genes alphabetically + uniq_gene.sort(); + let folder_path = Path::new(&output_folder); + let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); + let mut w = File::create(uniq_gene_file_path)?; + serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); Ok(true) } From 4e850bce85124ac189271faab2a7301b85d917c7 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 4 Oct 2024 14:36:23 -0500 Subject: [PATCH 36/54] start align_file function --- src/lib.rs | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9c4ad424..60a11da3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,9 +24,15 @@ use std::{ /// /// # Step 2: Align extra features /// -/// WARN: May need to re-align the regular data files -/// /// For each extra feature file, re-index the rows according to the order of genes in Step 1. +/// +/// Then save each column as a separate pkl file +/// +/// Input: unique genes from Step 1, extra feature files. +/// Output: One pkl file for each feature across all extra feature files +/// +/// Function +/// Output: list of all output feature pkl files #[pyfunction] fn process_files( expression_paths: Vec, @@ -104,6 +110,33 @@ fn process_files( let mut w = File::create(uniq_gene_file_path)?; serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); Ok(true) + + // Re-align each file +} + +fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { + // Read extra feature data, where first and second column are genes + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 3 { + return Err(PyValueError::new_err(format!( + "Extra feature file at {} does not have enough columns.", + file_path + ))); + } + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 2 {} + } + Ok(true) } /// Formats the sum of two numbers as string. 
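
Note on patches 36-40: align_file collapses an ordered gene pair into a single offset into the flattened upper triangle of an n x n matrix (n = number of unique genes) with i * n - i * (i - 1) / 2 + (j - i), the same formula used by get_1d_indices in python/funmap/funmap.py. The sketch below only illustrates that mapping and checks it against numpy's triu_indices ordering; the helper name pair_to_upper_index and the toy n = 5 are invented for the example and are not part of the patches.

    import numpy as np

    def pair_to_upper_index(i: int, j: int, n: int) -> int:
        # Order the pair so that i <= j, as align_file does before indexing.
        if i > j:
            i, j = j, i
        # Elements in rows 0..i-1 of the upper triangle (diagonal included),
        # plus the distance of j from the diagonal within row i.
        return i * n - i * (i - 1) // 2 + (j - i)

    # Check against numpy's row-major upper-triangle enumeration.
    n = 5
    rows, cols = np.triu_indices(n)
    for k, (i, j) in enumerate(zip(rows, cols)):
        assert pair_to_upper_index(int(i), int(j), n) == k

Because the correlation and mutual-rank matrices are stored as arr[np.triu_indices(...)] flattened arrays, an offset computed this way lets a pair's value be looked up without rebuilding the full matrix.
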
From 799f39b15493ed6236efae58a04297b71121b4d2 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 4 Oct 2024 14:38:39 -0500 Subject: [PATCH 37/54] fix rust compilation error --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 60a11da3..80ccaaca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -128,7 +128,7 @@ fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { if row.len() < 3 { return Err(PyValueError::new_err(format!( "Extra feature file at {} does not have enough columns.", - file_path + path ))); } continue; From c67e533a417073e12942fe243bb1ea6f9ff79a5e Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 7 Oct 2024 12:19:13 -0500 Subject: [PATCH 38/54] add code for calculating new_index for each feature file --- .gitignore | 1 + python/funmap/cli.py | 2 +- src/lib.rs | 27 ++++++++++++++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index eba216d4..4184eca0 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,4 @@ wheels/ # venv .venv +test_data diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 498b6bde..29a66390 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -88,7 +88,7 @@ def qc(config_file, force_rerun): @cli.command() def rust(): - print(f"sum: {funmap_lib.sum_as_string(5, 10)}") + funmap_lib.process_files(["test_data/dia.tsv"], [], "test_data") @cli.command(help="run funmap") diff --git a/src/lib.rs b/src/lib.rs index 80ccaaca..24566f35 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use ahash::{HashSet, HashSetExt}; +use ahash::{AHashMap, HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ @@ -109,16 +109,24 @@ fn process_files( let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); let mut w = File::create(uniq_gene_file_path)?; serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); - Ok(true) - + let n = uniq_gene.len(); // Re-align each file + // Create a HashMap to store the indices of each string + let mut gene_index_map: AHashMap<&String, usize> = AHashMap::new(); + for (index, gene) in uniq_gene.iter().enumerate() { + gene_index_map.insert(gene, index); + } + + // One column of indices, and one column of values. 
Separated by file + Ok(true) } -fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { +fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> PyResult { // Read extra feature data, where first and second column are genes let file = File::open(path)?; let reader = BufReader::new(file); let mut has_header = true; + let mut indices: Vec = Vec::new(); for line in reader.lines() { let line = line?; if has_header { @@ -134,7 +142,16 @@ fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { continue; } let row: Vec<&str> = line.split('\t').collect(); - if row.len() > 2 {} + if row.len() > 2 { + let index1 = uniq_gene.get(&row[0].to_string()).unwrap(); + let index2 = uniq_gene.get(&row[1].to_string()).unwrap(); + let (i, j) = if index1 <= index2 { + (index1, index2) + } else { + (index2, index1) + }; + let new_index = i * n - i * (i - 1) / 2 + (j - i); + } } Ok(true) } From 1af42a1560375fd16358428dc7bec3fb4dfac744 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 11:08:40 -0500 Subject: [PATCH 39/54] File creation --- python/funmap/cli.py | 4 +++- src/lib.rs | 32 +++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 29a66390..26f3d0bb 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -88,7 +88,9 @@ def qc(config_file, force_rerun): @cli.command() def rust(): - funmap_lib.process_files(["test_data/dia.tsv"], [], "test_data") + funmap_lib.process_files( + ["test_data/dia.tsv"], ["test_data/all_no_methyl.tsv"], "test_data" + ) @cli.command(help="run funmap") diff --git a/src/lib.rs b/src/lib.rs index 24566f35..606025d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,7 @@ use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ fs::File, - io::{BufRead, BufReader}, + io::{BufRead, BufReader, BufWriter, Write}, path::Path, }; @@ -117,16 +117,26 @@ fn process_files( gene_index_map.insert(gene, index); } + for file_path in extra_feature_paths { + align_file(&file_path, &gene_index_map, n as i32, folder_path)?; + } // One column of indices, and one column of values. 
Separated by file Ok(true) } -fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> PyResult { +fn align_file( + path: &String, + uniq_gene: &AHashMap<&String, usize>, + n: i32, + output_folder: &Path, +) -> PyResult { // Read extra feature data, where first and second column are genes let file = File::open(path)?; let reader = BufReader::new(file); let mut has_header = true; let mut indices: Vec = Vec::new(); + let mut feature_count = 0; + let mut writers = Vec::new(); for line in reader.lines() { let line = line?; if has_header { @@ -139,6 +149,13 @@ fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> path ))); } + feature_count = row.len() - 2; + for i in 0..feature_count { + let file_path = output_folder.join(format!("{}.col", row[i + 2])); + let f = File::create(file_path)?; + let bf = BufWriter::new(f); + writers.push(bf); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -146,11 +163,16 @@ fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> let index1 = uniq_gene.get(&row[0].to_string()).unwrap(); let index2 = uniq_gene.get(&row[1].to_string()).unwrap(); let (i, j) = if index1 <= index2 { - (index1, index2) + (*index1 as i32, *index2 as i32) } else { - (index2, index1) + (*index2 as i32, *index1 as i32) }; - let new_index = i * n - i * (i - 1) / 2 + (j - i); + let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize; + indices.push(new_index); + for i in 0..feature_count { + writers[i].write_all(row[i + 2].as_bytes())?; + writers[i].write_all(b"\n")?; + } } } Ok(true) From 6de0073b95a13241ddf2800684b7c2e540429fb1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 15:36:17 -0500 Subject: [PATCH 40/54] create index file --- src/lib.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 606025d7..a2954c01 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,6 +137,12 @@ fn align_file( let mut indices: Vec = Vec::new(); let mut feature_count = 0; let mut writers = Vec::new(); + let original_file = Path::new(path).file_name().unwrap().to_str().unwrap(); + let index_file_path = output_folder.join(format!("{}.index", original_file)); + let f = File::create(index_file_path)?; + let bf = BufWriter::new(f); + writers.push(bf); + for line in reader.lines() { let line = line?; if has_header { @@ -169,9 +175,11 @@ fn align_file( }; let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize; indices.push(new_index); + writers[0].write_all(new_index.to_string().as_bytes())?; + writers[0].write_all(b"\n")?; for i in 0..feature_count { - writers[i].write_all(row[i + 2].as_bytes())?; - writers[i].write_all(b"\n")?; + writers[i + 1].write_all(row[i + 2].as_bytes())?; + writers[i + 1].write_all(b"\n")?; } } } From 289185d233b2df1ca97ac32529644adf38b2bbe5 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 16:29:10 -0500 Subject: [PATCH 41/54] Remove unnecessary file read --- python/funmap/funmap.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 50d5a37a..14b02454 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -540,8 +540,6 @@ def get_1d_indices(i_array, j_array, n): # Calculate 1D indices return i * n - i * (i - 1) // 2 + (j - i) - with h5py.File(feature_file, "r") as h5_file: - gene_ids = h5_file["ids"][:] df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) with h5py.File(feature_file, "r") as h5_file: gene_ids = 
h5_file["ids"][:] From a49d5ecf520af0c3776fa1f3c8b49b1d8eb40345 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:09:30 -0500 Subject: [PATCH 42/54] Use new feature_df creation --- python/funmap/cli.py | 22 +++++++++++++++++----- python/funmap/funmap.py | 30 +++++++++++++++++++++--------- python/funmap/utils.py | 36 +++++++++++++++++++++++++++++++++++- src/lib.rs | 15 ++++++++++++--- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 26f3d0bb..29f50c77 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -6,8 +6,9 @@ import click import numpy as np import pandas as pd -from funmap import _lib as funmap_lib + from funmap import __version__ +from funmap import _lib as funmap_lib from funmap.data_urls import misc_urls as urls from funmap.funmap import ( compute_features, @@ -88,8 +89,14 @@ def qc(config_file, force_rerun): @cli.command() def rust(): + mapping = pd.read_csv(urls["mapping_file"], sep="\t") + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) funmap_lib.process_files( - ["test_data/dia.tsv"], ["test_data/all_no_methyl.tsv"], "test_data" + ["test_data/dia.tsv"], + ["test_data/all_no_methyl.tsv"], + "test_data", + coding_genes, ) @@ -127,11 +134,12 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg["extra_feature_file"] + extra_feature_file = cfg["extra_feature_folder"] extra_feature_df = None - if extra_feature_file is not None: + uniq_gene = None + if extra_feature_folder is not None: log.info("Loading extra feature file into dataframe") - extra_feature_df = process_extra_feature(extra_feature_file) + (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_file) gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return @@ -195,6 +203,10 @@ def run(config_file, force_rerun): cc_dict, mr_dict, all_valid_ids = compute_features( cfg, feature_type, min_sample_count, saved_data_dir ) + + if uniq_gene is not None: + all_valid_ids = uniq_gene + gs_args = { "task": task, "saved_data_dir": saved_data_dir, diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 14b02454..fecb1108 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -286,7 +286,13 @@ def get_1d_indices(i_array, j_array, n): def extract_features( - df, feature_type, cc_dict, ppi_feature=None, extra_feature_df=None, mr_dict=None + df, + feature_type, + cc_dict, + ppi_feature=None, + extra_feature_df=None, + mr_dict=None, + chunk_data: tuple[int, int] | None = None, ): """ extract_features - creates the final feature `pandas` dataframe used by xgboost @@ -323,12 +329,17 @@ def extract_features( # TODO: add extra features if provided if extra_feature_df is not None: - df.reset_index(drop=True, inplace=True) - merged_df = pd.merge( - df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" - ) - merged_df = merged_df.drop(columns=["P1", "P2"]) - feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) + if chunk_data is not None: + (start_idx, chunk_size) = chunk_data + feature_df = pd.concat( + [feature_df, extra_feature_df[start_idx : start_idx + chunk_size]], + axis=1, + ) + else: + feature_df = pd.concat( + [feature_df, extra_feature_df], + axis=1, + ) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = 
feature_df[ @@ -513,10 +524,10 @@ def prepare_gs_data(**kwargs): else: ppi_feature = None gs_train_df = extract_features( - gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict, None ) gs_test_df = extract_features( - gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict, None ) # store both the ids with gs_test_df for later use @@ -754,6 +765,7 @@ def process_and_save_chunk(start_idx, chunk_id): cur_ppi_feature, extra_feature_df, mr_dict, + (start_idx, chunk_size), ) predictions = model.predict_proba(feature_df) prediction_df = pd.DataFrame(predictions[:, 1], columns=["prediction"]) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 7293ff30..b296601f 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -7,7 +7,8 @@ import urllib from pathlib import Path from urllib.parse import urlparse - +import glob +import pickle import pandas as pd import yaml @@ -568,6 +569,39 @@ def check_extra_feature_file(file_path, missing_value="NA"): return True +def read_files_to_dataframe(file_paths): + # Read each file into a DataFrame and concatenate + dataframes = [] + for file_path in file_paths: + # Read file into DataFrame; each line becomes a row in a single column + df = pd.read_csv(file_path, header=None, names=["data"]) + dataframes.append(df) + + # Concatenate all DataFrames into one + final_df = pd.concat(dataframes, ignore_index=True) + return final_df + + +def reorder_dataframe(final_df, index_file_path): + # Read the index file + with open(index_file_path, "r") as f: + indices = [int(line.strip()) for line in f.readlines()] + + # Reorder the DataFrame using the indices + reordered_df = final_df.iloc[indices].reset_index(drop=True) + return reordered_df + + +def new_extra_feature(extra_feature_folder): + index_file = list(glob.glob(f"{extra_feature_folder}/*.index"))[0] + features = list(glob.glob(f"{extra_feature_folder}/*.col")) + with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: + uniq_gene = pickle.load(r) + df = read_files_to_dataframe(features) + df = read_files_to_dataframe(df, index_file) + return (uniq_gene, index_file) + + def process_extra_feature(extra_feature_file) -> pd.DataFrame: extra_feature_df = pd.read_csv(extra_feature_file, sep="\t") extra_feature_df.columns.values[0] = "P1" diff --git a/src/lib.rs b/src/lib.rs index a2954c01..ecd46da7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use ahash::{AHashMap, HashSet, HashSetExt}; +use ahash::{AHashMap, AHashSet, HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ @@ -33,11 +33,15 @@ use std::{ /// /// Function /// Output: list of all output feature pkl files +/// +/// TODO: Protein-coding gene filtering #[pyfunction] +#[pyo3(signature = (expression_paths, extra_feature_paths, output_folder, valid_ids=None))] fn process_files( expression_paths: Vec, extra_feature_paths: Vec, output_folder: String, + valid_ids: Option>, ) -> PyResult { // Step 1: Identify all unique genes // Across both expression and extra_feature_paths @@ -98,11 +102,16 @@ fn process_files( } } + let mut uniq_gene: Vec = if let Some(valid_ids) = valid_ids { + let valid_ids = AHashSet::from_iter(valid_ids); + uniq_gene.union(&valid_ids).cloned().collect() + } else { + uniq_gene.iter().cloned().collect() + }; + // Save to pickle // TODO: Look at other file formats - let 
mut uniq_gene: Vec = uniq_gene.iter().cloned().collect(); - // Sort genes alphabetically uniq_gene.sort(); let folder_path = Path::new(&output_folder); From 3cab234448bd9bbe14f647681f894e8509b30e05 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:12:14 -0500 Subject: [PATCH 43/54] temporarily hard code paths --- python/funmap/cli.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 29f50c77..95b1338e 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -93,8 +93,13 @@ def rust(): coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] coding_genes = list(set(coding["gene_name"].to_list())) funmap_lib.process_files( - ["test_data/dia.tsv"], - ["test_data/all_no_methyl.tsv"], + [ + "dummy/dia.tsv", + "dummy/methyl.tsv", + "dummy/RNAseq.tsv", + "dummy/tmt_abundance", + ], + ["dummy/all_no_methyl.tsv"], "test_data", coding_genes, ) From 457a0b11b66a9013e7f8d6fe803422024e1ee872 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:26:27 -0500 Subject: [PATCH 44/54] better error messaging --- src/lib.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ecd46da7..fa4a87e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,7 +51,7 @@ fn process_files( // Read expression data where first column is gene information; for file_path in expression_paths.iter() { - let file = File::open(file_path)?; + let file = File::open(file_path).expect("Could not read expression data"); let reader = BufReader::new(file); let mut has_header = true; for line in reader.lines() { @@ -77,7 +77,7 @@ fn process_files( // Read extra feature data, where first and second column are genes for file_path in extra_feature_paths.iter() { - let file = File::open(file_path)?; + let file = File::open(file_path).expect("Could not read extra feature file"); let reader = BufReader::new(file); let mut has_header = true; for line in reader.lines() { @@ -116,7 +116,7 @@ fn process_files( uniq_gene.sort(); let folder_path = Path::new(&output_folder); let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); - let mut w = File::create(uniq_gene_file_path)?; + let mut w = File::create(uniq_gene_file_path).expect("Could not cread uniq_gene.pkl"); serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); let n = uniq_gene.len(); // Re-align each file @@ -127,7 +127,8 @@ fn process_files( } for file_path in extra_feature_paths { - align_file(&file_path, &gene_index_map, n as i32, folder_path)?; + align_file(&file_path, &gene_index_map, n as i32, folder_path) + .expect("Error aligning file"); } // One column of indices, and one column of values. 
Separated by file Ok(true) From a4f0bc2763ef4800c8da850e4fa64ffb9ec6d1e0 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:32:11 -0500 Subject: [PATCH 45/54] add extra_feature_folder to config creation --- python/funmap/cli.py | 2 +- python/funmap/utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 95b1338e..745e02f9 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -97,7 +97,7 @@ def rust(): "dummy/dia.tsv", "dummy/methyl.tsv", "dummy/RNAseq.tsv", - "dummy/tmt_abundance", + "dummy/tmt_abundance.tsv", ], ["dummy/all_no_methyl.tsv"], "test_data", diff --git a/python/funmap/utils.py b/python/funmap/utils.py index b296601f..2babc5b3 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -345,6 +345,7 @@ def get_config(cfg_file: Path): "step_size": 1000, "lr_cutoff": 50, "only_extra_features": False, + "extra_feature_folder": None, } with open(cfg_file, "r") as fh: @@ -370,6 +371,8 @@ def get_config(cfg_file: Path): if "extra_feature_file" in cfg_dict: cfg["extra_feature_file"] = cfg_dict["extra_feature_file"] + if "extra_feature_folder" in cfg_dict: + cfg["extra_feature_folder"] = cfg_dict["extra_feature_folder"] if "gs_file" in cfg_dict: cfg["gs_file"] = cfg_dict["gs_file"] From 0a478d354280e2f5868ef590742b2f68c29ab94d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:33:35 -0500 Subject: [PATCH 46/54] rename variables --- python/funmap/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 745e02f9..49abc19a 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -139,12 +139,12 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg["extra_feature_folder"] + extra_feature_foler = cfg["extra_feature_folder"] extra_feature_df = None uniq_gene = None if extra_feature_folder is not None: log.info("Loading extra feature file into dataframe") - (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_file) + (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_folder) gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return From 7fa7c0beb4020603a748667e0898a540ddc82c8c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:33:49 -0500 Subject: [PATCH 47/54] fix naming --- python/funmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 49abc19a..53060df4 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -139,7 +139,7 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_foler = cfg["extra_feature_folder"] + extra_feature_folder = cfg["extra_feature_folder"] extra_feature_df = None uniq_gene = None if extra_feature_folder is not None: From ac7eb2faee99de811901a335ac79f352127e3e9c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:34:45 -0500 Subject: [PATCH 48/54] add missing imports --- python/funmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 53060df4..9b762b24 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -27,7 +27,7 @@ check_extra_feature_file, check_gold_standard_file, cleanup_experiment, - process_extra_feature, + new_extra_feature, 
setup_experiment, ) From 36ca663e9bc7be6554b3b97fcbe631fad34cf469 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:37:09 -0500 Subject: [PATCH 49/54] fix function call --- python/funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 2babc5b3..1f77ce27 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -601,7 +601,7 @@ def new_extra_feature(extra_feature_folder): with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: uniq_gene = pickle.load(r) df = read_files_to_dataframe(features) - df = read_files_to_dataframe(df, index_file) + df = reorder_dataframe(df, index_file) return (uniq_gene, index_file) From 61a80a0be1ed669f1b32d5a659a8462f9e5a9bc0 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:47:31 -0500 Subject: [PATCH 50/54] fix reordering dataframe --- python/funmap/utils.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 1f77ce27..2019cad6 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -585,13 +585,21 @@ def read_files_to_dataframe(file_paths): return final_df -def reorder_dataframe(final_df, index_file_path): +def reorder_dataframe(final_df, index_file_path, max_length): + reordered_df = pd.DataFrame(index=range(max_length), columns=final_df.columns) # Read the index file with open(index_file_path, "r") as f: indices = [int(line.strip()) for line in f.readlines()] - # Reorder the DataFrame using the indices - reordered_df = final_df.iloc[indices].reset_index(drop=True) + # Ensure the length of the indices is less than or equal to max_length + if len(indices) > max_length: + raise ValueError("More indices provided than the max_length.") + + # Fill in the reordered DataFrame based on the provided indices + for i, index in enumerate(indices): + if index < max_length: + reordered_df.iloc[index] = final_df.iloc[i] + return reordered_df @@ -600,8 +608,12 @@ def new_extra_feature(extra_feature_folder): features = list(glob.glob(f"{extra_feature_folder}/*.col")) with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: uniq_gene = pickle.load(r) + i = j = len(uniq_gene) - 1 + n = i + 1 + max_len = i * n - i * (i - 1) // 2 + (j - i) + df = read_files_to_dataframe(features) - df = reorder_dataframe(df, index_file) + df = reorder_dataframe(df, index_file, max_len) return (uniq_gene, index_file) From 8279aecb3cf822530c6a9b14394e8ea3bebe0a01 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:48:06 -0500 Subject: [PATCH 51/54] logging --- python/funmap/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 2019cad6..3ba69fcf 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -613,6 +613,7 @@ def new_extra_feature(extra_feature_folder): max_len = i * n - i * (i - 1) // 2 + (j - i) df = read_files_to_dataframe(features) + log.info(df.head()) df = reorder_dataframe(df, index_file, max_len) return (uniq_gene, index_file) From a177b404cc1e412d2aee2d61b99771213a118dc1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:50:21 -0500 Subject: [PATCH 52/54] fix column names --- python/funmap/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 3ba69fcf..ba3a9e2f 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -577,7 
+577,11 @@ def read_files_to_dataframe(file_paths): dataframes = [] for file_path in file_paths: # Read file into DataFrame; each line becomes a row in a single column - df = pd.read_csv(file_path, header=None, names=["data"]) + df = pd.read_csv( + file_path, + header=None, + names=[os.path.basename(file_path).replace(".col", "")], + ) dataframes.append(df) # Concatenate all DataFrames into one From e90285d4a6fab858f0bcd0f62a193b7afbe86a24 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 15:35:32 -0500 Subject: [PATCH 53/54] fix return type --- python/funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index ba3a9e2f..9a843796 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -619,7 +619,7 @@ def new_extra_feature(extra_feature_folder): df = read_files_to_dataframe(features) log.info(df.head()) df = reorder_dataframe(df, index_file, max_len) - return (uniq_gene, index_file) + return (uniq_gene, df) def process_extra_feature(extra_feature_file) -> pd.DataFrame: From 4be7af8692ce8ad2cbf700a2b014085c39851e6d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 11 Oct 2024 12:06:58 -0500 Subject: [PATCH 54/54] build sqlite db for extra_features --- Cargo.lock | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + src/lib.rs | 142 +++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 263 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6150eee1..aa9638c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,24 +21,74 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "cc" +version = "1.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" +dependencies = [ + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "funmap_lib" version = "0.1.0" 
 dependencies = [
  "ahash",
+ "csv",
  "pyo3",
+ "rusqlite",
  "serde-pickle",
 ]
@@ -53,6 +103,24 @@ dependencies = [
  "wasi",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "hashlink"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
+dependencies = [
+ "hashbrown",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -71,12 +139,35 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4"
 
+[[package]]
+name = "itoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+
 [[package]]
 name = "libc"
 version = "0.2.159"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
 
+[[package]]
+name = "libsqlite3-sys"
+version = "0.30.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
+dependencies = [
+ "cc",
+ "pkg-config",
+ "vcpkg",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
 [[package]]
 name = "memoffset"
 version = "0.9.1"
@@ -120,6 +211,12 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "pkg-config"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
+
 [[package]]
 name = "portable-atomic"
 version = "1.8.0"
@@ -207,6 +304,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rusqlite"
+version = "0.32.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
+dependencies = [
+ "bitflags",
+ "fallible-iterator",
+ "fallible-streaming-iterator",
+ "hashlink",
+ "libsqlite3-sys",
+ "smallvec",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
 [[package]]
 name = "serde"
 version = "1.0.210"
@@ -240,6 +357,18 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+
 [[package]]
 name = "syn"
 version = "2.0.77"
@@ -269,6 +398,12 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
 
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
 [[package]]
 name = "version_check"
 version = "0.9.5"
diff --git a/Cargo.toml b/Cargo.toml
index 80d83754..e3c902b7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,5 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 ahash = "0.8.11"
+csv = "1.3.0"
 pyo3 = "0.22.0"
+rusqlite = { version = "0.32.1", features = ["bundled"] }
 serde-pickle = "1.1.1"
diff --git a/src/lib.rs b/src/lib.rs
index fa4a87e5..14090ecb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 use ahash::{AHashMap, AHashSet, HashSet, HashSetExt};
+use csv::ReaderBuilder;
 use pyo3::{exceptions::PyValueError, prelude::*};
+use rusqlite::{params, params_from_iter, Connection, Result};
 use serde_pickle::SerOptions;
 use std::{
     fs::File,
@@ -127,12 +129,118 @@ fn process_files(
     }
 
     for file_path in extra_feature_paths {
-        align_file(&file_path, &gene_index_map, n as i32, folder_path)
-            .expect("Error aligning file");
+        new(&file_path, &gene_index_map, n as i32, folder_path).expect("Error aligning file");
     }
     // One column of indices, and one column of values. Separated by file
     Ok(true)
 }
+// Function to parse a string into a float, returning None for invalid or NaN values
+fn safe_parse_float(s: &str) -> Option<f64> {
+    match s.parse::<f64>() {
+        Ok(f) if f.is_nan() => None, // Explicitly keep NaNs as None
+        Ok(f) => Some(f),
+        Err(_) => None, // If it can't be parsed, treat as None (null)
+    }
+}
+fn new(
+    path: &String,
+    uniq_gene: &AHashMap<&String, usize>,
+    n: i32,
+    output_folder: &Path,
+) -> PyResult<bool> {
+    // Create SQLite connection
+    let conn = Connection::open("db.sqlite").unwrap();
+
+    // Open the TSV file
+    let file_path = path;
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .from_path(file_path)
+        .unwrap();
+
+    // Get the headers to determine feature names dynamically
+    let headers = rdr.headers().unwrap().clone();
+
+    // Features are all columns after the first two columns
+    let feature_names: Vec<&str> = headers
+        .iter()
+        .skip(2) // Skip the first two columns
+        .collect();
+
+    // Dynamically create the SQL table with the appropriate number of features, all as FLOAT
+    let feature_columns: Vec<String> = feature_names
+        .iter()
+        .map(|name| format!("{} FLOAT", name))
+        .collect();
+
+    let create_table_query = format!(
+        "CREATE TABLE IF NOT EXISTS gene_data (
+            index_id TEXT PRIMARY KEY,
+            {}
+        )",
+        feature_columns.join(", ")
+    );
+    conn.execute(&create_table_query, []).unwrap();
+
+    // Prepare to write features to a separate file (feature names only)
+    let feature_file = File::create("features.txt")?;
+    let mut feature_writer = BufWriter::new(feature_file);
+
+    // Write feature names to the file
+    for feature in &feature_names {
+        writeln!(feature_writer, "{}", feature)?;
+    }
+
+    // Read each record from TSV and insert it into the database
+    for result in rdr.records() {
+        let record = result.unwrap();
+        let column1 = &record[0];
+        let column2 = &record[1];
+        let index1 = uniq_gene.get(&column1.to_string());
+        let index2 = uniq_gene.get(&column2.to_string());
+        if let (Some(index1), Some(index2)) = (index1, index2) {
+            let (i, j) = if index1 <= index2 {
+                (*index1 as i32, *index2 as i32)
+            } else {
+                (*index2 as i32, *index1 as i32)
+            };
+            let index_id = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
+
+            // Extract feature values, converting them to Option<f64> to handle nulls/NaNs
+            let feature_values: Vec<Option<f64>> = record
+                .iter()
+                .skip(2) // Skip the first two columns
+                .map(safe_parse_float) // Handle nulls and invalid values
+                .collect();
+
+            // Insert into SQLite database using dynamic query
+            let insert_query = format!(
+                "INSERT OR REPLACE INTO gene_data (index_id, {})
+                VALUES (?1, {})",
+                feature_names.join(", "),
+                feature_values
+                    .iter()
+                    .enumerate()
+                    .map(|(i, _)| format!("?{}", i + 2))
+                    .collect::<Vec<String>>()
+                    .join(", ")
+            );
+
+            // Collect the parameters for the query, using None for null/NaN values
+            let mut params_vec: Vec<&(dyn rusqlite::ToSql + Sync)> = vec![&index_id];
+            for val in &feature_values {
+                match val {
+                    Some(v) => params_vec.push(v),
+                    None => params_vec.push(&rusqlite::types::Null),
+                }
+            }
+
+            conn.execute(&insert_query, params_from_iter(params_vec.iter()))
+                .unwrap();
+        }
+    }
+    Ok(true)
+}
 
 fn align_file(
     path: &String,
@@ -176,20 +284,22 @@ fn align_file(
         }
         let row: Vec<&str> = line.split('\t').collect();
         if row.len() > 2 {
-            let index1 = uniq_gene.get(&row[0].to_string()).unwrap();
-            let index2 = uniq_gene.get(&row[1].to_string()).unwrap();
-            let (i, j) = if index1 <= index2 {
-                (*index1 as i32, *index2 as i32)
-            } else {
-                (*index2 as i32, *index1 as i32)
-            };
-            let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
-            indices.push(new_index);
-            writers[0].write_all(new_index.to_string().as_bytes())?;
-            writers[0].write_all(b"\n")?;
-            for i in 0..feature_count {
-                writers[i + 1].write_all(row[i + 2].as_bytes())?;
-                writers[i + 1].write_all(b"\n")?;
+            let index1 = uniq_gene.get(&row[0].to_string());
+            let index2 = uniq_gene.get(&row[1].to_string());
+            if let (Some(index1), Some(index2)) = (index1, index2) {
+                let (i, j) = if index1 <= index2 {
+                    (*index1 as i32, *index2 as i32)
+                } else {
+                    (*index2 as i32, *index1 as i32)
+                };
+                let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
+                indices.push(new_index);
+                writers[0].write_all(new_index.to_string().as_bytes())?;
+                writers[0].write_all(b"\n")?;
+                for i in 0..feature_count {
+                    writers[i + 1].write_all(row[i + 2].as_bytes())?;
+                    writers[i + 1].write_all(b"\n")?;
+                }
             }
         }
     }
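Both the Python max_len computation in patch 50 and the Rust index_id above rely on the same condensed indexing of gene pairs. A minimal Python sketch of that mapping follows; the helper names pair_index and total_pairs are illustrative only and do not appear in the repository. Note that with i = j = n - 1 the expression evaluates to n * (n + 1) // 2 - 1, the largest index, so a buffer meant to hold every pair needs n * (n + 1) // 2 rows, one more than that value.

# Sketch only: condensed pair index assumed by the patches above.
def pair_index(i: int, j: int, n: int) -> int:
    """Map a gene pair (i, j) with 0 <= i <= j < n to a linear index over the
    upper triangle of an n x n matrix, diagonal included, enumerated row by row."""
    if i > j:
        i, j = j, i
    return i * n - i * (i - 1) // 2 + (j - i)

def total_pairs(n: int) -> int:
    # One more than the largest index, pair_index(n - 1, n - 1, n).
    return n * (n + 1) // 2

# Quick self-check for a small n: every pair maps to a distinct in-range index.
n = 5
indices = [pair_index(i, j, n) for i in range(n) for j in range(i, n)]
assert sorted(indices) == list(range(total_pairs(n)))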
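The rusqlite insertion above builds its INSERT statement dynamically, with one ? placeholder per feature column, and binds rusqlite::types::Null wherever safe_parse_float returned None. The Python fragment below, using only the standard sqlite3 module, is a rough equivalent for inspecting or reproducing that behavior; the gene_data table and db.sqlite file name follow the patch, while the insert_row helper and the column names f1 and f2 are assumptions made for illustration.

import math
import sqlite3

def insert_row(conn, index_id, feature_names, values):
    # NaN and missing entries become None, which sqlite3 binds as SQL NULL.
    cleaned = [None if v is None or math.isnan(v) else v for v in values]
    placeholders = ", ".join("?" for _ in feature_names)
    sql = (
        f"INSERT OR REPLACE INTO gene_data (index_id, {', '.join(feature_names)}) "
        f"VALUES (?, {placeholders})"
    )
    conn.execute(sql, [index_id, *cleaned])

conn = sqlite3.connect("db.sqlite")  # same database file name the Rust code opens
conn.execute(
    "CREATE TABLE IF NOT EXISTS gene_data (index_id TEXT PRIMARY KEY, f1 FLOAT, f2 FLOAT)"
)
insert_row(conn, 7, ["f1", "f2"], [0.42, float("nan")])  # the NaN is stored as NULL
conn.commit()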