From 1b572806d3d95fa066b7a1425c55ab92d7face17 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 13 Sep 2024 11:05:38 -0500 Subject: [PATCH 01/54] style: formatting changes --- funmap/funmap.py | 469 +++++++++++++++++++++++++++++------------------ 1 file changed, 291 insertions(+), 178 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 65b6075d..01aa93cb 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -25,27 +25,32 @@ log = setup_logger(__name__) + def get_valid_gs_data(gs_path: str, valid_gene_list: List[str], md5=None): log.info(f'Loading gold standard feature file "{gs_path}" ...') if is_url_scheme(gs_path): - gs_edge_df = read_csv_with_md5_check(gs_path, expected_md5=md5, - local_path='download_gs_file', sep='\t') + gs_edge_df = read_csv_with_md5_check( + gs_path, expected_md5=md5, local_path="download_gs_file", sep="\t" + ) if gs_edge_df is None: - raise ValueError('Failed to download gold standard file') + raise ValueError("Failed to download gold standard file") else: - gs_edge_df = pd.read_csv(gs_path, sep='\t') - - log.info('Done loading gold standard feature file') - gs_edge_df = gs_edge_df.rename(columns={gs_edge_df.columns[0]: 'P1', - gs_edge_df.columns[1]: 'P2'}) - gs_edge_df = gs_edge_df[gs_edge_df['P1'].isin(valid_gene_list) & - gs_edge_df['P2'].isin(valid_gene_list) & - (gs_edge_df['P1'] != gs_edge_df['P2'])] - m = ~pd.DataFrame(np.sort(gs_edge_df[['P1','P2']], axis=1)).duplicated() + gs_edge_df = pd.read_csv(gs_path, sep="\t") + + log.info("Done loading gold standard feature file") + gs_edge_df = gs_edge_df.rename( + columns={gs_edge_df.columns[0]: "P1", gs_edge_df.columns[1]: "P2"} + ) + gs_edge_df = gs_edge_df[ + gs_edge_df["P1"].isin(valid_gene_list) + & gs_edge_df["P2"].isin(valid_gene_list) + & (gs_edge_df["P1"] != gs_edge_df["P2"]) + ] + m = ~pd.DataFrame(np.sort(gs_edge_df[["P1", "P2"]], axis=1)).duplicated() gs_edge_df = gs_edge_df[list(m)] gs_edge_df.reset_index(drop=True, inplace=True) # rename the last column name to 'label' - gs_edge_df.rename(columns={gs_edge_df.columns[-1]: 'label'}, inplace=True) + gs_edge_df.rename(columns={gs_edge_df.columns[-1]: "label"}, inplace=True) return gs_edge_df @@ -108,16 +113,22 @@ def pairwise_mutual_rank(pcc_matrix): valid_a = ~np.isnan(pcc_matrix) valid_b = valid_a.T - rank_ab = np.argsort(pcc_matrix, axis=1).argsort(axis=1, kind='mergesort') + 1 # Start ranks from 1 - rank_ba = np.argsort(pcc_matrix, axis=0).argsort(axis=0, kind='mergesort') + 1 # Start ranks from 1 + rank_ab = ( + np.argsort(pcc_matrix, axis=1).argsort(axis=1, kind="mergesort") + 1 + ) # Start ranks from 1 + rank_ba = ( + np.argsort(pcc_matrix, axis=0).argsort(axis=0, kind="mergesort") + 1 + ) # Start ranks from 1 n_a = np.sum(valid_a, axis=1) n_b = np.sum(valid_b, axis=0) valid_indices_a, valid_indices_b = np.where(valid_a) - mr_values = np.sqrt((rank_ab[valid_indices_a, valid_indices_b] / n_b[valid_indices_b]) * - (rank_ba[valid_indices_a, valid_indices_b] / n_a[valid_indices_a])) + mr_values = np.sqrt( + (rank_ab[valid_indices_a, valid_indices_b] / n_b[valid_indices_b]) + * (rank_ba[valid_indices_a, valid_indices_b] / n_a[valid_indices_a]) + ) mr_matrix = np.full_like(pcc_matrix, np.nan) mr_matrix[valid_indices_a, valid_indices_b] = mr_values @@ -126,26 +137,25 @@ def pairwise_mutual_rank(pcc_matrix): def compute_features(cfg, feature_type, min_sample_count, output_dir): - """Compute the pearson correlation coefficient for each edge in the list of edges and for each - """ + """Compute the pearson correlation coefficient 
for each edge in the list of edges and for each""" data_dict, all_valid_ids = get_data_dict(cfg, min_sample_count) cc_dict = {} for i in data_dict: - cc_file = os.path.join(output_dir, f'cc_{i}.h5') + cc_file = os.path.join(output_dir, f"cc_{i}.h5") cc_dict[i] = cc_file mr_dict = {} for i in data_dict: - mr_file = os.path.join(output_dir, f'mr_{i}.h5') + mr_file = os.path.join(output_dir, f"mr_{i}.h5") mr_dict[i] = mr_file all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) if all_cc_exist: log.info("All cc files exist. Skipping feature computation.") - if feature_type == 'cc': + if feature_type == "cc": return cc_dict, mr_dict, all_valid_ids - if feature_type == 'mr': + if feature_type == "mr": all_mr_exist = all(os.path.exists(file_path) for file_path in mr_dict.values()) if all_cc_exist and all_mr_exist: log.info("All mr files exist. Skipping feature computation.") @@ -159,24 +169,24 @@ def compute_features(cfg, feature_type, min_sample_count, output_dir): log.info(f"Computing pearson correlation coefficient matrix for {i}") x = data_dict[i].values.astype(np.float32) df = pd.DataFrame(x) - corr_matrix = df.corr(method='pearson', min_periods=min_sample_count) + corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) arr = corr_matrix.values upper_indices = np.triu_indices(arr.shape[0]) - with h5py.File(cc_dict[i], 'w') as hf: + with h5py.File(cc_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset('cc', data=arr[upper_indices]) - hf.create_dataset('ids', data=data_dict[i].columns.values.astype('S')) + hf.create_dataset("cc", data=arr[upper_indices]) + hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) cc_dict[i] = cc_file # compute pairwise mutual rank features - if feature_type == 'mr': + if feature_type == "mr": log.info(f"Computing mutual rank matrix for {i}") arr_mr = pairwise_mutual_rank(arr) upper_indices = np.triu_indices(arr_mr.shape[0]) - with h5py.File(mr_dict[i], 'w') as hf: + with h5py.File(mr_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset('mr', data=arr_mr[upper_indices]) - hf.create_dataset('ids', data=data_dict[i].columns.values.astype('S')) + hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) return cc_dict, mr_dict, all_valid_ids @@ -193,37 +203,54 @@ def balance_classes(df, random_state=42): minority_class = class_0 if len(class_0) < len(class_1) else class_1 majority_class = class_1 if minority_class is class_0 else class_0 - majority_class_undersampled = resample(majority_class, replace=False, - n_samples=len(minority_class), random_state=random_state) + majority_class_undersampled = resample( + majority_class, + replace=False, + n_samples=len(minority_class), + random_state=random_state, + ) balanced_df = pd.concat([minority_class, majority_class_undersampled]) # Shuffle the rows in the balanced DataFrame - balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True) + balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index( + drop=True + ) return balanced_df -def assemble_feature_df(h5_file_mapping, df, dataset='cc'): + +def assemble_feature_df(h5_file_mapping, df, dataset="cc"): df.reset_index(drop=True, inplace=True) # Initialize feature_df with columns for HDF5 file keys and 'label' file_keys = list(h5_file_mapping.keys()) - feature_df = pd.DataFrame(columns=file_keys + ['label']) + feature_df = 
pd.DataFrame(columns=file_keys + ["label"]) # Iterate over HDF5 files and load feature values for key, file_path in h5_file_mapping.items(): - with h5py.File(file_path, 'r') as h5_file: - gene_ids = h5_file['ids'][:] + with h5py.File(file_path, "r") as h5_file: + gene_ids = h5_file["ids"][:] gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} # Get gene indices for P1 and P2 - p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) - p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) + p1_indices = np.array( + [gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]] + ) + p2_indices = np.array( + [gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]] + ) f_dataset = h5_file[dataset] f_values = np.empty(len(df), dtype=float) valid_indices = (p1_indices != -1) & (p2_indices != -1) - linear_indices_func = lambda row_indices, col_indices, n: np.array(col_indices) - np.array(row_indices) + (2*n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - linear_indices = linear_indices_func(p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids)) + linear_indices_func = ( + lambda row_indices, col_indices, n: np.array(col_indices) + - np.array(row_indices) + + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 + ) + linear_indices = linear_indices_func( + p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + ) f_values[valid_indices] = f_dataset[:][linear_indices] f_values[~valid_indices] = np.nan @@ -232,35 +259,40 @@ def assemble_feature_df(h5_file_mapping, df, dataset='cc'): feature_df[key] = f_values # if the last column is 'label', assign it to feature_df - if df.columns[-1] == 'label': - feature_df['label'] = df[df.columns[-1]] + if df.columns[-1] == "label": + feature_df["label"] = df[df.columns[-1]] else: # delete the 'label' column from feature_df - del feature_df['label'] + del feature_df["label"] return feature_df -def extract_features(df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None): - if feature_type == 'mr': +def extract_features( + df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None +): + if feature_type == "mr": if not mr_dict: - raise ValueError('mr dict is empty') + raise ValueError("mr dict is empty") - feature_dict = cc_dict if feature_type == 'cc' else mr_dict + feature_dict = cc_dict if feature_type == "cc" else mr_dict feature_df = assemble_feature_df(feature_dict, df, feature_type) if ppi_feature is not None: ppi_dict = {key: set(value) for key, value in ppi_feature.items()} for ppi_source, ppi_tuples in ppi_dict.items(): feature_df[ppi_source] = df.apply( - lambda row: 1 if (row['P1'], row['P2']) in ppi_tuples else 0, axis=1) + lambda row: 1 if (row["P1"], row["P2"]) in ppi_tuples else 0, axis=1 + ) # TODO: add extra features if provided if extra_feature is not None: pass # move 'label' column to the end of the dataframe if it exists - if 'label' in feature_df.columns: - feature_df = feature_df[[col for col in feature_df.columns if col != 'label'] + ['label']] + if "label" in feature_df.columns: + feature_df = feature_df[ + [col for col in feature_df.columns if col != "label"] + ["label"] + ] return feature_df @@ -279,72 +311,94 @@ def get_ppi_feature(): A dictionary with PPI features, where the keys are the feature names and the values are lists of tuples representing the protein interactions. 
""" - feature_names = ['BioGRID', 'BioPlex', 'HI-union'] - urls = [network_info['url'][i] for i in range(len(network_info['name'])) - if network_info['name'][i] in feature_names] + feature_names = ["BioGRID", "BioPlex", "HI-union"] + urls = [ + network_info["url"][i] + for i in range(len(network_info["name"])) + if network_info["name"][i] in feature_names + ] ppi_features = {} # use pandas to read the file - for (i, url) in enumerate(urls): - data = pd.read_csv(url, sep='\t', header=None) + for i, url in enumerate(urls): + data = pd.read_csv(url, sep="\t", header=None) data = data.apply(lambda x: tuple(sorted(x)), axis=1) - ppi_name = f'{feature_names[i]}_PPI' + ppi_name = f"{feature_names[i]}_PPI" ppi_features[ppi_name] = data.tolist() return ppi_features def train_ml_model(data_df, ml_type, seed, n_jobs, feature_mapping, model_dir): - assert ml_type == 'xgboost', 'ML model must be xgboost' - models = train_model(data_df.iloc[:, :-1], data_df.iloc[:, -1], seed, n_jobs, feature_mapping, - model_dir) + assert ml_type == "xgboost", "ML model must be xgboost" + models = train_model( + data_df.iloc[:, :-1], + data_df.iloc[:, -1], + seed, + n_jobs, + feature_mapping, + model_dir, + ) return models def train_model(X, y, seed, n_jobs, feature_mapping, model_dir): model_params = { - 'n_estimators': [10, 20, 50, 100], - 'max_depth': [1, 2, 3, 4, 5], - 'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0] + "n_estimators": [10, 20, 50, 100], + "max_depth": [1, 2, 3, 4, 5], + "learning_rate": [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0], } models = {} for ft in feature_mapping: # use only mutual rank - log.info(f'Training model for {ft} ...') + log.info(f"Training model for {ft} ...") xgb_model = xgb.XGBClassifier(random_state=seed, n_jobs=n_jobs) cv = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True) - clf = GridSearchCV(xgb_model, model_params, scoring='roc_auc', cv=cv, - n_jobs=1, verbose=2) - if ft == 'ex': + clf = GridSearchCV( + xgb_model, model_params, scoring="roc_auc", cv=cv, n_jobs=1, verbose=2 + ) + if ft == "ex": # exclude ppi features - Xtrain = X.loc[:, ~X.columns.str.endswith('_PPI')] + Xtrain = X.loc[:, ~X.columns.str.endswith("_PPI")] else: Xtrain = X model = clf.fit(Xtrain, y) models[ft] = model - ml_model_file = model_dir / f'model_{ft}.pkl.gz' - with gzip.open(ml_model_file, 'wb') as fh: + ml_model_file = model_dir / f"model_{ft}.pkl.gz" + with gzip.open(ml_model_file, "wb") as fh: pickle.dump(model, fh) - log.info(f'Training model for {ft} ... done') + log.info(f"Training model for {ft} ... 
done") return models -def compute_llr(predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, - gs_test): +def compute_llr( + predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, gs_test +): # make sure max_num_edges is smaller than the number of non-NA values - assert max_num_edges < np.count_nonzero(~np.isnan(predicted_all_pairs.iloc[:, -1].values)), \ - f'max_num_edges should be smaller than the number of non-NA values' + assert max_num_edges < np.count_nonzero( + ~np.isnan(predicted_all_pairs.iloc[:, -1].values) + ), f"max_num_edges should be smaller than the number of non-NA values" - cur_col_name = 'prediction' + cur_col_name = "prediction" cur_results = predicted_all_pairs.nlargest(max_num_edges, cur_col_name) - selected_edges_all = cur_results[['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1) - - gs_test_pos_set = set(gs_test[gs_test['label'] == 1][['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1)) - gs_test_neg_set = set(gs_test[gs_test['label'] == 0][['P1', 'P2']].apply(lambda row: tuple(sorted({row['P1'], row['P2']})), axis=1)) + selected_edges_all = cur_results[["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + + gs_test_pos_set = set( + gs_test[gs_test["label"] == 1][["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + ) + gs_test_neg_set = set( + gs_test[gs_test["label"] == 0][["P1", "P2"]].apply( + lambda row: tuple(sorted({row["P1"], row["P2"]})), axis=1 + ) + ) n_gs_test_pos_set = len(gs_test_pos_set) n_gs_test_neg_set = len(gs_test_neg_set) @@ -352,73 +406,86 @@ def compute_llr(predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges # llr_res_dict only save maximum of max_steps data points for downstream # analysis / plotting total = math.ceil((max_num_edges - start_edge_num) / step_size) + 1 - for k in tqdm(range(start_edge_num, max_num_edges+step_size, step_size), total=total, ascii=' >='): + for k in tqdm( + range(start_edge_num, max_num_edges + step_size, step_size), + total=total, + ascii=" >=", + ): selected_edges = set(selected_edges_all[:k]) all_nodes = set(itertools.chain.from_iterable(selected_edges)) common_pos_edges = selected_edges & gs_test_pos_set common_neg_edges = selected_edges & gs_test_neg_set try: - lr = len(common_pos_edges) / len(common_neg_edges) / (n_gs_test_pos_set / n_gs_test_neg_set) + lr = ( + len(common_pos_edges) + / len(common_neg_edges) + / (n_gs_test_pos_set / n_gs_test_neg_set) + ) except ZeroDivisionError: lr = 0 llr = np.log(lr) if lr > 0 else np.nan n_node = len(all_nodes) - result_dict['k'].append(k) - result_dict['n'].append(n_node) - result_dict['llr'].append(llr) + result_dict["k"].append(k) + result_dict["n"].append(n_node) + result_dict["llr"].append(llr) llr_res = pd.DataFrame(result_dict) if llr_res_file is not None: - llr_res.to_csv(llr_res_file, sep='\t', index=False) + llr_res.to_csv(llr_res_file, sep="\t", index=False) return llr_res def prepare_gs_data(**kwargs): - task = kwargs['task'] - cc_dict = kwargs['cc_dict'] - mr_dict = kwargs['mr_dict'] - gs_file = kwargs['gs_file'] + task = kwargs["task"] + cc_dict = kwargs["cc_dict"] + mr_dict = kwargs["mr_dict"] + gs_file = kwargs["gs_file"] gs_file_md5 = None - feature_type = kwargs['feature_type'] - extra_feature_file = kwargs['extra_feature_file'] - valid_id_list = kwargs['valid_id_list'] - test_size = kwargs['test_size'] - seed = kwargs['seed'] + feature_type = kwargs["feature_type"] + 
extra_feature_file = kwargs["extra_feature_file"] + valid_id_list = kwargs["valid_id_list"] + test_size = kwargs["test_size"] + seed = kwargs["seed"] if gs_file is None: - gs_file = urls['reactome_gold_standard'] - gs_file_md5 = urls['reactome_gold_standard_md5'] + gs_file = urls["reactome_gold_standard"] + gs_file_md5 = urls["reactome_gold_standard_md5"] - log.info('Preparing gold standard data') + log.info("Preparing gold standard data") gs_df = get_valid_gs_data(gs_file, valid_id_list, md5=gs_file_md5) gs_df_balanced = balance_classes(gs_df, random_state=seed) del gs_df - gs_train, gs_test = train_test_split(gs_df_balanced, - test_size=test_size, random_state=seed, - stratify=gs_df_balanced.iloc[:, -1]) - if task == 'protein_func': + gs_train, gs_test = train_test_split( + gs_df_balanced, + test_size=test_size, + random_state=seed, + stratify=gs_df_balanced.iloc[:, -1], + ) + if task == "protein_func": ppi_feature = get_ppi_feature() else: ppi_feature = None - gs_train_df = extract_features(gs_train, feature_type, cc_dict, ppi_feature, - extra_feature_file, mr_dict) - gs_test_df = extract_features(gs_test, feature_type, cc_dict, ppi_feature, - extra_feature_file, mr_dict) + gs_train_df = extract_features( + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + ) + gs_test_df = extract_features( + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + ) # store both the ids with gs_test_df for later use # add the first two column of gs_test to gs_test_df at the beginning gs_test_df = pd.concat([gs_test.iloc[:, :2], gs_test_df], axis=1) - log.info('Preparing gs data ... done') + log.info("Preparing gs data ... done") return gs_train_df, gs_test_df -def extract_dataset_feature(all_pairs, feature_file, feature_type='cc'): +def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): # convert all_pairs to a dataframe - df = pd.DataFrame(all_pairs, columns=['P1', 'P2']) - with h5py.File(feature_file, 'r') as h5_file: - gene_ids = h5_file['ids'][:] + df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) + with h5py.File(feature_file, "r") as h5_file: + gene_ids = h5_file["ids"][:] gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} # Get gene indices for P1 and P2 @@ -429,50 +496,71 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type='cc'): f_values = np.empty(len(df), dtype=float) valid_indices = (p1_indices != -1) & (p2_indices != -1) - linear_indices_func = lambda row_indices, col_indices, n: np.array(col_indices) - np.array(row_indices) + (2*n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - linear_indices = linear_indices_func(p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids)) + linear_indices_func = ( + lambda row_indices, col_indices, n: np.array(col_indices) + - np.array(row_indices) + + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 + ) + linear_indices = linear_indices_func( + p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + ) f_values[valid_indices] = f_dataset[:][linear_indices] f_values[~valid_indices] = np.nan # extracted feature is the 'prediction' column - df['prediction'] = f_values + df["prediction"] = f_values return df -def dataset_llr(all_ids, feature_dict, feature_type, gs_test, start_edge_num, - max_num_edge, step_size, llr_dataset_file): +def dataset_llr( + all_ids, + feature_dict, + feature_type, + gs_test, + start_edge_num, + max_num_edge, + step_size, + llr_dataset_file, +): llr_ds = pd.DataFrame() 
all_ids_sorted = sorted(all_ids) all_pairs = list(itertools.combinations(all_ids_sorted, 2)) all_ds_pred = None for dataset in feature_dict: - log.info(f'Calculating llr for {dataset} ...') + log.info(f"Calculating llr for {dataset} ...") feature_file = feature_dict[dataset] - predicted_all_pairs = extract_dataset_feature(all_pairs, feature_file, feature_type) + predicted_all_pairs = extract_dataset_feature( + all_pairs, feature_file, feature_type + ) if all_ds_pred is None: - all_ds_pred = predicted_all_pairs['prediction'].values + all_ds_pred = predicted_all_pairs["prediction"].values else: - all_ds_pred = np.vstack((all_ds_pred, predicted_all_pairs['prediction'].values)) - - cur_llr_res = compute_llr(predicted_all_pairs, None, start_edge_num, - max_num_edge, step_size, gs_test) - cur_llr_res['dataset'] = dataset + all_ds_pred = np.vstack( + (all_ds_pred, predicted_all_pairs["prediction"].values) + ) + + cur_llr_res = compute_llr( + predicted_all_pairs, None, start_edge_num, max_num_edge, step_size, gs_test + ) + cur_llr_res["dataset"] = dataset llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) - llr_ds.to_csv(llr_dataset_file, sep='\t', index=False) - log.info(f'Calculating llr for {dataset} ... done') + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") # calculate llr for all datasets based on the average prediction - log.info(f'Calculating llr for all datasets average ...') - all_ds_pred_df = pd.DataFrame(all_pairs, columns=['P1', 'P2']) + log.info(f"Calculating llr for all datasets average ...") + all_ds_pred_df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) all_ds_pred_avg = np.nanmean(all_ds_pred, axis=0) - all_ds_pred_df['prediction'] = all_ds_pred_avg - cur_llr_res = compute_llr(all_ds_pred_df, None, start_edge_num, max_num_edge, step_size, gs_test) - cur_llr_res['dataset'] = 'all_average' + all_ds_pred_df["prediction"] = all_ds_pred_avg + cur_llr_res = compute_llr( + all_ds_pred_df, None, start_edge_num, max_num_edge, step_size, gs_test + ) + cur_llr_res["dataset"] = "all_average" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) - log.info(f'Calculating llr for all datasets average ... done') - llr_ds.to_csv(llr_dataset_file, sep='\t', index=False) + log.info(f"Calculating llr for all datasets average ... 
done") + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) return llr_ds @@ -481,39 +569,43 @@ def get_cutoff(model_dict, gs_test, lr_cutoff): cutoff_p_dict = {} cutoff_llr_dict = {} for ft in model_dict: - log.info(f'Calculating cutoff prob for {ft} ...') + log.info(f"Calculating cutoff prob for {ft} ...") model = model_dict[ft] - if ft == 'ex': - gs_test_df = gs_test.loc[:, ~gs_test.columns.str.endswith('_PPI')] + if ft == "ex": + gs_test_df = gs_test.loc[:, ~gs_test.columns.str.endswith("_PPI")] else: gs_test_df = gs_test prob = model.predict_proba(gs_test_df.iloc[:, 2:-1]) prob = prob[:, 1] - pred_df = pd.DataFrame(prob, columns=['prob']) + pred_df = pd.DataFrame(prob, columns=["prob"]) pred_df = pd.concat([pred_df, gs_test_df.iloc[:, -1]], axis=1) - pred_df = pred_df.sort_values(by='prob', ascending=False) + pred_df = pred_df.sort_values(by="prob", ascending=False) - P = pred_df['label'].sum() + P = pred_df["label"].sum() N = len(pred_df) - P - cumulative_pp = np.cumsum(pred_df['label']) + cumulative_pp = np.cumsum(pred_df["label"]) cumulative_pn = np.arange(len(pred_df)) + 1 - cumulative_pp llr_values = np.log((cumulative_pp / cumulative_pn) / (P / N)) - pred_df['llr'] = llr_values + pred_df["llr"] = llr_values # find the first prob that has llr >= lr_cutoff cutoff = np.log(lr_cutoff) cutoff_prob = None for _, row in pred_df[::-1].iterrows(): - if not np.isinf(row['llr']) and row['llr'] >= cutoff: - cutoff_prob = row['prob'] - cutoff_llr = row['llr'] + if not np.isinf(row["llr"]) and row["llr"] >= cutoff: + cutoff_prob = row["prob"] + cutoff_llr = row["llr"] break # if cutoff_prob is None, it means that the lr_cutoff is too high # and we cannot find a cutoff prob that has llr >= lr_cutoff if cutoff_prob is None: - log.error(f'Cannot find cutoff prob for {ft}, lower lr_cutoff and try again') - import sys; sys.exit(1) + log.error( + f"Cannot find cutoff prob for {ft}, lower lr_cutoff and try again" + ) + import sys + + sys.exit(1) cutoff_p_dict[ft] = cutoff_prob cutoff_llr_dict[ft] = cutoff_llr @@ -523,34 +615,43 @@ def get_cutoff(model_dict, gs_test, lr_cutoff): def predict_network(predict_results_file, cutoff_p, output_file): for ft in predict_results_file: - log.info(f'Predicting network for {ft} ...') + log.info(f"Predicting network for {ft} ...") predicted_df = pd.read_parquet(predict_results_file[ft]) - filtered_df = predicted_df[predicted_df['prediction'] > cutoff_p[ft]] + filtered_df = predicted_df[predicted_df["prediction"] > cutoff_p[ft]] cur_file = output_file[ft] - filtered_df[['P1', 'P2']].to_csv(cur_file, sep='\t', index=False, header=None) + filtered_df[["P1", "P2"]].to_csv(cur_file, sep="\t", index=False, header=None) directory, file_name = os.path.split(cur_file) base_name, extension = os.path.splitext(file_name) - new_file_name = f'{base_name}_with_p{extension}' + new_file_name = f"{base_name}_with_p{extension}" new_file = os.path.join(directory, new_file_name) - filtered_df.to_csv(new_file, sep='\t', index=False) + filtered_df.to_csv(new_file, sep="\t", index=False) num_edges = len(filtered_df) - num_nodes = len(set(filtered_df['P1']) | set(filtered_df['P2'])) - log.info(f'Number of edges: {num_edges}') - log.info(f'Number of nodes: {num_nodes}') - log.info(f'Predicting network for {ft} ... 
done') - - -def predict_all_pairs(model_dict, all_ids, feature_type, ppi_feature, cc_dict, - mr_dict, extra_feature_file, prediction_dir, - output_file, n_jobs=1): + num_nodes = len(set(filtered_df["P1"]) | set(filtered_df["P2"])) + log.info(f"Number of edges: {num_edges}") + log.info(f"Number of nodes: {num_nodes}") + log.info(f"Predicting network for {ft} ... done") + + +def predict_all_pairs( + model_dict, + all_ids, + feature_type, + ppi_feature, + cc_dict, + mr_dict, + extra_feature_file, + prediction_dir, + output_file, + n_jobs=1, +): chunk_size = 1000000 - log.info('Genearating all pairs ...') + log.info("Genearating all pairs ...") all_ids = sorted(all_ids) all_pairs = list(itertools.combinations(all_ids, 2)) - log.info('Genearating all pairs ... done') + log.info("Genearating all pairs ... done") log.info(f'Number of valid ids {format(len(all_ids), ",")}') # remove all "chunk_*.parquet" files in prediction_dir if they exist - pattern = os.path.join(prediction_dir, 'chunk_*.parquet') + pattern = os.path.join(prediction_dir, "chunk_*.parquet") matching_files = glob.glob(pattern) for file in matching_files: os.remove(file) @@ -558,34 +659,46 @@ def predict_all_pairs(model_dict, all_ids, feature_type, ppi_feature, cc_dict, for ft in model_dict: log.info(f'Predicting all pairs ({format(len(all_pairs), ",")}) for {ft} ...') model = model_dict[ft] + def process_and_save_chunk(start_idx, chunk_id): - chunk = all_pairs[start_idx:start_idx + chunk_size] - chunk_df = pd.DataFrame(chunk, columns=['P1', 'P2']) - if ft == 'ex': + chunk = all_pairs[start_idx : start_idx + chunk_size] + chunk_df = pd.DataFrame(chunk, columns=["P1", "P2"]) + if ft == "ex": cur_ppi_feature = None else: cur_ppi_feature = ppi_feature - feature_df = extract_features(chunk_df, feature_type, cc_dict, cur_ppi_feature, - extra_feature_file, mr_dict) + feature_df = extract_features( + chunk_df, + feature_type, + cc_dict, + cur_ppi_feature, + extra_feature_file, + mr_dict, + ) predictions = model.predict_proba(feature_df) - prediction_df = pd.DataFrame(predictions[:, 1], columns=['prediction']) - prediction_df['P1'] = chunk_df['P1'] - prediction_df['P2'] = chunk_df['P2'] - prediction_df = prediction_df[['P1', 'P2', 'prediction']] - prediction_df['prediction'] = prediction_df['prediction'].astype('float32') + prediction_df = pd.DataFrame(predictions[:, 1], columns=["prediction"]) + prediction_df["P1"] = chunk_df["P1"] + prediction_df["P2"] = chunk_df["P2"] + prediction_df = prediction_df[["P1", "P2", "prediction"]] + prediction_df["prediction"] = prediction_df["prediction"].astype("float32") table = pa.Table.from_pandas(prediction_df) chunk_id = str(chunk_id).zfill(6) - output_file = f'{prediction_dir}/chunk_{chunk_id}.parquet' + output_file = f"{prediction_dir}/chunk_{chunk_id}.parquet" pq.write_table(table, output_file) with ThreadPoolExecutor(max_workers=n_jobs) as executor: - for chunk_id, chunk_start in enumerate(range(0, len(all_pairs), chunk_size)): + for chunk_id, chunk_start in enumerate( + range(0, len(all_pairs), chunk_size) + ): executor.submit(process_and_save_chunk, chunk_start, chunk_id) - pattern = os.path.join(prediction_dir, 'chunk_*.parquet') + pattern = os.path.join(prediction_dir, "chunk_*.parquet") matching_files = glob.glob(pattern) matching_files.sort() - pq.write_table(pa.concat_tables([pq.read_table(file) for file in matching_files]), output_file[ft]) + pq.write_table( + pa.concat_tables([pq.read_table(file) for file in matching_files]), + output_file[ft], + ) for file in matching_files: 
os.remove(file) From e53c78ff372ae8760a062cf5a12e5692b61b4207 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 12:37:11 -0500 Subject: [PATCH 02/54] add documentation for extra-feature file --- funmap/funmap.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 01aa93cb..00565c9e 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -26,7 +26,9 @@ log = setup_logger(__name__) -def get_valid_gs_data(gs_path: str, valid_gene_list: List[str], md5=None): +def get_valid_gs_data( + gs_path: str, valid_gene_list: List[str], md5=None +): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') if is_url_scheme(gs_path): gs_edge_df = read_csv_with_md5_check( @@ -271,6 +273,26 @@ def assemble_feature_df(h5_file_mapping, df, dataset="cc"): def extract_features( df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None ): + """ + extract_features - creates the final feature `pandas` dataframe used by xgboost + + Params: + df: pd.DataFrame + feature_type: str - Type of feature used. Currently supporting pearson correlation or mutual rank. Accepted values: 'cc' or 'mr'. + cc_dict: dict[str, str] - dictionary containing paths to the pre-calculated features. + ... + extra_feature: str - path for file containing extra features + + Extra Features: + + This is a TSV file with the following format: + + Gene A Gene B Feature X Feature Y + -------- -------- ----------- ----------- + ABC1 DEF2 0.12 34.5 + GHI3 JKLM4 -0.6 NA + … … … … + """ if feature_type == "mr": if not mr_dict: raise ValueError("mr dict is empty") From fa90597a13c32655963f883085498788f7268a59 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 14:58:41 -0500 Subject: [PATCH 03/54] Add support for adding extra_feature_file --- funmap/funmap.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 00565c9e..816d11d6 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -19,7 +19,12 @@ from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold import xgboost as xgb -from funmap.utils import get_data_dict, is_url_scheme, read_csv_with_md5_check +from funmap.utils import ( + get_data_dict, + is_url_scheme, + read_csv_with_md5_check, + check_extra_feature_file, +) from funmap.data_urls import network_info, misc_urls as urls from funmap.logger import setup_logger @@ -308,8 +313,30 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - pass - + if check_extra_feature_file(extra_feature): + log.info(f"Loading extra features from {extra_feature}") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + log.info( + f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" + ) + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[0] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + old_feature_count = feature_df.shape[0] + feature_df = pd.merge( + feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + ) + log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") + else: + log.warning( + "File 
format verification failed. Skipping adding extra feature." + ) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From bad4f6da97180d190b6a2c63c5d3a3a53a364243 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:19:12 -0500 Subject: [PATCH 04/54] remove extra_file check --- funmap/funmap.py | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 816d11d6..518d0f69 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -313,30 +313,25 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - if check_extra_feature_file(extra_feature): - log.info(f"Loading extra features from {extra_feature}") - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - log.info( - f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" - ) - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[0] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - old_feature_count = feature_df.shape[0] - feature_df = pd.merge( - feature_df, extra_feature_df, on=["P1", "P2"], how="outer" - ) - log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") - else: - log.warning( - "File format verification failed. Skipping adding extra feature." - ) + log.info(f"Loading extra features from {extra_feature}") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + log.info( + f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" + ) + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[0] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + old_feature_count = feature_df.shape[0] + feature_df = pd.merge( + feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + ) + log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From a9927df2e126b4f90cd3b7e83be288dbea337229 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:30:48 -0500 Subject: [PATCH 05/54] skip feature file check --- funmap/cli.py | 368 +++++++++++++++++++++++++++++--------------------- 1 file changed, 216 insertions(+), 152 deletions(-) diff --git a/funmap/cli.py b/funmap/cli.py index 8094bf1f..8cd7cf34 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -7,7 +7,7 @@ from pathlib import Path from funmap.funmap import compute_llr, predict_all_pairs, dataset_llr, predict_all_pairs from funmap.plotting import explore_data, plot_results, merge_and_delete -from funmap.funmap import train_ml_model, prepare_gs_data, get_cutoff, get_ppi_feature +from funmap.funmap import train_ml_model, prepare_gs_data, get_cutoff, get_ppi_feature from funmap.funmap import compute_features, predict_network from funmap.data_urls import misc_urls as urls from funmap.logger import setup_logging, setup_logger @@ -17,250 +17,313 @@ log = setup_logger(__name__) -@click.group(help='funmap command line interface') -@click.version_option(version=f'{__version__}') + 
+@click.group(help="funmap command line interface") +@click.version_option(version=f"{__version__}") def cli(): """ Command line interface for funmap. """ click.echo("====== funmap =======") -@cli.command(help='check the data quality') -@click.option('--config-file', '-c', required=True, type=click.Path(exists=True), - help='path to experiment configuration yaml file') -@click.option('--force-rerun', '-f', is_flag=True, default=False, - help='if set, will remove results from previous run first') + +@cli.command(help="check the data quality") +@click.option( + "--config-file", + "-c", + required=True, + type=click.Path(exists=True), + help="path to experiment configuration yaml file", +) +@click.option( + "--force-rerun", + "-f", + is_flag=True, + default=False, + help="if set, will remove results from previous run first", +) def qc(config_file, force_rerun): if force_rerun: while True: - confirmation = input("Do you want to remove results from previous run? (y/n): ") - if confirmation.lower() == 'y': - click.echo('Removing results from previous run') + confirmation = input( + "Do you want to remove results from previous run? (y/n): " + ) + if confirmation.lower() == "y": + click.echo("Removing results from previous run") cleanup_experiment(config_file) break - elif confirmation.lower() == 'n': - click.echo('Not removing results from previous run') + elif confirmation.lower() == "n": + click.echo("Not removing results from previous run") break else: click.echo("Invalid input. Please enter 'y' or 'n'.") setup_logging(config_file) - log.info(f'Running QC...') + log.info(f"Running QC...") cfg = setup_experiment(config_file) all_fig_names = [] - figure_dir = Path(cfg['results_dir']) / cfg['subdirs']['figure_dir'] - min_sample_count = cfg['min_sample_count'] + figure_dir = Path(cfg["results_dir"]) / cfg["subdirs"]["figure_dir"] + min_sample_count = cfg["min_sample_count"] fig_names = explore_data(cfg, min_sample_count, figure_dir) all_fig_names.extend(fig_names) - merge_and_delete(figure_dir, all_fig_names, 'qc.pdf') - log.info('figure qc.pdf saved to {}'.format(figure_dir)) - log.info('QC complete') + merge_and_delete(figure_dir, all_fig_names, "qc.pdf") + log.info("figure qc.pdf saved to {}".format(figure_dir)) + log.info("QC complete") -@cli.command(help='run funmap') -@click.option('--config-file', '-c', required=True, type=click.Path(exists=True), - help='path to experiment configuration yaml file') -@click.option('--force-rerun', '-f', is_flag=True, default=False, - help='if set, will remove results from previous run first') +@cli.command(help="run funmap") +@click.option( + "--config-file", + "-c", + required=True, + type=click.Path(exists=True), + help="path to experiment configuration yaml file", +) +@click.option( + "--force-rerun", + "-f", + is_flag=True, + default=False, + help="if set, will remove results from previous run first", +) def run(config_file, force_rerun): - click.echo(f'Running funmap...') + click.echo(f"Running funmap...") if force_rerun: while True: - confirmation = input("Do you want to remove results from previous run? (y/n): ") - if confirmation.lower() == 'y': - click.echo('Removing results from previous run') + confirmation = input( + "Do you want to remove results from previous run? 
(y/n): " + ) + if confirmation.lower() == "y": + click.echo("Removing results from previous run") cleanup_experiment(config_file) break - elif confirmation.lower() == 'n': - click.echo('Not removing results from previous run') + elif confirmation.lower() == "n": + click.echo("Not removing results from previous run") break else: click.echo("Invalid input. Please enter 'y' or 'n'.") setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg['extra_feature_file'] - if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): - return - gs_file = cfg['gs_file'] + extra_feature_file = cfg["extra_feature_file"] + # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): + # return + gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return - task = cfg['task'] - seed = cfg['seed'] + task = cfg["task"] + seed = cfg["seed"] np.random.seed(seed) - ml_type = cfg['ml_type'] - feature_type = cfg['feature_type'] + ml_type = cfg["ml_type"] + feature_type = cfg["feature_type"] # min_feature_count = cfg['min_feature_count'] - min_sample_count = cfg['min_sample_count'] + min_sample_count = cfg["min_sample_count"] # filter_before_prediction = cfg['filter_before_prediction'] - test_size = cfg['test_size'] + test_size = cfg["test_size"] # filter_after_prediction = cfg['filter_after_prediction'] # filter_criterion = cfg['filter_criterion'] # filter_threshold = cfg['filter_threshold'] # filter_blacklist = cfg['filter_blacklist'] - n_jobs = cfg['n_jobs'] - lr_cutoff = cfg['lr_cutoff'] - max_num_edges = cfg['max_num_edges'] - step_size = cfg['step_size'] - start_edge_num = cfg['start_edge_num'] + n_jobs = cfg["n_jobs"] + lr_cutoff = cfg["lr_cutoff"] + max_num_edges = cfg["max_num_edges"] + step_size = cfg["step_size"] + start_edge_num = cfg["start_edge_num"] - results_dir = Path(cfg['results_dir']) - saved_data_dir = results_dir / cfg['subdirs']['saved_data_dir'] - model_dir = results_dir / cfg['subdirs']['saved_model_dir'] - prediction_dir = results_dir / cfg['subdirs']['saved_predictions_dir'] - network_dir = results_dir / cfg['subdirs']['network_dir'] - figure_dir = results_dir / cfg['subdirs']['figure_dir'] + results_dir = Path(cfg["results_dir"]) + saved_data_dir = results_dir / cfg["subdirs"]["saved_data_dir"] + model_dir = results_dir / cfg["subdirs"]["saved_model_dir"] + prediction_dir = results_dir / cfg["subdirs"]["saved_predictions_dir"] + network_dir = results_dir / cfg["subdirs"]["network_dir"] + figure_dir = results_dir / cfg["subdirs"]["figure_dir"] - if cfg['task'] == 'protein_func': - feature_mapping = ['ex', 'ei'] + if cfg["task"] == "protein_func": + feature_mapping = ["ex", "ei"] else: - feature_mapping = ['ex'] + feature_mapping = ["ex"] # here the file stored a dictionary of ml models - ml_model_file = {feature: model_dir / f'model_{feature}.pkl.gz' - for feature in feature_mapping } - predicted_all_pairs_file = {feature: prediction_dir / f'predicted_all_pairs_{feature}.parquet' - for feature in feature_mapping } - llr_res_file = {feature: results_dir / f'llr_results_{feature}.tsv' - for feature in feature_mapping } - edge_list_file = {feature: network_dir/ f'funmap_{feature}.tsv' - for feature in feature_mapping } + ml_model_file = { + feature: model_dir / f"model_{feature}.pkl.gz" for feature in feature_mapping + } + predicted_all_pairs_file = { + feature: prediction_dir / f"predicted_all_pairs_{feature}.parquet" + for feature in feature_mapping + } 
+ llr_res_file = { + feature: results_dir / f"llr_results_{feature}.tsv" + for feature in feature_mapping + } + edge_list_file = { + feature: network_dir / f"funmap_{feature}.tsv" for feature in feature_mapping + } # gold standard data include specified feature (cc or mr) and ppi feature (if applicable) # and extra feature if applicable - gs_df_file = saved_data_dir / 'gold_standard_data.h5' + gs_df_file = saved_data_dir / "gold_standard_data.h5" # blacklist_file = urls['funmap_blacklist'] # llr obtained with each invividual dataset - llr_dataset_file = results_dir / 'llr_dataset.tsv' + llr_dataset_file = results_dir / "llr_dataset.tsv" gs_train = gs_test = None cutoff_p = cutoff_llr = None ml_model_dict = {} # compute and save cc, mr results - cc_dict, mr_dict, all_valid_ids = compute_features(cfg, feature_type, min_sample_count, - saved_data_dir) + cc_dict, mr_dict, all_valid_ids = compute_features( + cfg, feature_type, min_sample_count, saved_data_dir + ) gs_args = { - 'task': task, - 'saved_data_dir': saved_data_dir, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'feature_type': feature_type, - 'gs_file': gs_file, - 'extra_feature_file': extra_feature_file, - 'valid_id_list': all_valid_ids, - 'test_size': test_size, - 'seed': seed + "task": task, + "saved_data_dir": saved_data_dir, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "feature_type": feature_type, + "gs_file": gs_file, + "extra_feature_file": extra_feature_file, + "valid_id_list": all_valid_ids, + "test_size": test_size, + "seed": seed, } - all_edge_list_exist = all(os.path.exists(file_path) for file_path in edge_list_file.values()) + all_edge_list_exist = all( + os.path.exists(file_path) for file_path in edge_list_file.values() + ) if all_edge_list_exist: - log.info('Fumap network(s) already exists. Skipping model training and prediction.') + log.info( + "Fumap network(s) already exists. Skipping model training and prediction." + ) else: - all_model_exist = all(os.path.exists(file_path) for file_path in ml_model_file.values()) + all_model_exist = all( + os.path.exists(file_path) for file_path in ml_model_file.values() + ) if all_model_exist: - log.info(f'Trained model(s) exists. Loading model(s) ...') + log.info(f"Trained model(s) exists. Loading model(s) ...") ml_model_dict = {} # feature: ex or ei for feature in ml_model_file: - with gzip.open(ml_model_file[feature], 'rb') as fh: + with gzip.open(ml_model_file[feature], "rb") as fh: ml_model = pickle.load(fh) ml_model_dict[feature] = ml_model - log.info(f'Loading model(s) ... done') + log.info(f"Loading model(s) ... done") if not gs_df_file.exists(): - log.error(f'Trained models found but gold standard data file {gs_df_file} ' - f'does not exist.') + log.error( + f"Trained models found but gold standard data file {gs_df_file} " + f"does not exist." 
+ ) return - with pd.HDFStore(gs_df_file, mode='r') as store: - gs_train = store['train'] - gs_test = store['test'] + with pd.HDFStore(gs_df_file, mode="r") as store: + gs_train = store["train"] + gs_test = store["test"] else: gs_train, gs_test = prepare_gs_data(**gs_args) - with pd.HDFStore(gs_df_file, mode='w') as store: - store.put('train', gs_train) - store.put('test', gs_test) - ml_model_dict = train_ml_model(gs_train, ml_type, seed, n_jobs, feature_mapping, - model_dir) + with pd.HDFStore(gs_df_file, mode="w") as store: + store.put("train", gs_train) + store.put("test", gs_test) + ml_model_dict = train_ml_model( + gs_train, ml_type, seed, n_jobs, feature_mapping, model_dir + ) - all_predicted_all_pairs_exist = all(os.path.exists(file_path) for file_path in - predicted_all_pairs_file.values()) + all_predicted_all_pairs_exist = all( + os.path.exists(file_path) for file_path in predicted_all_pairs_file.values() + ) if all_predicted_all_pairs_exist: - log.info('Predicted all pairs already exists. Skipping prediction.') + log.info("Predicted all pairs already exists. Skipping prediction.") else: - log.info('Predicting all pairs ...') - if task == 'protein_func': + log.info("Predicting all pairs ...") + if task == "protein_func": ppi_feature = get_ppi_feature() else: ppi_feature = None pred_all_pairs_args = { - 'model_dict': ml_model_dict, - 'all_ids': all_valid_ids, - 'feature_type': feature_type, - 'ppi_feature': ppi_feature, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'extra_feature_file': extra_feature_file, - 'prediction_dir': prediction_dir, - 'output_file': predicted_all_pairs_file, - 'n_jobs': n_jobs + "model_dict": ml_model_dict, + "all_ids": all_valid_ids, + "feature_type": feature_type, + "ppi_feature": ppi_feature, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "extra_feature_file": extra_feature_file, + "prediction_dir": prediction_dir, + "output_file": predicted_all_pairs_file, + "n_jobs": n_jobs, } predict_all_pairs(**pred_all_pairs_args) - log.info('Predicting all pairs ... done') + log.info("Predicting all pairs ... 
done") cutoff_p, cutoff_llr = get_cutoff(ml_model_dict, gs_test, lr_cutoff) - log.info(f'cutoff probability: {cutoff_p}') - log.info(f'cutoff llr: {cutoff_llr}') + log.info(f"cutoff probability: {cutoff_p}") + log.info(f"cutoff llr: {cutoff_llr}") predict_network(predicted_all_pairs_file, cutoff_p, edge_list_file) if not gs_df_file.exists(): gs_train, gs_test = prepare_gs_data(**gs_args) - with pd.HDFStore(gs_df_file, mode='w') as store: - store.put('train', gs_train) - store.put('test', gs_test) + with pd.HDFStore(gs_df_file, mode="w") as store: + store.put("train", gs_train) + store.put("test", gs_test) else: if gs_test is None: - with pd.HDFStore(gs_df_file, mode='r') as store: - gs_train = store['train'] - gs_test = store['test'] + with pd.HDFStore(gs_df_file, mode="r") as store: + gs_train = store["train"] + gs_test = store["test"] - all_llr_res_exist = all(os.path.exists(file_path) for file_path in llr_res_file.values()) - all_edge_list_exist = all(os.path.exists(file_path) for file_path in edge_list_file.values()) + all_llr_res_exist = all( + os.path.exists(file_path) for file_path in llr_res_file.values() + ) + all_edge_list_exist = all( + os.path.exists(file_path) for file_path in edge_list_file.values() + ) if all_llr_res_exist and all_edge_list_exist: - log.info('LLR results already exist.') + log.info("LLR results already exist.") else: for ft in feature_mapping: - log.info(f'Computing LLR for {ft} ...') + log.info(f"Computing LLR for {ft} ...") if not predicted_all_pairs_file[ft].exists(): - log.error(f'Predicted all pairs file {predicted_all_pairs_file[ft]} does not exist.') + log.error( + f"Predicted all pairs file {predicted_all_pairs_file[ft]} does not exist." + ) return predicted_all_pairs = pd.read_parquet(predicted_all_pairs_file[ft]) # also save the llr results to file - compute_llr(predicted_all_pairs, llr_res_file[ft], start_edge_num, max_num_edges, step_size, - gs_test) - log.info(f'Computing LLR for {ft} ... done') + compute_llr( + predicted_all_pairs, + llr_res_file[ft], + start_edge_num, + max_num_edges, + step_size, + gs_test, + ) + log.info(f"Computing LLR for {ft} ... done") validation_res = {} for ft in feature_mapping: validation_res[ft] = { - 'llr_res_path': llr_res_file[ft], - 'edge_list_path': edge_list_file[ft] + "llr_res_path": llr_res_file[ft], + "edge_list_path": edge_list_file[ft], } if not llr_dataset_file.exists(): - log.info('Computing LLR for each dataset ...') + log.info("Computing LLR for each dataset ...") # feature_dict = cc_dict if feature_type == 'cc' else mr_dict # use CC features for individual dataset - llr_ds = dataset_llr(all_valid_ids, cc_dict, 'cc', gs_test, start_edge_num, - max_num_edges, step_size, llr_dataset_file) - log.info('Done.') + llr_ds = dataset_llr( + all_valid_ids, + cc_dict, + "cc", + gs_test, + start_edge_num, + max_num_edges, + step_size, + llr_dataset_file, + ) + log.info("Done.") else: - llr_ds = pd.read_csv(llr_dataset_file, sep='\t') + llr_ds = pd.read_csv(llr_dataset_file, sep="\t") if not ml_model_dict: - log.info(f'Trained model(s) exists. Loading model(s) ...') + log.info(f"Trained model(s) exists. Loading model(s) ...") for feature in ml_model_file: - with gzip.open(ml_model_file[feature], 'rb') as fh: + with gzip.open(ml_model_file[feature], "rb") as fh: ml_model = pickle.load(fh) ml_model_dict[feature] = ml_model - log.info(f'Loading model(s) ... done') + log.info(f"Loading model(s) ... 
done") all_fig_names = [] if cutoff_llr is None: @@ -268,30 +331,31 @@ def run(config_file, force_rerun): gs_dict = {} gs_dict[feature_type.upper()] = gs_train - if task == 'protein_func' and feature_type.upper() == 'MR' and 'rp_pairs' in cfg: + if task == "protein_func" and feature_type.upper() == "MR" and "rp_pairs" in cfg: # extract gs data for CC and MR for plotting gs_args = { - 'task': task, - 'saved_data_dir': saved_data_dir, - 'cc_dict': cc_dict, - 'mr_dict': mr_dict, - 'feature_type': 'cc', - 'gs_file': gs_file, - # no extra feature for plotting - 'extra_feature_file': None, - 'valid_id_list': all_valid_ids, - 'test_size': test_size, - 'seed': seed - } + "task": task, + "saved_data_dir": saved_data_dir, + "cc_dict": cc_dict, + "mr_dict": mr_dict, + "feature_type": "cc", + "gs_file": gs_file, + # no extra feature for plotting + "extra_feature_file": None, + "valid_id_list": all_valid_ids, + "test_size": test_size, + "seed": seed, + } gs_train, gs_test = prepare_gs_data(**gs_args) - gs_dict['CC'] = gs_train + gs_dict["CC"] = gs_train - fig_names = plot_results(cfg, validation_res, llr_ds, gs_dict, cutoff_llr, - figure_dir) + fig_names = plot_results( + cfg, validation_res, llr_ds, gs_dict, cutoff_llr, figure_dir + ) all_fig_names.extend(fig_names) - merge_and_delete(figure_dir, all_fig_names, 'results.pdf') + merge_and_delete(figure_dir, all_fig_names, "results.pdf") -if __name__ == '__main__': +if __name__ == "__main__": cli() From 67ca6c53e1e25967cd0cc5d1934ba832bb19ef7f Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 18 Sep 2024 15:49:47 -0500 Subject: [PATCH 06/54] upgrade version number to 0.1.21 fix column names for extra feature df --- funmap/funmap.py | 2 +- setup.cfg | 2 +- setup.py | 64 ++++++++++++++++++++++++------------------------ 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 518d0f69..02467aae 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -319,7 +319,7 @@ def extract_features( f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" ) extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[0] = "P2" + extra_feature_df.columns.values[1] = "P2" extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( lambda row: sorted([row["P1"], row["P2"]]) if row["P1"] > row["P2"] diff --git a/setup.cfg b/setup.cfg index 63a95df9..c12570fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.20 +current_version = 0.1.21 commit = True tag = True diff --git a/setup.py b/setup.py index 48b00773..af504f72 100644 --- a/setup.py +++ b/setup.py @@ -5,48 +5,48 @@ from setuptools import setup, find_packages requirements = [ - 'pyyaml==6.0.1', - 'xgboost==2.0.0', - 'numpy==1.24.4', - 'scipy==1.10.1', - 'pyarrow==13.0.0', - 'pandas==2.0.3', - 'joblib==1.3.2', - 'matplotlib==3.7.3', - 'seaborn==0.13.0', - 'scikit-learn==1.3.2', - 'imbalanced-learn==0.11.0', - 'tqdm==4.66.1', - 'PyPDF2==3.0.1', - 'matplotlib_venn==0.11.9', - 'networkx==3.1', - 'powerlaw==1.5', - 'click==8.1.7', - 'h5py==3.10.0', - 'tables==3.8.0' - ] + "pyyaml==6.0.1", + "xgboost==2.0.0", + "numpy==1.24.4", + "scipy==1.10.1", + "pyarrow==13.0.0", + "pandas==2.0.3", + "joblib==1.3.2", + "matplotlib==3.7.3", + "seaborn==0.13.0", + "scikit-learn==1.3.2", + "imbalanced-learn==0.11.0", + "tqdm==4.66.1", + "PyPDF2==3.0.1", + "matplotlib_venn==0.11.9", + "networkx==3.1", + "powerlaw==1.5", + "click==8.1.7", + "h5py==3.10.0", + "tables==3.8.0", +] 
-test_requirements = [ ] +test_requirements = [] setup( author="Zhiao Shi", - author_email='zhiao.shi@gmail.com', - python_requires='>=3.7', + author_email="zhiao.shi@gmail.com", + python_requires=">=3.7", description="generate gene co-function networks using omics data", entry_points={ - 'console_scripts': [ - 'funmap=funmap.cli:cli', + "console_scripts": [ + "funmap=funmap.cli:cli", ], }, install_requires=requirements, - license='MIT license', + license="MIT license", include_package_data=True, - keywords=['funmap', 'bioinformatics', 'biological-network'], - name='funmap', - packages=find_packages(include=['funmap', 'funmap.*']), - test_suite='tests', + keywords=["funmap", "bioinformatics", "biological-network"], + name="funmap", + packages=find_packages(include=["funmap", "funmap.*"]), + test_suite="tests", tests_require=test_requirements, - url='https://github.com/bzhanglab/funmap', - version='0.1.20', + url="https://github.com/bzhanglab/funmap", + version="0.1.21", zip_safe=False, ) From 05b316dafd5744b46981eb21ae9933b5c97cd8ed Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 09:42:57 -0500 Subject: [PATCH 07/54] diagnostic printing --- funmap/funmap.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 02467aae..4d7bd36a 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,10 +327,13 @@ def extract_features( axis=1, result_type="expand", ) + log.info(str(feature_df.head())) + log.info("Extra feature:\n" + str(extra_feature_df.head())) old_feature_count = feature_df.shape[0] feature_df = pd.merge( feature_df, extra_feature_df, on=["P1", "P2"], how="outer" ) + log.info("Merged Data Frame:\n" + str(feature_df.head())) log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: From 4cad093a963b8a3df7d3052b788a1e49858359e9 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:26:35 -0500 Subject: [PATCH 08/54] Add extra_feature_file genes to all_valid_ids More diagnostic logging --- funmap/funmap.py | 25 ++++++++++++++++++++++++- funmap/utils.py | 15 +++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 4d7bd36a..d07f3369 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -31,7 +31,7 @@ log = setup_logger(__name__) -def get_valid_gs_data( +def get_valid_gs_dat( gs_path: str, valid_gene_list: List[str], md5=None ): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') @@ -330,9 +330,32 @@ def extract_features( log.info(str(feature_df.head())) log.info("Extra feature:\n" + str(extra_feature_df.head())) old_feature_count = feature_df.shape[0] + + # Get gene indices for P1 and P2 + # df.reset_index(drop=True, inplace=True) + log.info("df head") + log.info(str(df.head())) + # gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} + # p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) + # p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) + # + # f_dataset = h5_file[dataset] + # f_values = np.empty(len(df), dtype=float) + # valid_indices = (p1_indices != -1) & (p2_indices != -1) + # + # linear_indices_func = ( + # lambda row_indices, col_indices, n: np.array(col_indices) + # - np.array(row_indices) + # + (2 * n - np.array(row_indices) + 1) 
* np.array(row_indices) // 2 + # ) + # linear_indices = linear_indices_func( + # p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) + # ) feature_df = pd.merge( feature_df, extra_feature_df, on=["P1", "P2"], how="outer" ) + f_values[valid_indices] = f_dataset[:][linear_indices] + f_values[~valid_indices] = np.nan log.info("Merged Data Frame:\n" + str(feature_df.head())) log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") # move 'label' column to the end of the dataframe if it exists diff --git a/funmap/utils.py b/funmap/utils.py index 1f769274..b3e4c595 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -67,7 +67,7 @@ def normalize_filename(filename): return cleaned_filename -def get_data_dict(config, min_sample_count=15): + def get_data_dict(config, min_sample_count=15): """ Returns a dictionary of data from the provided data configuration, filtered to only include genes that are coding and have at least `min_sample_count` samples. @@ -121,7 +121,18 @@ def get_data_dict(config, min_sample_count=15): # valid_count = np.sum(is_valid) valid_p = cur_data.columns[is_valid].values all_valid_ids = all_valid_ids.union(set(valid_p)) - + if config["extra_feature_file"] is not None: + log.info("Importing extra feature file") + # TODO: Import extra feature file + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + if config['filter_noncoding_genes']: + col1 = extra_feature_df.columns.values[0] + col2 = extra_feature_df.columns.values[1] + coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] + coding_genes = list(set(coding['gene_name'].to_list())) + extra_feature_df = extra_feature_df[extra_feature_df[col1].isin(coding_genes) & extra_feature_df[col2].isin(coding_genes)] + genes = set(extra_feature_df.iloc[:, :2].values.flatten().tolist()) + all_valid_ids = all_valid_ids.union(genes) all_valid_ids = list(all_valid_ids) all_valid_ids.sort() log.info(f'total number of valid ids: {len(all_valid_ids)}') From ed5bf9f5cae41438b825c27dc7f6ab1a8863aab6 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:30:34 -0500 Subject: [PATCH 09/54] fix indent issue --- funmap/utils.py | 379 ++++++++++++++++++++++++++---------------------- 1 file changed, 208 insertions(+), 171 deletions(-) diff --git a/funmap/utils.py b/funmap/utils.py index b3e4c595..9bebb567 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -17,12 +17,15 @@ def is_url_scheme(path): parsed = urlparse(path) - if parsed.scheme == 'file': + if parsed.scheme == "file": return False - return parsed.scheme != '' + return parsed.scheme != "" -def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file.csv', **kwargs): + +def read_csv_with_md5_check( + url, expected_md5=None, local_path="downloaded_file.csv", **kwargs +): try: response = urllib.request.urlopen(url) content = response.read() @@ -30,11 +33,13 @@ def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file. if expected_md5: md5_hash = hashlib.md5(content).hexdigest() if md5_hash != expected_md5: - log.error('gold standard file: MD5 hash mismatch, file may be corrupted.') + log.error( + "gold standard file: MD5 hash mismatch, file may be corrupted." + ) raise ValueError("MD5 hash mismatch, file may be corrupted.") # Save the content to a local file - with open(local_path, 'wb') as f: + with open(local_path, "wb") as f: f.write(content) df = pd.read_csv(local_path, **kwargs) @@ -44,13 +49,13 @@ def read_csv_with_md5_check(url, expected_md5=None, local_path='downloaded_file. 
return None -def check_gs_files_exist(file_dict, key='CC'): - if key.upper() == 'CC': - paths = file_dict.get('CC') +def check_gs_files_exist(file_dict, key="CC"): + if key.upper() == "CC": + paths = file_dict.get("CC") if paths is not None and os.path.exists(paths): return True - elif key.upper() == 'MR': - paths = file_dict.get('MR') + elif key.upper() == "MR": + paths = file_dict.get("MR") if paths is not None and all(os.path.exists(p) for p in paths): return True else: @@ -61,13 +66,13 @@ def check_gs_files_exist(file_dict, key='CC'): def normalize_filename(filename): # Remove any characters that are not allowed in filenames - cleaned_filename = re.sub(r'[^\w\s.-]', '', filename) + cleaned_filename = re.sub(r"[^\w\s.-]", "", filename) # Replace spaces with underscores - cleaned_filename = cleaned_filename.replace(' ', '_') + cleaned_filename = cleaned_filename.replace(" ", "_") return cleaned_filename - def get_data_dict(config, min_sample_count=15): +def get_data_dict(config, min_sample_count=15): """ Returns a dictionary of data from the provided data configuration, filtered to only include genes that are coding and have at least `min_sample_count` samples. @@ -79,41 +84,42 @@ def get_data_dict(config, min_sample_count=15): the data from the corresponding file. """ - data_file = config['data_path'] + data_file = config["data_path"] data_dict = {} - if 'filter_noncoding_genes' in config and config['filter_noncoding_genes']: - mapping = pd.read_csv(urls['mapping_file'], sep='\t') + if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: + mapping = pd.read_csv(urls["mapping_file"], sep="\t") # extract the data file from the tar.gz file - tmp_dir = 'tmp_data' + tmp_dir = "tmp_data" if not os.path.exists(tmp_dir): os.mkdir(tmp_dir) - os.system(f'tar -xzf {data_file} --strip-components=1 -C {tmp_dir}') + os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") # gene ids are gene symbols - for dt in config['data_files']: + for dt in config["data_files"]: log.info(f"processing ... {dt['name']}") - cur_feature = dt['name'] + cur_feature = dt["name"] # cur_file = get_obj_from_tgz(data_file, dt['path']) # extract the data file from the tar.gz file # cur_data= get_obj_from_tgz(data_file, dt['path']) - cur_data = pd.read_csv(os.path.join(tmp_dir, dt['path']), sep='\t', index_col=0, - header=0) + cur_data = pd.read_csv( + os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 + ) if cur_data.shape[1] < min_sample_count: log.info(f"... {dt['name']} ... 
not enough samples, skipped") continue cur_data = cur_data.T # exclude cohort with sample number < min_sample_count # remove noncoding genes first - if config['filter_noncoding_genes']: - coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] - coding_genes = list(set(coding['gene_name'].to_list())) + if config["filter_noncoding_genes"]: + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] # duplicated columns, for now select the last column - cur_data = cur_data.loc[:,~cur_data.columns.duplicated(keep='last')] + cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] data_dict[cur_feature] = cur_data shutil.rmtree(tmp_dir) - log.info('filtering out non valid ids ...') + log.info("filtering out non valid ids ...") all_valid_ids = set() for i in data_dict: cur_data = data_dict[i] @@ -125,17 +131,20 @@ def get_data_dict(config, min_sample_count=15): log.info("Importing extra feature file") # TODO: Import extra feature file extra_feature_df = pd.read_csv(extra_feature, sep="\t") - if config['filter_noncoding_genes']: + if config["filter_noncoding_genes"]: col1 = extra_feature_df.columns.values[0] col2 = extra_feature_df.columns.values[1] - coding = mapping.loc[mapping['coding'] == 'coding', ['gene_name']] - coding_genes = list(set(coding['gene_name'].to_list())) - extra_feature_df = extra_feature_df[extra_feature_df[col1].isin(coding_genes) & extra_feature_df[col2].isin(coding_genes)] + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) + extra_feature_df = extra_feature_df[ + extra_feature_df[col1].isin(coding_genes) + & extra_feature_df[col2].isin(coding_genes) + ] genes = set(extra_feature_df.iloc[:, :2].values.flatten().tolist()) all_valid_ids = all_valid_ids.union(genes) all_valid_ids = list(all_valid_ids) all_valid_ids.sort() - log.info(f'total number of valid ids: {len(all_valid_ids)}') + log.info(f"total number of valid ids: {len(all_valid_ids)}") # filter out columns that are not in all_valid_ids for i in data_dict: @@ -143,10 +152,10 @@ def get_data_dict(config, min_sample_count=15): selected_columns = cur_data.columns.intersection(all_valid_ids) cur_data = cur_data[selected_columns] # it is possible the entire column is nan, remove it - cur_data = cur_data.dropna(axis=1, how='all') + cur_data = cur_data.dropna(axis=1, how="all") data_dict[i] = cur_data - log.info(f'{i} -- ') - log.info(f' samples x ids: {cur_data.shape}') + log.info(f"{i} -- ") + log.info(f" samples x ids: {cur_data.shape}") return data_dict, all_valid_ids @@ -187,11 +196,14 @@ def get_node_edge(edge_list): # remove duplidated rows in edge_list edge_list = edge_list.drop_duplicates() n_edge = len(edge_list) - nodes = set(edge_list.iloc[:,0].to_list()) | set(edge_list.iloc[:,1].to_list()) - return(dict(n_node = len(nodes), n_edge = n_edge, - edge_per_node = n_edge / len(nodes), - nodes = list(nodes), - edges = edge_list)) + nodes = set(edge_list.iloc[:, 0].to_list()) | set(edge_list.iloc[:, 1].to_list()) + return dict( + n_node=len(nodes), + n_edge=n_edge, + edge_per_node=n_edge / len(nodes), + nodes=list(nodes), + edges=edge_list, + ) def get_node_edge_overlap(network_info): @@ -208,199 +220,206 @@ def get_node_edge_overlap(network_info): overlap : dict A dictionary with the node and edge overlap between networks. 
""" - networks = pd.DataFrame(columns = ['name', 'type', 'n_node', - 'n_edge', 'edge_per_node', - 'nodes', 'edges']) + networks = pd.DataFrame( + columns=["name", "type", "n_node", "n_edge", "edge_per_node", "nodes", "edges"] + ) for _, row in network_info.iterrows(): - network_name = row['name'] - network_type = row['type'] - network_el = row['el'] + network_name = row["name"] + network_type = row["type"] + network_el = row["el"] res = get_node_edge(network_el) - cur_df = pd.DataFrame({'name': [network_name], - 'type': [network_type], - 'n_node': [int(res['n_node'])], - 'n_edge': [int(res['n_edge'])], - 'edge_per_node': [res['edge_per_node']], - 'nodes': [res['nodes']], - 'edges': [res['edges']]}) + cur_df = pd.DataFrame( + { + "name": [network_name], + "type": [network_type], + "n_node": [int(res["n_node"])], + "n_edge": [int(res["n_edge"])], + "edge_per_node": [res["edge_per_node"]], + "nodes": [res["nodes"]], + "edges": [res["edges"]], + } + ) networks = pd.concat([networks, cur_df], ignore_index=True) # overlap of nodes and edges overlap = {} # node overlap - target = 'FunMap' + target = "FunMap" cur_res = {} - target_node_set = set(networks.loc[networks['name'] == target, - 'nodes'].tolist()[0]) + target_node_set = set(networks.loc[networks["name"] == target, "nodes"].tolist()[0]) target_size = len(target_node_set) for _, row in networks.iterrows(): - if row['name'] == target: + if row["name"] == target: continue - cur_node_set = set(row['nodes']) + cur_node_set = set(row["nodes"]) cur_size = len(cur_node_set) overlap_size = len(target_node_set & cur_node_set) - cur_res[row['name']] = tuple([ - target_size - overlap_size, - cur_size - overlap_size, - overlap_size]) + cur_res[row["name"]] = tuple( + [target_size - overlap_size, cur_size - overlap_size, overlap_size] + ) - overlap['node'] = cur_res + overlap["node"] = cur_res # edge overlap cur_res = {} - target_edge_df = networks.loc[networks['name'] == target, 'edges'].tolist()[0] - target_edge_set = set(tuple(sorted(x)) for x in zip(target_edge_df.pop(0), - target_edge_df.pop(1))) + target_edge_df = networks.loc[networks["name"] == target, "edges"].tolist()[0] + target_edge_set = set( + tuple(sorted(x)) for x in zip(target_edge_df.pop(0), target_edge_df.pop(1)) + ) target_size = len(target_edge_set) for _, row in networks.iterrows(): - if row['name'] == target: + if row["name"] == target: continue - edge_df = row['edges'] + edge_df = row["edges"] edges = [tuple(sorted(x)) for x in zip(edge_df.pop(0), edge_df.pop(1))] cur_edge_set = set(edges) cur_size = len(cur_edge_set) overlap_size = len(target_edge_set & cur_edge_set) - cur_res[row['name']] = tuple([ - target_size - overlap_size, - cur_size - overlap_size, - overlap_size]) + cur_res[row["name"]] = tuple( + [target_size - overlap_size, cur_size - overlap_size, overlap_size] + ) - overlap['edge'] = cur_res + overlap["edge"] = cur_res return overlap + def cleanup_experiment(config_file): cfg = get_config(config_file) - results_dir = Path(cfg['results_dir']) + results_dir = Path(cfg["results_dir"]) shutil.rmtree(results_dir, ignore_errors=True) + def setup_experiment(config_file): cfg = get_config(config_file) - results_dir = Path(cfg['results_dir']) + results_dir = Path(cfg["results_dir"]) # create folders - folder_dict = cfg['subdirs'] - folders = [results_dir / Path(folder_dict[folder_name]) for folder_name in folder_dict] + folder_dict = cfg["subdirs"] + folders = [ + results_dir / Path(folder_dict[folder_name]) for folder_name in folder_dict + ] for folder in folders: 
folder.mkdir(parents=True, exist_ok=True) # save configuration to results folder - with open(str( results_dir / 'config.yml'), 'w') as fh: + with open(str(results_dir / "config.yml"), "w") as fh: yaml.dump(cfg, fh, sort_keys=False) return cfg + def get_config(cfg_file: Path): cfg = { - 'task': 'protein_func', - 'results_dir': 'results', + "task": "protein_func", + "results_dir": "results", # the following directories are relative to the results_dir - 'subdirs': { - 'saved_model_dir': 'saved_models', - 'saved_data_dir': 'saved_data', - 'saved_predictions_dir': 'saved_predictions', - 'figure_dir': 'figures', - 'network_dir': 'networks', + "subdirs": { + "saved_model_dir": "saved_models", + "saved_data_dir": "saved_data", + "saved_predictions_dir": "saved_predictions", + "figure_dir": "figures", + "network_dir": "networks", }, - 'seed': 42, - 'feature_type': 'cc', - 'test_size': 0.2, - 'ml_type': 'xgboost', - 'gs_file': None, - 'extra_feature_file': None, + "seed": 42, + "feature_type": "cc", + "test_size": 0.2, + "ml_type": "xgboost", + "gs_file": None, + "extra_feature_file": None, # 'filter_before_prediction': True, # 'min_feature_count': 1, - 'min_sample_count': 20, - 'filter_noncoding_genes': False, + "min_sample_count": 20, + "filter_noncoding_genes": False, # 'filter_after_prediction': True, # 'filter_criterion': 'max', # 'filter_threshold': 0.95, # 'filter_blacklist': False, - 'n_jobs': os.cpu_count(), - 'start_edge_num': 1000, - 'max_num_edges': 250000, - 'step_size': 1000, - 'lr_cutoff': 50, + "n_jobs": os.cpu_count(), + "start_edge_num": 1000, + "max_num_edges": 250000, + "step_size": 1000, + "lr_cutoff": 50, } - with open(cfg_file, 'r') as fh: + with open(cfg_file, "r") as fh: cfg_dict = yaml.load(fh, Loader=yaml.FullLoader) # use can change the following parameters in the config file - if 'task' in cfg_dict: - cfg['task'] = cfg_dict['task'] - assert cfg['task'] in ['protein_func', 'kinase_func'] + if "task" in cfg_dict: + cfg["task"] = cfg_dict["task"] + assert cfg["task"] in ["protein_func", "kinase_func"] - if 'seed' in cfg_dict: - cfg['seed'] = cfg_dict['seed'] + if "seed" in cfg_dict: + cfg["seed"] = cfg_dict["seed"] - if 'feature_type' in cfg_dict: - cfg['feature_type'] = cfg_dict['feature_type'] - assert cfg['feature_type'] in ['cc', 'mr'] + if "feature_type" in cfg_dict: + cfg["feature_type"] = cfg_dict["feature_type"] + assert cfg["feature_type"] in ["cc", "mr"] - if 'extra_feature_file' in cfg_dict: - cfg['extra_feature_file'] = cfg_dict['extra_feature_file'] + if "extra_feature_file" in cfg_dict: + cfg["extra_feature_file"] = cfg_dict["extra_feature_file"] - if 'gs_file' in cfg_dict: - cfg['gs_file'] = cfg_dict['gs_file'] + if "gs_file" in cfg_dict: + cfg["gs_file"] = cfg_dict["gs_file"] - if 'min_sample_count' in cfg_dict: - cfg['min_sample_count'] = cfg_dict['min_sample_count'] + if "min_sample_count" in cfg_dict: + cfg["min_sample_count"] = cfg_dict["min_sample_count"] - if 'n_jobs' in cfg_dict: - cfg['n_jobs'] = cfg_dict['n_jobs'] + if "n_jobs" in cfg_dict: + cfg["n_jobs"] = cfg_dict["n_jobs"] - if 'start_edge_num' in cfg_dict: - cfg['start_edge_num'] = cfg_dict['start_edge_num'] + if "start_edge_num" in cfg_dict: + cfg["start_edge_num"] = cfg_dict["start_edge_num"] - if 'max_num_edges' in cfg_dict: - cfg['max_num_edges'] = cfg_dict['max_num_edges'] + if "max_num_edges" in cfg_dict: + cfg["max_num_edges"] = cfg_dict["max_num_edges"] - if 'step_size' in cfg_dict: - cfg['step_size'] = cfg_dict['step_size'] + if "step_size" in cfg_dict: + cfg["step_size"] = 
cfg_dict["step_size"] - if 'lr_cutoff' in cfg_dict: - cfg['lr_cutoff'] = cfg_dict['lr_cutoff'] + if "lr_cutoff" in cfg_dict: + cfg["lr_cutoff"] = cfg_dict["lr_cutoff"] - if 'name' in cfg_dict: - cfg['name'] = cfg_dict['name'] + if "name" in cfg_dict: + cfg["name"] = cfg_dict["name"] else: - raise ValueError('name not specified in config file') + raise ValueError("name not specified in config file") - if 'data_path' in cfg_dict: - cfg['data_path'] = cfg_dict['data_path'] + if "data_path" in cfg_dict: + cfg["data_path"] = cfg_dict["data_path"] else: - raise ValueError('data_path not specified in config file') + raise ValueError("data_path not specified in config file") - if cfg['task'] == 'protein_func': - if 'filter_noncoding_genes' in cfg_dict: - cfg['filter_noncoding_genes'] = cfg_dict['filter_noncoding_genes'] + if cfg["task"] == "protein_func": + if "filter_noncoding_genes" in cfg_dict: + cfg["filter_noncoding_genes"] = cfg_dict["filter_noncoding_genes"] else: # ignore filter_noncoding_genes for kinase_func - cfg['filter_noncoding_genes'] = False - log.info('ignoring filter_noncoding_genes for kinase_func') + cfg["filter_noncoding_genes"] = False + log.info("ignoring filter_noncoding_genes for kinase_func") - if 'results_dir' in cfg_dict: - cfg['results_dir'] = cfg_dict['results_dir'] + '/' + cfg_dict['name'] + if "results_dir" in cfg_dict: + cfg["results_dir"] = cfg_dict["results_dir"] + "/" + cfg_dict["name"] - if 'data_files' not in cfg_dict: - raise ValueError('data_files not specified in config file') + if "data_files" not in cfg_dict: + raise ValueError("data_files not specified in config file") # Check all files listed under data_files are also in the tar.gz file - data_files = cfg_dict['data_files'] + data_files = cfg_dict["data_files"] # List all the files in the tar.gz file - with tarfile.open(cfg['data_path'], "r:gz") as tar: + with tarfile.open(cfg["data_path"], "r:gz") as tar: tar_files = {Path(file).name for file in tar.getnames()} # Check if all files in data_files are in tar_files - if not all(file['path'] in tar_files for file in data_files): - print('Files listed under data_files are not in the tar.gz file!') - raise ValueError('Files listed under data_files are not in the tar.gz file!') + if not all(file["path"] in tar_files for file in data_files): + print("Files listed under data_files are not in the tar.gz file!") + raise ValueError("Files listed under data_files are not in the tar.gz file!") - cfg['data_files'] = cfg_dict['data_files'] + cfg["data_files"] = cfg_dict["data_files"] - if 'rp_pairs' in cfg_dict: - cfg['rp_pairs'] = cfg_dict['rp_pairs'] + if "rp_pairs" in cfg_dict: + cfg["rp_pairs"] = cfg_dict["rp_pairs"] return cfg @@ -412,9 +431,9 @@ def check_gold_standard_file(file_path, min_count=10000): """ try: - with open(file_path, 'r', newline='') as tsv_file: + with open(file_path, "r", newline="") as tsv_file: dialect = csv.Sniffer().sniff(tsv_file.read(2048)) - if dialect.delimiter != '\t': + if dialect.delimiter != "\t": print("Error: Incorrect TSV format. 
TSV files should be tab-separated.") return False except csv.Error as e: @@ -423,12 +442,16 @@ def check_gold_standard_file(file_path, min_count=10000): # Check data format and Class values class_values = [] - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.reader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.reader(tsv_file, delimiter="\t") next(reader) # Skip header - for row_num, row in enumerate(reader, start=2): # Add row_num for better error reporting + for row_num, row in enumerate( + reader, start=2 + ): # Add row_num for better error reporting if len(row) != 3: # Assuming each row should have 3 columns - log.error(f'Invalid row format in row {row_num}. Each row should have 3 columns.') + log.error( + f"Invalid row format in row {row_num}. Each row should have 3 columns." + ) return False class_value = row[2].strip() if not class_value.isdigit() or int(class_value) not in (0, 1): @@ -442,14 +465,16 @@ def check_gold_standard_file(file_path, min_count=10000): lesser_count = min(count_0, count_1) if lesser_count < min_count: - log.error(f"The lesser of 0 and 1 occurrences ({lesser_count}) does not meet the threshold. " - f"Expected at least {min_count}.") + log.error( + f"The lesser of 0 and 1 occurrences ({lesser_count}) does not meet the threshold. " + f"Expected at least {min_count}." + ) return False return True -def check_extra_feature_file(file_path, missing_value='NA'): +def check_extra_feature_file(file_path, missing_value="NA"): """ Notes ----- @@ -465,9 +490,9 @@ def check_extra_feature_file(file_path, missing_value='NA'): The TSV file should be tab-separated. """ try: - with open(file_path, 'r', newline='') as tsv_file: + with open(file_path, "r", newline="") as tsv_file: dialect = csv.Sniffer().sniff(tsv_file.read(1024)) - if dialect.delimiter != '\t': + if dialect.delimiter != "\t": log.error("Incorrect TSV format. TSV files should be tab-separated.") return False except csv.Error: @@ -475,8 +500,8 @@ def check_extra_feature_file(file_path, missing_value='NA'): return False # Check header and number of columns - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.reader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.reader(tsv_file, delimiter="\t") header = next(reader, None) if header is None: log.error("The TSV file must have a header row.") @@ -488,12 +513,16 @@ def check_extra_feature_file(file_path, missing_value='NA'): return False # Check data type consistency for additional columns - with open(file_path, 'r', newline='') as tsv_file: - reader = csv.DictReader(tsv_file, delimiter='\t') + with open(file_path, "r", newline="") as tsv_file: + reader = csv.DictReader(tsv_file, delimiter="\t") column_data_types = {} - for row_num, row in enumerate(reader, start=2): # Add row_num for better error reporting + for row_num, row in enumerate( + reader, start=2 + ): # Add row_num for better error reporting for column_name, value in row.items(): - if column_name not in header[:2]: # Skip the first two columns (Protein_1 and Protein_2) + if ( + column_name not in header[:2] + ): # Skip the first two columns (Protein_1 and Protein_2) if value == missing_value: continue try: @@ -501,18 +530,26 @@ def check_extra_feature_file(file_path, missing_value='NA'): if float_value.is_integer(): value = int(float_value) except ValueError: - log.error("Invalid data type in row %d, column '%s'. 
" - "The value '%s' should be either an integer, a float, or '%s' (missing value).", - row_num, column_name, value, missing_value) + log.error( + "Invalid data type in row %d, column '%s'. " + "The value '%s' should be either an integer, a float, or '%s' (missing value).", + row_num, + column_name, + value, + missing_value, + ) return False # Store the data type of each column (float or integer) - data_type = float if '.' in value else int + data_type = float if "." in value else int if column_name not in column_data_types: column_data_types[column_name] = data_type elif column_data_types[column_name] != data_type: - log.error("Inconsistent data type in column '%s'. " - "Expected a consistent data type (integer, float, or '%s') for all rows.", - column_name, missing_value) + log.error( + "Inconsistent data type in column '%s'. " + "Expected a consistent data type (integer, float, or '%s') for all rows.", + column_name, + missing_value, + ) return False return True From b22c47efd5deefba4e241c47595e292901d37171 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:32:06 -0500 Subject: [PATCH 10/54] fix extra_file path --- funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/utils.py b/funmap/utils.py index 9bebb567..cc00a09c 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -130,7 +130,7 @@ def get_data_dict(config, min_sample_count=15): if config["extra_feature_file"] is not None: log.info("Importing extra feature file") # TODO: Import extra feature file - extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df = pd.read_csv(config["extra_feature_file"], sep="\t") if config["filter_noncoding_genes"]: col1 = extra_feature_df.columns.values[0] col2 = extra_feature_df.columns.values[1] From 262d580e59957ee2b4552191ec3b615edae23fe1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 11:53:33 -0500 Subject: [PATCH 11/54] fix function names problem --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index d07f3369..f775d241 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -31,7 +31,7 @@ log = setup_logger(__name__) -def get_valid_gs_dat( +def get_valid_gs_data( gs_path: str, valid_gene_list: List[str], md5=None ): # TODO: Add error message if there is no overlap between valid_gene_list and the gold standard set log.info(f'Loading gold standard feature file "{gs_path}" ...') From dbc14796b24c01b86c1986ba7621680740a4640d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 12:11:39 -0500 Subject: [PATCH 12/54] Use df for merging extra features --- funmap/funmap.py | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index f775d241..88d2750d 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,37 +327,11 @@ def extract_features( axis=1, result_type="expand", ) - log.info(str(feature_df.head())) - log.info("Extra feature:\n" + str(extra_feature_df.head())) - old_feature_count = feature_df.shape[0] - - # Get gene indices for P1 and P2 - # df.reset_index(drop=True, inplace=True) - log.info("df head") - log.info(str(df.head())) - # gene_to_index = {gene.astype(str): idx for idx, gene in enumerate(gene_ids)} - # p1_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 0]]) - # p2_indices = np.array([gene_to_index.get(gene, -1) for gene in df.iloc[:, 1]]) - # - # f_dataset = h5_file[dataset] - # f_values = 
np.empty(len(df), dtype=float) - # valid_indices = (p1_indices != -1) & (p2_indices != -1) - # - # linear_indices_func = ( - # lambda row_indices, col_indices, n: np.array(col_indices) - # - np.array(row_indices) - # + (2 * n - np.array(row_indices) + 1) * np.array(row_indices) // 2 - # ) - # linear_indices = linear_indices_func( - # p1_indices[valid_indices], p2_indices[valid_indices], len(gene_ids) - # ) - feature_df = pd.merge( - feature_df, extra_feature_df, on=["P1", "P2"], how="outer" + merged_df = pd.merge( + df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) - f_values[valid_indices] = f_dataset[:][linear_indices] - f_values[~valid_indices] = np.nan - log.info("Merged Data Frame:\n" + str(feature_df.head())) - log.info(f"Added gene pairs: {feature_df.shape[0] - old_feature_count}") + merged_df = merged_df.drop(columns=["P1", "P2", "label"]) + feature_df = pd.concat([feature_df, merged_df], axis=1) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ From 1fc6f15b5e8d8cbb733074cc119315a34b350f68 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 12:14:24 -0500 Subject: [PATCH 13/54] don't drop missing column --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 88d2750d..0fc6da97 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -330,7 +330,7 @@ def extract_features( merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) - merged_df = merged_df.drop(columns=["P1", "P2", "label"]) + merged_df = merged_df.drop(columns=["P1", "P2"]) feature_df = pd.concat([feature_df, merged_df], axis=1) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: From cfdf84d8a7128c3459f73ef8249ec7e358457ed4 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 13:18:52 -0500 Subject: [PATCH 14/54] diagnostic logging more diagnostics log final dataframe show nan rows track df size --- funmap/funmap.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 0fc6da97..2159feba 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,17 +327,38 @@ def extract_features( axis=1, result_type="expand", ) + df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) merged_df = merged_df.drop(columns=["P1", "P2"]) + log.info("Merged_df") + log.info(str(merged_df.head())) + non_nan_values_exist = merged_df.notna().any().any() + + if non_nan_values_exist: + print("There are non-NaN values in the merged_df.") + else: + print("All values in merged_df are NaN.") + + log.info("before merge") + log.info(str(feature_df.head())) + log.info(feature_df.size) feature_df = pd.concat([feature_df, merged_df], axis=1) + log.info("After merge") + log.info(str(feature_df.head())) + log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ [col for col in feature_df.columns if col != "label"] + ["label"] ] + nan_rows = feature_df[feature_df["label"].isna()] + print("NAN ROWS:") + print(nan_rows) + log.info("Final DF") + log.info(str(feature_df.head())) return feature_df From 62ba8b975208be4e09438c2d34baa83cb9d180be Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 14:48:30 -0500 Subject: [PATCH 15/54] Remove duplicate gene 
pairs in extra_features row logging log fixes --- funmap/funmap.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/funmap/funmap.py b/funmap/funmap.py index 2159feba..3360a6f5 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,6 +327,12 @@ def extract_features( axis=1, result_type="expand", ) + extra_feature_df = extra_feature_df[ + extra_feature_df.duplicated(subset=["P1", "P2"], keep="first") + ] + log.info( + f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + ) df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" @@ -346,6 +352,9 @@ def extract_features( log.info(feature_df.size) feature_df = pd.concat([feature_df, merged_df], axis=1) log.info("After merge") + log.info( + f"Row count for merge_df: {merge_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + ) log.info(str(feature_df.head())) log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists From 5691803951b8183e1c28654e3bde6c688031f615 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:05:15 -0500 Subject: [PATCH 16/54] use merge instead of concat --- funmap/funmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 3360a6f5..d6bc04ac 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -350,10 +350,10 @@ def extract_features( log.info("before merge") log.info(str(feature_df.head())) log.info(feature_df.size) - feature_df = pd.concat([feature_df, merged_df], axis=1) + feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) log.info("After merge") log.info( - f"Row count for merge_df: {merge_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" + f"Row count for merge_df: {merged_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" ) log.info(str(feature_df.head())) log.info(feature_df.size) From 28b76b3221f2d922318fb9ac6274cf562169cb2f Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:23:33 -0500 Subject: [PATCH 17/54] drop duplicates more explicitly --- funmap/funmap.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index d6bc04ac..9a5e9c6c 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -327,9 +327,9 @@ def extract_features( axis=1, result_type="expand", ) - extra_feature_df = extra_feature_df[ - extra_feature_df.duplicated(subset=["P1", "P2"], keep="first") - ] + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) log.info( f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" ) From e075c7519a2e6a81b0940b249002fcb8a0c9866b Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 15:31:36 -0500 Subject: [PATCH 18/54] remove extra logs --- funmap/funmap.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 9a5e9c6c..418aa8a3 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -315,9 +315,6 @@ def extract_features( if extra_feature is not None: log.info(f"Loading extra features from {extra_feature}") extra_feature_df = pd.read_csv(extra_feature, sep="\t") - log.info( - f"Extra Features: {len(extra_feature_df.columns) - 2}\nGene Pairs: {extra_feature_df.shape[0]}" - ) extra_feature_df.columns.values[0] = "P1" extra_feature_df.columns.values[1] = "P2" extra_feature_df[["P1", "P2"]] = 
extra_feature_df.apply( @@ -330,44 +327,17 @@ def extract_features( extra_feature_df = extra_feature_df.drop_duplicates( subset=["P1", "P2"], keep="last" ) - log.info( - f"Row count for df: {df.shape[0]}\nRow count for features: {feature_df.shape[0]}" - ) df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" ) merged_df = merged_df.drop(columns=["P1", "P2"]) - log.info("Merged_df") - log.info(str(merged_df.head())) - non_nan_values_exist = merged_df.notna().any().any() - - if non_nan_values_exist: - print("There are non-NaN values in the merged_df.") - else: - print("All values in merged_df are NaN.") - - log.info("before merge") - log.info(str(feature_df.head())) - log.info(feature_df.size) feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) - log.info("After merge") - log.info( - f"Row count for merge_df: {merged_df.shape[0]}\nRow count for features: {feature_df.shape[0]}" - ) - log.info(str(feature_df.head())) - log.info(feature_df.size) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = feature_df[ [col for col in feature_df.columns if col != "label"] + ["label"] ] - nan_rows = feature_df[feature_df["label"].isna()] - print("NAN ROWS:") - print(nan_rows) - - log.info("Final DF") - log.info(str(feature_df.head())) return feature_df From 49441acfda95bb14061a2c688c65427b55e6ebfc Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 16:11:42 -0500 Subject: [PATCH 19/54] Add LLR calculation and plotting for extra features --- funmap/funmap.py | 42 +++ funmap/plotting.py | 789 +++++++++++++++++++++++++++------------------ 2 files changed, 519 insertions(+), 312 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 418aa8a3..90076c9b 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -557,6 +557,12 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): return df +def extract_extra_features(all_pairs, ef_df): + # convert all_pairs to a dataframe + df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) + pd.merge(df, ef_df, on=["P1", "P2"], how="left") + + def dataset_llr( all_ids, feature_dict, @@ -566,6 +572,7 @@ def dataset_llr( max_num_edge, step_size, llr_dataset_file, + extra_feature, ): llr_ds = pd.DataFrame() all_ids_sorted = sorted(all_ids) @@ -605,6 +612,41 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info(f"Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + if extra_feature is not None: + log.info("Calculating LLR for extra features") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) + extra_feature_df = extract_extra_features( + all_pairs, extra_feature_df + ) # filter out unused pairs + features = extra_feature_df.columns.values[2:] + for f in features: + subset_df = extra_feature_df[["P1", "P2", f]] + subset_df.columns.values[-1] = "prediction" + log.info(f"Calculating llr for extra feature {f} ...") + cur_llr_res = compute_llr( + subset_df, + None, + start_edge_num, + max_num_edge, + step_size, + gs_test, + ) + cur_llr_res["dataset"] = f + "_EXTRAFEAT" + llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") return llr_ds diff --git a/funmap/plotting.py b/funmap/plotting.py index 3a46a970..6a01b74b 100644 --- a/funmap/plotting.py +++ b/funmap/plotting.py @@ -21,6 +21,7 @@ log = setup_logger(__name__) + def edge_number(x, pos): """ Formatter function to format the x-axis tick labels @@ -38,64 +39,84 @@ def edge_number(x, pos): The formatted string of the value. """ if x >= 1e6: - s = '{:1.1f}M'.format(x*1e-6) + s = "{:1.1f}M".format(x * 1e-6) elif x == 0: - s = '0' + s = "0" else: - s = '{:1.0f}K'.format(x*1e-3) + s = "{:1.0f}K".format(x * 1e-3) return s -def plot_llr_comparison(cfg, validation_results, llr_ds, output_file='llr_comparison.pdf'): - name_type_dict = {item['name']: item['type'] for item in cfg['data_files']} - name_type_dict['all_average'] = 'other' - datasets = sorted(llr_ds['dataset'].unique().tolist()) +def plot_llr_comparison( + cfg, validation_results, llr_ds, output_file="llr_comparison.pdf" +): + name_type_dict = {item["name"]: item["type"] for item in cfg["data_files"]} + name_type_dict["all_average"] = "other" + datasets = sorted(llr_ds["dataset"].unique().tolist()) fig, ax = plt.subplots(figsize=(20, 16)) for ds in datasets: start = -1 - cur_df = llr_ds[llr_ds['dataset'] == ds] - if name_type_dict[ds].upper() == 'RNA': - ltype = '--' - elif name_type_dict[ds].upper() == 'PROTEIN': - ltype = ':' + cur_df = llr_ds[llr_ds["dataset"] == ds] + if name_type_dict[ds].upper() == "RNA": + ltype = "--" + elif name_type_dict[ds].upper() == "PROTEIN": + ltype = ":" + elif ds.endswith("_EXTRAFEAT"): + ds = ds.replace("_EXTRAFEAT", "") + ltype = "-." 
else: - ltype = '-' - ax.plot(cur_df['k'], cur_df['llr'], linestyle=ltype, label=ds) + ltype = "-" + ax.plot(cur_df["k"], cur_df["llr"], linestyle=ltype, label=ds) if start == -1: - start = cur_df['k'].iloc[0] + start = cur_df["k"].iloc[0] # plot llr_res with the same start point for ft in validation_results: - llr_res = pd.read_csv(validation_results[ft]['llr_res_path'], sep='\t') - llr_res = llr_res[llr_res['k'] >= start] - ax.plot(llr_res['k'], llr_res['llr'], label=f'funmap_{ft}', linewidth=3) - - line_styles = [Line2D([0], [0], linestyle='--', color='black', label='RNA'), - Line2D([0], [0], linestyle=':', color='black', label='Protein'), - Line2D([0], [0], linestyle='-', color='black', label='Other')] - color_legend = ax.legend(handles=[Patch(color=line.get_color(), label=line.get_label()) for line in ax.lines], - bbox_to_anchor=(1.05, 1), title='Data type', fontsize=16, - bbox_transform=ax.transAxes) - linestyle_legend = ax.legend(handles=line_styles, - bbox_to_anchor=(1.05, 0.2), title='Model', fontsize=16, - bbox_transform=ax.transAxes) + llr_res = pd.read_csv(validation_results[ft]["llr_res_path"], sep="\t") + llr_res = llr_res[llr_res["k"] >= start] + ax.plot(llr_res["k"], llr_res["llr"], label=f"funmap_{ft}", linewidth=3) + + line_styles = [ + Line2D([0], [0], linestyle="--", color="black", label="RNA"), + Line2D([0], [0], linestyle=":", color="black", label="Protein"), + Line2D([0], [0], linestyle="-.", color="black", label="Extra Feature"), + Line2D([0], [0], linestyle="-", color="black", label="Other"), + ] + color_legend = ax.legend( + handles=[ + Patch(color=line.get_color(), label=line.get_label()) for line in ax.lines + ], + bbox_to_anchor=(1.05, 1), + title="Data type", + fontsize=16, + bbox_transform=ax.transAxes, + ) + linestyle_legend = ax.legend( + handles=line_styles, + bbox_to_anchor=(1.05, 0.2), + title="Model", + fontsize=16, + bbox_transform=ax.transAxes, + ) ax.add_artist(color_legend) ax.add_artist(linestyle_legend) ax.xaxis.set_major_formatter(edge_number) - ax.set_xlabel('number of pairs', fontsize=16) - ax.set_ylabel('log likelihood ratio', fontsize=16) - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax.set_xlabel("number of pairs", fontsize=16) + ax.set_ylabel("log likelihood ratio", fontsize=16) + ax.yaxis.grid(color="gainsboro", linestyle="dotted") plt.tight_layout() plt.box(on=None) - plt.savefig(output_file, bbox_inches='tight', bbox_extra_artists=[color_legend, linestyle_legend]) + plt.savefig( + output_file, + bbox_inches="tight", + bbox_extra_artists=[color_legend, linestyle_legend], + ) plt.close(fig) return output_file -def explore_data(data_config: Path, - min_sample_count: int, - output_dir: Path): +def explore_data(data_config: Path, min_sample_count: int, output_dir: Path): """ Generate plots to explore and visualize data @@ -122,9 +143,9 @@ def explore_data(data_config: Path, max_col_to_plot = 100 - log.info('Generating plots to explore and visualize data ...') + log.info("Generating plots to explore and visualize data ...") for ds in data_dict: - log.info(f'... {ds}') + log.info(f"... 
{ds}") data_df = data_dict[ds] fig, ax = plt.subplots(1, 2, figsize=(20, 5)) cur_data = data_df.T @@ -132,32 +153,34 @@ def explore_data(data_config: Path, if cur_data.shape[1] > max_col_to_plot: cur_data = cur_data.sample(max_col_to_plot, axis=1) ax[0].boxplot(cur_data) - ax[0].set_ylabel('expression') + ax[0].set_ylabel("expression") if data_df.shape[0] > max_col_to_plot: - ax[0].set_xlabel(f'random selected {max_col_to_plot} samples (total n={data_df.shape[0]})') + ax[0].set_xlabel( + f"random selected {max_col_to_plot} samples (total n={data_df.shape[0]})" + ) ax[0].set_xticklabels([]) ax[0].set_xticks([]) else: - ax[0].set_xlabel('sample') - ax[0].set_xticklabels(cur_data.columns, rotation=45, ha='right') + ax[0].set_xlabel("sample") + ax[0].set_xticklabels(cur_data.columns, rotation=45, ha="right") # density plot for each sample in each dataset for i in range(data_df.shape[0]): sns.kdeplot(data_df.iloc[i, :], linewidth=1, ax=ax[1]) - locator=MaxNLocator(60) + locator = MaxNLocator(60) ax[1].xaxis.set_major_locator(locator) - ax[1].set_xlabel('values') - ax[1].set_ylabel('density') + ax[1].set_xlabel("values") + ax[1].set_ylabel("density") ticks_loc = ax[1].get_xticks().tolist() ax[1].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha='right') + ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha="right") # set title for the figu - fig.suptitle(f'{ds}', fontsize=16) + fig.suptitle(f"{ds}", fontsize=16) fig.tight_layout() - cur_file_name = f'{ds}_sample_plot.pdf' + cur_file_name = f"{ds}_sample_plot.pdf" fig_names.append(cur_file_name) - log.info(f'Saving figure {cur_file_name} ...') + log.info(f"Saving figure {cur_file_name} ...") fig.savefig(output_dir / cur_file_name) plt.close(fig) data_keys.append(ds) @@ -166,52 +189,58 @@ def explore_data(data_config: Path, fig, ax = plt.subplots(figsize=(10, 5)) ax.boxplot(data) ax.set_xticklabels(data_keys, rotation=45) - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax.yaxis.grid(color="gainsboro", linestyle="dotted") - ax.set_xlabel('dataset') - ax.set_ylabel('median expression') + ax.set_xlabel("dataset") + ax.set_ylabel("median expression") plt.box(on=None) fig.tight_layout() - file_name = 'data_box_plot.pdf' + file_name = "data_box_plot.pdf" fig_names.append(file_name) - log.info(f'Saving figure {file_name} ...') + log.info(f"Saving figure {file_name} ...") fig.savefig(output_dir / file_name) plt.close(fig) # boxplot of the number of samples and genes sample_count = pd.DataFrame( - {'count':[data_dict[ds].shape[0] for ds in data_dict], - 'dataset': [ds for ds in data_dict]}) - gene_count = pd.DataFrame({ - 'count':[data_dict[ds].shape[1] for ds in data_dict], - 'dataset': [ds for ds in data_dict]}) + { + "count": [data_dict[ds].shape[0] for ds in data_dict], + "dataset": [ds for ds in data_dict], + } + ) + gene_count = pd.DataFrame( + { + "count": [data_dict[ds].shape[1] for ds in data_dict], + "dataset": [ds for ds in data_dict], + } + ) fig, ax = plt.subplots(1, 2, figsize=(10, 5)) - bars0 = ax[0].barh(sample_count['dataset'], sample_count['count'], color='#774FA0') - bars1 = ax[1].barh(gene_count['dataset'], gene_count['count'], color='#7DC462') + bars0 = ax[0].barh(sample_count["dataset"], sample_count["count"], color="#774FA0") + bars1 = ax[1].barh(gene_count["dataset"], gene_count["count"], color="#7DC462") - ax[0].spines['top'].set_visible(False) - ax[0].spines['left'].set_visible(False) - ax[0].spines['right'].set_visible(False) - 
ax[0].tick_params(axis='both', which='major', labelsize=12) - ax[0].bar_label(bars0, label_type='edge', fontsize=10) - ax[0].set_xlabel('number of samples') + ax[0].spines["top"].set_visible(False) + ax[0].spines["left"].set_visible(False) + ax[0].spines["right"].set_visible(False) + ax[0].tick_params(axis="both", which="major", labelsize=12) + ax[0].bar_label(bars0, label_type="edge", fontsize=10) + ax[0].set_xlabel("number of samples") - ax[1].spines['top'].set_visible(False) - ax[1].spines['left'].set_visible(False) - ax[1].spines['right'].set_visible(False) + ax[1].spines["top"].set_visible(False) + ax[1].spines["left"].set_visible(False) + ax[1].spines["right"].set_visible(False) ax[1].set_yticklabels([]) - ax[1].tick_params(axis='x', which='major', labelsize=12) - ax[1].tick_params(axis='y', which='both', left=False, right=False, labelleft=False) + ax[1].tick_params(axis="x", which="major", labelsize=12) + ax[1].tick_params(axis="y", which="both", left=False, right=False, labelleft=False) - ax[1].bar_label(bars1, label_type='edge', fontsize=10) - ax[1].set_xlabel('number of genes') + ax[1].bar_label(bars1, label_type="edge", fontsize=10) + ax[1].set_xlabel("number of genes") fig.tight_layout() - file_name = 'sample_gene_count.pdf' + file_name = "sample_gene_count.pdf" fig.savefig(output_dir / file_name) - log.info(f'Saving figure {file_name} ...') + log.info(f"Saving figure {file_name} ...") plt.close(fig) fig_names.append(file_name) return fig_names @@ -219,50 +248,66 @@ def explore_data(data_config: Path, def plot_results(cfg, validation_results, llr_ds, gs_dict, cutoff_llr, figure_dir): fig_names = [] - file_name = 'llr_compare_dataset.pdf' - plot_llr_comparison(cfg, validation_results, llr_ds, output_file=figure_dir / file_name) + file_name = "llr_compare_dataset.pdf" + plot_llr_comparison( + cfg, validation_results, llr_ds, output_file=figure_dir / file_name + ) fig_names.append(file_name) - if 'rp_pairs' in cfg: - file_names = plot_pair_llr(gs_dict, cfg['feature_type'], output_dir=figure_dir, - rp_pairs=cfg['rp_pairs']) + if "rp_pairs" in cfg: + file_names = plot_pair_llr( + gs_dict, + cfg["feature_type"], + output_dir=figure_dir, + rp_pairs=cfg["rp_pairs"], + ) fig_names.extend(file_names) - if cfg['task'] == 'protein_func': - file_name = 'llr_compare_networks.pdf' + if cfg["task"] == "protein_func": + file_name = "llr_compare_networks.pdf" # if gs_file is not specified, plot the llr comparison between networks # because some the numbers are pre-computed based on default gs_file - if cfg['gs_file'] is None: - plot_llr_compare_networks(validation_results, cfg['lr_cutoff'], cutoff_llr, - output_file=figure_dir / file_name) + if cfg["gs_file"] is None: + plot_llr_compare_networks( + validation_results, + cfg["lr_cutoff"], + cutoff_llr, + output_file=figure_dir / file_name, + ) fig_names.append(file_name) # the information about other networks is fixed for now other_network_info = { - 'name': ['BioGRID', 'BioPlex', 'HI-union', 'STRING'], - 'type': ['BioGRID', 'BioPlex', 'HI', 'STRING'], - 'url': ['https://figshare.com/ndownloader/files/39125054', - 'https://figshare.com/ndownloader/files/39125051', - 'https://figshare.com/ndownloader/files/39125093', - 'https://figshare.com/ndownloader/files/39125090' - ] + "name": ["BioGRID", "BioPlex", "HI-union", "STRING"], + "type": ["BioGRID", "BioPlex", "HI", "STRING"], + "url": [ + "https://figshare.com/ndownloader/files/39125054", + "https://figshare.com/ndownloader/files/39125051", + "https://figshare.com/ndownloader/files/39125093", + 
"https://figshare.com/ndownloader/files/39125090", + ], } # convert the info to a data frame where the url is read as a dataframe network_info = pd.DataFrame(other_network_info) - network_info['el'] = network_info['url'].apply(lambda x: pd.read_csv(x, - sep='\t', header=None)) - network_info = network_info.drop(columns=['url']) + network_info["el"] = network_info["url"].apply( + lambda x: pd.read_csv(x, sep="\t", header=None) + ) + network_info = network_info.drop(columns=["url"]) # for each funmap, create a dataframe for ft in validation_results: - edge_file_path = validation_results[ft]['edge_list_path'] - funmap_el = pd.read_csv(edge_file_path, sep='\t', header=None) - funmap_df = pd.DataFrame({'name': ['FunMap'], 'type': ['FunMap'], 'el': [funmap_el]}) + edge_file_path = validation_results[ft]["edge_list_path"] + funmap_el = pd.read_csv(edge_file_path, sep="\t", header=None) + funmap_df = pd.DataFrame( + {"name": ["FunMap"], "type": ["FunMap"], "el": [funmap_el]} + ) all_network_info = pd.concat([network_info, funmap_df], ignore_index=True) overlap_info = get_node_edge_overlap(all_network_info) - node_color, edge_color = '#7DC462', '#774FA0' - for (type, color) in zip(['node', 'edge'], [node_color, edge_color]): - fig_name = plot_overlap_venn(f'funmap_{ft}', overlap_info[type], type, color, figure_dir) + node_color, edge_color = "#7DC462", "#774FA0" + for type, color in zip(["node", "edge"], [node_color, edge_color]): + fig_name = plot_overlap_venn( + f"funmap_{ft}", overlap_info[type], type, color, figure_dir + ) fig_names.append(fig_name) fig_name = plot_network_stats(all_network_info, ft, figure_dir) @@ -297,34 +342,58 @@ def plot_1d_llr(ax, feature_df, feature_name, feature_type, data_type, n_bins): df = feature_df.loc[:, [feature_name]] cur_df = df.dropna() cur_df_vals = cur_df.values.reshape(-1) - clr = '#bcbddc' - data_range = {'CC': (-1, 1), 'MR': (0, 1)} - if data_type == 'PRO': - ax.hist(cur_df_vals, bins=n_bins, range=data_range[feature_type], color=clr, - orientation='horizontal', - density=True) - ax.text(0.95, 0.95, data_type, - verticalalignment='top', horizontalalignment='right', + clr = "#bcbddc" + data_range = {"CC": (-1, 1), "MR": (0, 1)} + if data_type == "PRO": + ax.hist( + cur_df_vals, + bins=n_bins, + range=data_range[feature_type], + color=clr, + orientation="horizontal", + density=True, + ) + ax.text( + 0.95, + 0.95, + data_type, + verticalalignment="top", + horizontalalignment="right", transform=ax.transAxes, rotation=-90, - color='black', fontsize=16) + color="black", + fontsize=16, + ) ax.set_ylim(data_range[feature_type]) - ax.set_xlim(0,2.5) + ax.set_xlim(0, 2.5) else: - ax.hist(cur_df_vals, bins=n_bins, range=data_range[feature_type], color=clr, - density=True) - ax.text(0.02, 0.9, data_type, - verticalalignment='top', horizontalalignment='left', + ax.hist( + cur_df_vals, + bins=n_bins, + range=data_range[feature_type], + color=clr, + density=True, + ) + ax.text( + 0.02, + 0.9, + data_type, + verticalalignment="top", + horizontalalignment="left", transform=ax.transAxes, - color='black', fontsize=16) - ax.set_ylabel('density', fontsize=16) + color="black", + fontsize=16, + ) + ax.set_ylabel("density", fontsize=16) ax.set_xlim(data_range[feature_type]) - ax.set_ylim(0,2.5) - ax.tick_params(axis='x', labelsize=16) - ax.tick_params(axis='y', labelsize=16) + ax.set_ylim(0, 2.5) + ax.tick_params(axis="x", labelsize=16) + ax.tick_params(axis="y", labelsize=16) -def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_feature, n_bins): +def 
plot_2d_llr( + ax, feature_df, feature_type, pair_name, rna_feature, pro_feature, n_bins +): """ Plots a 2D log likelihood ratio between two features in a scatter plot. @@ -351,48 +420,54 @@ def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_featur The mesh plot of the log likelihood ratio. """ - data_types = ['RNA', 'PRO'] - label_mapping = {'PRO': 'Protein', 'RNA': 'mRNA'} - feature_label_mapping = {'CC': 'correlation coefficient', - 'MR': 'mutual rank'} + data_types = ["RNA", "PRO"] + label_mapping = {"PRO": "Protein", "RNA": "mRNA"} + feature_label_mapping = {"CC": "correlation coefficient", "MR": "mutual rank"} cnt = {} cnt_pos_neg = {} max_density = -1 - data_range = {'CC': (-1, 1), 'MR': (0, 1)} + data_range = {"CC": (-1, 1), "MR": (0, 1)} for label in [0, 1]: - df = feature_df.loc[:, ['label', rna_feature, pro_feature]] + df = feature_df.loc[:, ["label", rna_feature, pro_feature]] df = df.dropna() - cur_df = df.loc[df['label'] == label, [rna_feature, pro_feature]] - hist_density, _, _ = np.histogram2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]]), - density=True) + cur_df = df.loc[df["label"] == label, [rna_feature, pro_feature]] + hist_density, _, _ = np.histogram2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + density=True, + ) max_density = max(max_density, np.max(hist_density)) for label in [0, 1]: - df = feature_df.loc[:, ['label', rna_feature, pro_feature]] + df = feature_df.loc[:, ["label", rna_feature, pro_feature]] df = df.dropna() - cur_df = df.loc[df['label'] == label, [rna_feature, pro_feature]] + cur_df = df.loc[df["label"] == label, [rna_feature, pro_feature]] cnt[label] = cur_df.shape[0] - hist, _, _ = np.histogram2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]])) + hist, _, _ = np.histogram2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + ) cnt_pos_neg[label] = hist - hh = ax.hist2d(cur_df[rna_feature].values, - cur_df[pro_feature].values, - bins=n_bins, - range=np.array([data_range[feature_type], - data_range[feature_type]]), - vmin=0, - vmax=max_density, - density=True) - - llr_vals = ((cnt_pos_neg[1] + 1)/(cnt_pos_neg[0] + cnt[0]/cnt[1]))/(cnt[1]/cnt[0]) - if feature_type == 'MR': + hh = ax.hist2d( + cur_df[rna_feature].values, + cur_df[pro_feature].values, + bins=n_bins, + range=np.array([data_range[feature_type], data_range[feature_type]]), + vmin=0, + vmax=max_density, + density=True, + ) + + llr_vals = ((cnt_pos_neg[1] + 1) / (cnt_pos_neg[0] + cnt[0] / cnt[1])) / ( + cnt[1] / cnt[0] + ) + if feature_type == "MR": vmin = np.percentile(llr_vals, 5) vmax = np.percentile(llr_vals, 95) symmetric_max = max(abs(vmin), abs(vmax)) @@ -402,50 +477,74 @@ def plot_2d_llr(ax, feature_df, feature_type, pair_name, rna_feature, pro_featur vmin = -4 vmax = 4 cmap = plt.cm.RdBu_r - fig = ax.pcolormesh(hh[1], hh[2], np.transpose(np.log(llr_vals)), vmin=vmin, vmax=vmax, - cmap=cmap) - ax.text(0.02, 0.01, pair_name, - verticalalignment='bottom', horizontalalignment='left', + fig = ax.pcolormesh( + hh[1], hh[2], np.transpose(np.log(llr_vals)), vmin=vmin, vmax=vmax, cmap=cmap + ) + ax.text( + 0.02, + 0.01, + pair_name, + verticalalignment="bottom", + 
horizontalalignment="left", transform=ax.transAxes, - color='gray', fontsize=24, fontweight='bold') - ax.set_xlabel(f'{label_mapping[data_types[0]]}\n{feature_label_mapping[feature_type]}', fontsize=16) - ax.set_ylabel(f'{label_mapping[data_types[1]]}\n{feature_label_mapping[feature_type]}', fontsize=16) - if feature_type == 'CC': + color="gray", + fontsize=24, + fontweight="bold", + ) + ax.set_xlabel( + f"{label_mapping[data_types[0]]}\n{feature_label_mapping[feature_type]}", + fontsize=16, + ) + ax.set_ylabel( + f"{label_mapping[data_types[1]]}\n{feature_label_mapping[feature_type]}", + fontsize=16, + ) + if feature_type == "CC": ax.set_xticks([-1, -0.5, 0, 0.5, 1]) ax.set_yticks([-1, -0.5, 0, 0.5, 1]) else: ax.set_xticks([0, 0.25, 0.5, 0.75, 1]) ax.set_yticks([0, 0.25, 0.5, 0.75, 1]) - ax.tick_params(axis='x', labelsize=16) - ax.tick_params(axis='y', labelsize=16) + ax.tick_params(axis="x", labelsize=16) + ax.tick_params(axis="y", labelsize=16) return fig def plot_pair_llr(gs_dict, feature_type, output_dir, rp_pairs): n_bins = 20 file_names = [] - feature_type = ['CC', 'MR'] + feature_type = ["CC", "MR"] # plot for each rna-protein pair using CC or MR as feature for rp_pair in rp_pairs: for ft in feature_type: ft = ft.upper() feature_df = gs_dict[ft] - fig, ax = plt.subplots(2, 2, figsize=(10, 10), - gridspec_kw={'width_ratios': [4, 1], - 'height_ratios': [1, 4]}) - rna_feature = rp_pair['rna'] - pro_feature = rp_pair['protein'] - plot_1d_llr(ax[0,0], feature_df, rna_feature, ft, 'RNA', n_bins) - ax[0, 0].xaxis.set_ticks_position('none') + fig, ax = plt.subplots( + 2, + 2, + figsize=(10, 10), + gridspec_kw={"width_ratios": [4, 1], "height_ratios": [1, 4]}, + ) + rna_feature = rp_pair["rna"] + pro_feature = rp_pair["protein"] + plot_1d_llr(ax[0, 0], feature_df, rna_feature, ft, "RNA", n_bins) + ax[0, 0].xaxis.set_ticks_position("none") ax[0, 0].set_xticklabels([]) - ax[0,1].axis('off') + ax[0, 1].axis("off") - heatmap2d = plot_2d_llr(ax[1,0], feature_df, ft, rp_pair['name'], - rna_feature, pro_feature, n_bins) + heatmap2d = plot_2d_llr( + ax[1, 0], + feature_df, + ft, + rp_pair["name"], + rna_feature, + pro_feature, + n_bins, + ) - plot_1d_llr(ax[1, 1], feature_df, pro_feature, ft, 'PRO', n_bins) - ax[1, 1].yaxis.set_ticks_position('none') + plot_1d_llr(ax[1, 1], feature_df, pro_feature, ft, "PRO", n_bins) + ax[1, 1].yaxis.set_ticks_position("none") ax[1, 1].set_yticklabels([]) # add colorbar to the right of the plot @@ -456,7 +555,7 @@ def plot_pair_llr(gs_dict, feature_type, output_dir, rp_pairs): plt.box(on=None) file_name = f"{rp_pair['name']}_rna_pro_{ft}_llr.pdf" file_names.append(file_name) - plt.savefig(output_dir / file_name, bbox_inches='tight') + plt.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_names @@ -466,98 +565,143 @@ def plot_llr_compare_networks(validaton_results, cutoff, cutoff_llr, output_file all_networks = [] for ft in validaton_results: - edge_list = pd.read_csv(validaton_results[ft]['edge_list_path'], sep='\t', header=None) + edge_list = pd.read_csv( + validaton_results[ft]["edge_list_path"], sep="\t", header=None + ) n_edge = len(edge_list) n_node = len(set(edge_list.iloc[:, 0]) | set(edge_list.iloc[:, 1])) - all_networks.extend([ - (f'FunMap_{ft}', 'FunMap', n_node, n_edge, cutoff_llr[ft], np.exp(cutoff_llr[ft])) - ]) + all_networks.extend( + [ + ( + f"FunMap_{ft}", + "FunMap", + n_node, + n_edge, + cutoff_llr[ft], + np.exp(cutoff_llr[ft]), + ) + ] + ) # these are pre-computed values all_networks.extend( - [ - # name, type, n, 
e, llr, lr - ('HuRI', 'HI', 8272, 52548, 2.3139014130648827, 10.11), - ('HI-union', 'HI', 9094, 64006, 2.298975841813893, 9.96), - ('ProHD', 'ProHD', 2680, 61580, 4.039348296, 56.78), - # this is combined_score_700 - ('STRING', 'STRING', 16351, 240314, 5.229377563059293, 186.676572955849), - ('BioGRID', 'BioGRID', 17259, 654490, 2.6524642147396182, 14.18896024552041), - ('BioPlex', 'BioPlex', 13854, 154428, 3.3329858940888375, 28.021887299660047) - ] + [ + # name, type, n, e, llr, lr + ("HuRI", "HI", 8272, 52548, 2.3139014130648827, 10.11), + ("HI-union", "HI", 9094, 64006, 2.298975841813893, 9.96), + ("ProHD", "ProHD", 2680, 61580, 4.039348296, 56.78), + # this is combined_score_700 + ("STRING", "STRING", 16351, 240314, 5.229377563059293, 186.676572955849), + ( + "BioGRID", + "BioGRID", + 17259, + 654490, + 2.6524642147396182, + 14.18896024552041, + ), + ( + "BioPlex", + "BioPlex", + 13854, + 154428, + 3.3329858940888375, + 28.021887299660047, + ), + ] ) log.info(all_networks) - cols = ['name', 'group', 'n', 'e', 'llr', 'lr'] + cols = ["name", "group", "n", "e", "llr", "lr"] network_data = pd.DataFrame(all_networks, columns=cols) - x = np.array(network_data['n']) - y = np.array(network_data['llr']) - e = np.array(network_data['e']) - z = network_data['name'] + x = np.array(network_data["n"]) + y = np.array(network_data["llr"]) + e = np.array(network_data["e"]) + z = network_data["name"] fig, ax = plt.subplots(figsize=(10, 10)) ax.set_axisbelow(True) - ax.xaxis.grid(color = 'gainsboro', linestyle = 'dotted') - ax.yaxis.grid(color = 'gainsboro', linestyle = 'dotted') - ax.get_ygridlines()[4].set_color('salmon') - ax.get_yticklabels()[4].set_color('red') + ax.xaxis.grid(color="gainsboro", linestyle="dotted") + ax.yaxis.grid(color="gainsboro", linestyle="dotted") + ax.get_ygridlines()[4].set_color("salmon") + ax.get_yticklabels()[4].set_color("red") ax2 = ax.twinx() # we have 6 groups, so we need 6 colors - mycmap = matplotlib.colors.ListedColormap(['#de2d26', '#8B6CAF', '#0D95D0', - '#69A953', '#F1C36B', '#DC6C43']) + mycmap = matplotlib.colors.ListedColormap( + ["#de2d26", "#8B6CAF", "#0D95D0", "#69A953", "#F1C36B", "#DC6C43"] + ) # group 0 is FunMap, group 1 is HI, group 2 is ProHD, group 3 is STRING, # group 4 is BioGRID, group 5 is BioPlex # the length of gro - color_group = [0] * len(validaton_results) + [1] * 2 + [2] * 1 + [3] * 1 + [4] * 1 + [5] * 1 - scatter = ax.scatter(x, y, c=color_group, cmap=mycmap, - s=e/1000) + color_group = ( + [0] * len(validaton_results) + [1] * 2 + [2] * 1 + [3] * 1 + [4] * 1 + [5] * 1 + ) + scatter = ax.scatter(x, y, c=color_group, cmap=mycmap, s=e / 1000) ax.set_ylim(2, 6) ax2.set_ylim(np.exp(2.0), np.exp(6)) - ax.set_xlabel('number of genes') + ax.set_xlabel("number of genes") ax.set_yticks([2.0, 2.5, 3, 3.5, np.log(cutoff), 4, 4.5, 5, 5.5, 6]) - ax.set_ylabel('log likelihood ratio') - ax.spines['top'].set_visible(False) - ax.spines['left'].set_visible(False) - ax.spines['right'].set_visible(False) - ax2.set_ylabel('likelihood ratio') - ax2.set_yscale('log', base=np.e) - ax2.set_yticks([np.exp(2), np.exp(2.5), np.exp(3), np.exp(3.5), - cutoff, np.exp(4), np.exp(4.5), np.exp(5), - np.exp(5.5), np.exp(6)]) - ax.tick_params(axis='x', labelsize=12) - ax.tick_params(axis='y', labelsize=12) - ax2.tick_params(axis='y', labelsize=12) - ax2.get_yticklabels()[4].set_color('red') + ax.set_ylabel("log likelihood ratio") + ax.spines["top"].set_visible(False) + ax.spines["left"].set_visible(False) + ax.spines["right"].set_visible(False) + ax2.set_ylabel("likelihood 
ratio") + ax2.set_yscale("log", base=np.e) + ax2.set_yticks( + [ + np.exp(2), + np.exp(2.5), + np.exp(3), + np.exp(3.5), + cutoff, + np.exp(4), + np.exp(4.5), + np.exp(5), + np.exp(5.5), + np.exp(6), + ] + ) + ax.tick_params(axis="x", labelsize=12) + ax.tick_params(axis="y", labelsize=12) + ax2.tick_params(axis="y", labelsize=12) + ax2.get_yticklabels()[4].set_color("red") ax2.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter()) - ax2.spines['top'].set_visible(False) - ax2.spines['left'].set_visible(False) - ax2.spines['right'].set_visible(False) + ax2.spines["top"].set_visible(False) + ax2.spines["left"].set_visible(False) + ax2.spines["right"].set_visible(False) - handles1, labels1 = scatter.legend_elements(prop='sizes', num=4, alpha=1, - fmt='{x:.0f} K') - legend1 = ax.legend(handles1, labels1, - loc='upper left', labelspacing=1.8, borderpad=1.0, - title='number of pairs', frameon=True) + handles1, labels1 = scatter.legend_elements( + prop="sizes", num=4, alpha=1, fmt="{x:.0f} K" + ) + legend1 = ax.legend( + handles1, + labels1, + loc="upper left", + labelspacing=1.8, + borderpad=1.0, + title="number of pairs", + frameon=True, + ) ax.add_artist(legend1) leg = ax.get_legend() for i in range(len(leg.legendHandles)): - leg.legendHandles[i].set_color('gray') + leg.legendHandles[i].set_color("gray") ax.xaxis.set_major_formatter(edge_number) for i, txt in enumerate(z): - if txt == 'STRING': - ax.annotate(txt, (x[i]-500, y[i]+0.18), color='gray', fontsize=10) - elif txt == 'HI-union': - ax.annotate(txt, (x[i]-100, y[i]+0.1), color='gray', fontsize=10) - elif txt == 'BioGRID': - ax.annotate(txt, (x[i]-800, y[i]-0.25), color='gray', fontsize=10) + if txt == "STRING": + ax.annotate(txt, (x[i] - 500, y[i] + 0.18), color="gray", fontsize=10) + elif txt == "HI-union": + ax.annotate(txt, (x[i] - 100, y[i] + 0.1), color="gray", fontsize=10) + elif txt == "BioGRID": + ax.annotate(txt, (x[i] - 800, y[i] - 0.25), color="gray", fontsize=10) else: - ax.annotate(txt, (x[i]-100, y[i]-0.2), color='gray', fontsize=10) + ax.annotate(txt, (x[i] - 100, y[i] - 0.2), color="gray", fontsize=10) fig.tight_layout() - fig.savefig(output_file, bbox_inches='tight') + fig.savefig(output_file, bbox_inches="tight") plt.close(fig) @@ -593,25 +737,29 @@ def plot_overlap_venn(network_name, overlap, node_or_edge, color, output_dir): max_area = max(map(sum, data)) def set_venn_scale(ax, true_area, reference_area=max_area): - s = np.sqrt(float(reference_area)/true_area) + s = np.sqrt(float(reference_area) / true_area) ax.set_xlim(-s, s) ax.set_ylim(-s, s) all_axes = [] n_plot = len(overlap) - fig, ax = plt.subplots(1, n_plot, figsize=(5*n_plot, 5)) + fig, ax = plt.subplots(1, n_plot, figsize=(5 * n_plot, 5)) for i, nw in enumerate(overlap): cur_ax = ax[i] all_axes.append(cur_ax) - labels = ('FunMap', nw) - out = venn2(overlap[nw], - set_labels=labels, alpha=1.0, - ax=cur_ax, set_colors=[color, 'white']) - venn2_circles(overlap[nw], ax=cur_ax, linestyle='solid', - color='gray', - linewidth=1) + labels = ("FunMap", nw) + out = venn2( + overlap[nw], + set_labels=labels, + alpha=1.0, + ax=cur_ax, + set_colors=[color, "white"], + ) + venn2_circles( + overlap[nw], ax=cur_ax, linestyle="solid", color="gray", linewidth=1 + ) if out.set_labels: for text in out.set_labels: text.set_fontsize(12) @@ -620,14 +768,14 @@ def set_venn_scale(ax, true_area, reference_area=max_area): text.set_fontsize(10) # add title to the figure - name = 'genes' if node_or_edge == 'node' else 'edges' - fig.suptitle(f'Overlap of {name} 
({network_name})', fontsize=16) + name = "genes" if node_or_edge == "node" else "edges" + fig.suptitle(f"Overlap of {name} ({network_name})", fontsize=16) for a, d in zip(flatten(ax), data): - set_venn_scale(a, sum(d)*1.5) + set_venn_scale(a, sum(d) * 1.5) - file_name = f'{network_name}_overlap_{node_or_edge}.pdf' - fig.savefig(output_dir / file_name, bbox_inches='tight') + file_name = f"{network_name}_overlap_{node_or_edge}.pdf" + fig.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_name @@ -638,17 +786,17 @@ def plot_network_stats(network_info, feature_type, output_dir): average_shortest_path = {} # these are pre-calculated since they take a long time to compute and # the network is fixed - average_shortest_path ={ - 'BioGRID': 2.74, - 'BioPlex': 3.60, - 'HI-union': 3.70, - 'STRING': 3.95 + average_shortest_path = { + "BioGRID": 2.74, + "BioPlex": 3.60, + "HI-union": 3.70, + "STRING": 3.95, } # if you want to recompute the average shortest path length, # add the network name to this list - network_list = ['FunMap'] + network_list = ["FunMap"] for n in network_list: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_density = nx.density(cur_network) density[n] = cur_density @@ -657,94 +805,111 @@ def plot_network_stats(network_info, feature_type, output_dir): cur_average_shortest_path = nx.average_shortest_path_length(cur_cc) average_shortest_path[n] = cur_average_shortest_path cur_degrees = [val for (_, val) in cur_network.degree()] - if n == 'FunMap': # only fit for FunMap + if n == "FunMap": # only fit for FunMap with warnings.catch_warnings(): warnings.simplefilter("ignore") - fit = powerlaw.Fit(cur_degrees, discrete=True, xmax=250, estimate_discrete=False) - powerlaw.plot_pdf(cur_degrees, linear_bins=True, linestyle='None', marker='o', - markerfacecolor='None', color='#de2d26', - linewidth=3, ax=ax[0]) + fit = powerlaw.Fit( + cur_degrees, discrete=True, xmax=250, estimate_discrete=False + ) + powerlaw.plot_pdf( + cur_degrees, + linear_bins=True, + linestyle="None", + marker="o", + markerfacecolor="None", + color="#de2d26", + linewidth=3, + ax=ax[0], + ) # not plotting the power law fit # fit.power_law.plot_pdf(linestyle='--',color='black', ax=ax[0]) # all the networks in network_info minus FunMap - other_networks = list(set(network_info['name'].tolist()) - set(['FunMap'])) + other_networks = list(set(network_info["name"].tolist()) - set(["FunMap"])) for n in other_networks: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_density = nx.density(cur_network) density[n] = cur_density - ax[0].set_xlabel('degree') - ax[0].set_ylabel('p(x)') - ax[0].spines['top'].set_visible(False) - ax[0].spines['right'].set_visible(False) - ax[0].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[0].set_xlabel("degree") + ax[0].set_ylabel("p(x)") + ax[0].spines["top"].set_visible(False) + ax[0].spines["right"].set_visible(False) + ax[0].yaxis.grid(color="gainsboro", linestyle="dotted") ax[0].set_axisbelow(True) # global average clustering coefficient # these are pre-calculated since they take a long time to compute and # the network is fixed - avg_cc = { - 'BioGRID': 0.125, - 'BioPlex': 0.103, - 'HI-union': 0.06, - 'STRING': 
0.335 - } + avg_cc = {"BioGRID": 0.125, "BioPlex": 0.103, "HI-union": 0.06, "STRING": 0.335} for n in network_list: - network_el = network_info.loc[network_info['name'] == n, 'el'].values[0] + network_el = network_info.loc[network_info["name"] == n, "el"].values[0] cur_network = nx.from_pandas_edgelist(network_el, source=0, target=1) cur_cc = nx.average_clustering(cur_network) avg_cc[n] = cur_cc - network_list = network_info['name'].tolist() - ax[1].bar(network_list, [avg_cc[i] for i in network_list], width=0.5, align='center', - color='#E4C89A') - ax[1].spines['left'].set_position(('outward', 8)) - ax[1].spines['bottom'].set_position(('outward', 5)) - ax[1].spines['top'].set_visible(False) - ax[1].spines['left'].set_visible(False) - ax[1].spines['right'].set_visible(False) - ax[1].set_ylabel('Average clustering coefficient') + network_list = network_info["name"].tolist() + ax[1].bar( + network_list, + [avg_cc[i] for i in network_list], + width=0.5, + align="center", + color="#E4C89A", + ) + ax[1].spines["left"].set_position(("outward", 8)) + ax[1].spines["bottom"].set_position(("outward", 5)) + ax[1].spines["top"].set_visible(False) + ax[1].spines["left"].set_visible(False) + ax[1].spines["right"].set_visible(False) + ax[1].set_ylabel("Average clustering coefficient") ticks_loc = ax[1].get_xticks() ax[1].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[1].set_xticklabels(network_list, rotation=45, ha='right') - ax[1].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[1].set_xticklabels(network_list, rotation=45, ha="right") + ax[1].yaxis.grid(color="gainsboro", linestyle="dotted") ax[1].set_axisbelow(True) - ax[2].bar(network_list, [density[i] for i in network_list], width=0.5, align='center', - color = '#D8B2C6' - ) - ax[2].spines['left'].set_position(('outward', 8)) - ax[2].spines['bottom'].set_position(('outward', 5)) - ax[2].spines['top'].set_visible(False) - ax[2].spines['left'].set_visible(False) - ax[2].spines['right'].set_visible(False) - ax[2].set_ylabel('Density') + ax[2].bar( + network_list, + [density[i] for i in network_list], + width=0.5, + align="center", + color="#D8B2C6", + ) + ax[2].spines["left"].set_position(("outward", 8)) + ax[2].spines["bottom"].set_position(("outward", 5)) + ax[2].spines["top"].set_visible(False) + ax[2].spines["left"].set_visible(False) + ax[2].spines["right"].set_visible(False) + ax[2].set_ylabel("Density") ticks_loc = ax[2].get_xticks() ax[2].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[2].set_xticklabels(network_list, rotation=45, ha='right') - ax[2].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[2].set_xticklabels(network_list, rotation=45, ha="right") + ax[2].yaxis.grid(color="gainsboro", linestyle="dotted") ax[2].set_axisbelow(True) - ax[3].bar(network_list, [average_shortest_path[i] for i in network_list], width=0.5, align='center', - color = '#B6D8A6' - ) - ax[3].spines['left'].set_position(('outward', 8)) - ax[3].spines['bottom'].set_position(('outward', 5)) - ax[3].spines['top'].set_visible(False) - ax[3].spines['left'].set_visible(False) - ax[3].spines['right'].set_visible(False) - ax[3].set_ylabel('Average shortest path length') + ax[3].bar( + network_list, + [average_shortest_path[i] for i in network_list], + width=0.5, + align="center", + color="#B6D8A6", + ) + ax[3].spines["left"].set_position(("outward", 8)) + ax[3].spines["bottom"].set_position(("outward", 5)) + ax[3].spines["top"].set_visible(False) + ax[3].spines["left"].set_visible(False) + 
ax[3].spines["right"].set_visible(False) + ax[3].set_ylabel("Average shortest path length") ticks_loc = ax[3].get_xticks() ax[3].xaxis.set_major_locator(mticker.FixedLocator(ticks_loc)) - ax[3].set_xticklabels(network_list, rotation=45, ha='right') - ax[3].yaxis.grid(color = 'gainsboro', linestyle = 'dotted') + ax[3].set_xticklabels(network_list, rotation=45, ha="right") + ax[3].yaxis.grid(color="gainsboro", linestyle="dotted") ax[3].set_axisbelow(True) - fig.suptitle(f'Network properties of Funmap ({feature_type})', fontsize=16) - file_name = f'funmap_{feature_type}_network_properties.pdf' - fig.savefig(output_dir / file_name, bbox_inches='tight') + fig.suptitle(f"Network properties of Funmap ({feature_type})", fontsize=16) + file_name = f"funmap_{feature_type}_network_properties.pdf" + fig.savefig(output_dir / file_name, bbox_inches="tight") plt.close(fig) return file_name @@ -778,12 +943,12 @@ def merge_and_delete(fig_dir, file_list, output_file): pdf_writer.add_outline_item(os.path.splitext(file)[0], total_page_num) total_page_num = total_page_num + cur_page_num - with open(fig_dir / output_file, 'wb') as fh: + with open(fig_dir / output_file, "wb") as fh: pdf_writer.write(fh) - log.info('figures have been merged.') + log.info("figures have been merged.") for filename in file_list: try: os.remove(fig_dir / filename) except: - log.error(f'{filename} could not be deleted.') + log.error(f"{filename} could not be deleted.") From 74448dad2c1e9a0689aa8f3caafccb8e63317ddc Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 16:26:30 -0500 Subject: [PATCH 20/54] Remove excess logging and fix llr parameters --- funmap/cli.py | 1 + funmap/funmap.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/cli.py b/funmap/cli.py index 8cd7cf34..31bd0336 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -312,6 +312,7 @@ def run(config_file, force_rerun): max_num_edges, step_size, llr_dataset_file, + extra_feature_file, ) log.info("Done.") else: diff --git a/funmap/funmap.py b/funmap/funmap.py index 90076c9b..73616190 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -313,7 +313,6 @@ def extract_features( # TODO: add extra features if provided if extra_feature is not None: - log.info(f"Loading extra features from {extra_feature}") extra_feature_df = pd.read_csv(extra_feature, sep="\t") extra_feature_df.columns.values[0] = "P1" extra_feature_df.columns.values[1] = "P2" From b885a86c7619f21a811ee629ae423abc0b01c4b4 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 19 Sep 2024 17:08:43 -0500 Subject: [PATCH 21/54] fix return value --- funmap/funmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 73616190..86cd393b 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -559,7 +559,7 @@ def extract_dataset_feature(all_pairs, feature_file, feature_type="cc"): def extract_extra_features(all_pairs, ef_df): # convert all_pairs to a dataframe df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) - pd.merge(df, ef_df, on=["P1", "P2"], how="left") + return pd.merge(df, ef_df, on=["P1", "P2"], how="left") def dataset_llr( From 077e066199c078cc8c853015948edd283b67d546 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 09:50:10 -0500 Subject: [PATCH 22/54] skip edge count check for extra features --- funmap/funmap.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 86cd393b..1ef957ba 100644 --- 
a/funmap/funmap.py +++ b/funmap/funmap.py @@ -419,12 +419,18 @@ def train_model(X, y, seed, n_jobs, feature_mapping, model_dir): def compute_llr( - predicted_all_pairs, llr_res_file, start_edge_num, max_num_edges, step_size, gs_test + predicted_all_pairs, + llr_res_file, + start_edge_num, + max_num_edges, + step_size, + gs_test, + is_extra_feat=False, ): # make sure max_num_edges is smaller than the number of non-NA values - assert max_num_edges < np.count_nonzero( + assert is_extra_feat or max_num_edges < np.count_nonzero( ~np.isnan(predicted_all_pairs.iloc[:, -1].values) - ), f"max_num_edges should be smaller than the number of non-NA values" + ), "max_num_edges should be smaller than the number of non-NA values" cur_col_name = "prediction" cur_results = predicted_all_pairs.nlargest(max_num_edges, cur_col_name) @@ -635,12 +641,7 @@ def dataset_llr( subset_df.columns.values[-1] = "prediction" log.info(f"Calculating llr for extra feature {f} ...") cur_llr_res = compute_llr( - subset_df, - None, - start_edge_num, - max_num_edge, - step_size, - gs_test, + subset_df, None, start_edge_num, max_num_edge, step_size, gs_test, True ) cur_llr_res["dataset"] = f + "_EXTRAFEAT" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) From 157f64c0cdd45de3009954b577dbe84e13869c24 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 10:17:10 -0500 Subject: [PATCH 23/54] check if dataset is extra feature before checking type --- funmap/plotting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/funmap/plotting.py b/funmap/plotting.py index 6a01b74b..57019304 100644 --- a/funmap/plotting.py +++ b/funmap/plotting.py @@ -58,13 +58,13 @@ def plot_llr_comparison( for ds in datasets: start = -1 cur_df = llr_ds[llr_ds["dataset"] == ds] - if name_type_dict[ds].upper() == "RNA": + if ds.endswith("_EXTRAFEAT"): + ds = ds.replace("_EXTRAFEAT", "") + ltype = "-." + elif name_type_dict[ds].upper() == "RNA": ltype = "--" elif name_type_dict[ds].upper() == "PROTEIN": ltype = ":" - elif ds.endswith("_EXTRAFEAT"): - ds = ds.replace("_EXTRAFEAT", "") - ltype = "-." else: ltype = "-" ax.plot(cur_df["k"], cur_df["llr"], linestyle=ltype, label=ds) From 998a418e2a767ec5011149d4ba0a0b446ff35687 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 20 Sep 2024 17:17:23 -0500 Subject: [PATCH 24/54] Add back LLR calculation for extra features --- funmap/funmap.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/funmap/funmap.py b/funmap/funmap.py index 6434a802..2a4952b7 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -644,7 +644,36 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info("Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - + if extra_feature is not None: + log.info("Calculating LLR for extra features") + extra_feature_df = pd.read_csv(extra_feature, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) + extra_feature_df = extract_extra_features( + all_pairs, extra_feature_df + ) # filter out unused pairs + features = extra_feature_df.columns.values[2:] + for f in features: + subset_df = extra_feature_df[["P1", "P2", f]] + subset_df.columns.values[-1] = "prediction" + log.info(f"Calculating llr for extra feature {f} ...") + cur_llr_res = compute_llr( + subset_df, None, start_edge_num, max_num_edge, step_size, gs_test, True + ) + cur_llr_res["dataset"] = f + "_EXTRAFEAT" + llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) + llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) + log.info(f"Calculating llr for {dataset} ... done") return llr_ds From f6bb066d64d6576c6d86b3b63dc5c78ba76f8b3d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 25 Sep 2024 14:10:58 -0500 Subject: [PATCH 25/54] Process extra feature file once instead of multiple times --- funmap/cli.py | 14 ++++++++++---- funmap/funmap.py | 45 +++++++++------------------------------------ funmap/utils.py | 28 +++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/funmap/cli.py b/funmap/cli.py index 0a7e7f40..3560eeb4 100644 --- a/funmap/cli.py +++ b/funmap/cli.py @@ -26,6 +26,7 @@ check_extra_feature_file, check_gold_standard_file, cleanup_experiment, + process_extra_feature, setup_experiment, ) @@ -120,6 +121,11 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) extra_feature_file = cfg["extra_feature_file"] + if extra_feature_file is not None: + log.info("Loading extra feature file into dataframe") + extra_feature_df = process_extra_feature(extra_feature_file) + else: + extra_feature_df = None # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): # return gs_file = cfg["gs_file"] @@ -192,7 +198,7 @@ def run(config_file, force_rerun): "mr_dict": mr_dict, "feature_type": feature_type, "gs_file": gs_file, - "extra_feature_file": extra_feature_file, + "extra_feature_df": extra_feature_df, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, @@ -255,7 +261,7 @@ def run(config_file, force_rerun): "ppi_feature": ppi_feature, "cc_dict": cc_dict, "mr_dict": mr_dict, - "extra_feature_file": extra_feature_file, + "extra_feature_df": extra_feature_df, "prediction_dir": prediction_dir, "output_file": predicted_all_pairs_file, "n_jobs": n_jobs, @@ -326,7 +332,7 @@ def run(config_file, force_rerun): max_num_edges, step_size, llr_dataset_file, - extra_feature_file, + extra_feature_df, ) log.info("Done.") else: @@ -356,7 +362,7 @@ def run(config_file, force_rerun): "feature_type": "cc", "gs_file": gs_file, # no extra feature for plotting - "extra_feature_file": None, + "extra_feature_df": None, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, diff --git a/funmap/funmap.py b/funmap/funmap.py index 2a4952b7..b0e405f7 100644 --- a/funmap/funmap.py +++ b/funmap/funmap.py @@ -23,7 
+23,6 @@ from funmap.data_urls import network_info from funmap.logger import setup_logger from funmap.utils import ( - check_extra_feature_file, get_data_dict, is_url_scheme, read_csv_with_md5_check, @@ -290,7 +289,7 @@ def get_1d_indices(i_array, j_array, n): def extract_features( - df, feature_type, cc_dict, ppi_feature=None, extra_feature=None, mr_dict=None + df, feature_type, cc_dict, ppi_feature=None, extra_feature_df=None, mr_dict=None ): """ extract_features - creates the final feature `pandas` dataframe used by xgboost @@ -326,20 +325,7 @@ def extract_features( ) # TODO: add extra features if provided - if extra_feature is not None: - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[1] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - extra_feature_df = extra_feature_df.drop_duplicates( - subset=["P1", "P2"], keep="last" - ) + if extra_feature_df is not None: df.reset_index(drop=True, inplace=True) merged_df = pd.merge( df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" @@ -506,7 +492,7 @@ def prepare_gs_data(**kwargs): gs_file = kwargs["gs_file"] gs_file_md5 = None feature_type = kwargs["feature_type"] - extra_feature_file = kwargs["extra_feature_file"] + extra_feature_df = kwargs["extra_feature_df"] valid_id_list = kwargs["valid_id_list"] test_size = kwargs["test_size"] seed = kwargs["seed"] @@ -530,10 +516,10 @@ def prepare_gs_data(**kwargs): else: ppi_feature = None gs_train_df = extract_features( - gs_train, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict ) gs_test_df = extract_features( - gs_test, feature_type, cc_dict, ppi_feature, extra_feature_file, mr_dict + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict ) # store both the ids with gs_test_df for later use @@ -599,7 +585,7 @@ def dataset_llr( max_num_edge, step_size, llr_dataset_file, - extra_feature, + extra_feature_df, ): llr_ds = pd.DataFrame() all_ids_sorted = sorted(all_ids) @@ -644,21 +630,8 @@ def dataset_llr( llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) log.info("Calculating llr for all datasets average ... 
done") llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - if extra_feature is not None: + if extra_feature_df is not None: log.info("Calculating LLR for extra features") - extra_feature_df = pd.read_csv(extra_feature, sep="\t") - extra_feature_df.columns.values[0] = "P1" - extra_feature_df.columns.values[1] = "P2" - extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( - lambda row: sorted([row["P1"], row["P2"]]) - if row["P1"] > row["P2"] - else [row["P1"], row["P2"]], - axis=1, - result_type="expand", - ) - extra_feature_df = extra_feature_df.drop_duplicates( - subset=["P1", "P2"], keep="last" - ) extra_feature_df = extract_extra_features( all_pairs, extra_feature_df ) # filter out unused pairs @@ -751,7 +724,7 @@ def predict_all_pairs( ppi_feature, cc_dict, mr_dict, - extra_feature_file, + extra_feature_df, prediction_dir, output_file, n_jobs=1, @@ -784,7 +757,7 @@ def process_and_save_chunk(start_idx, chunk_id): feature_type, cc_dict, cur_ppi_feature, - extra_feature_file, + extra_feature_df, mr_dict, ) predictions = model.predict_proba(feature_df) diff --git a/funmap/utils.py b/funmap/utils.py index cc00a09c..81bb68cb 100644 --- a/funmap/utils.py +++ b/funmap/utils.py @@ -1,14 +1,16 @@ import csv -import yaml +import hashlib import os -import tarfile import re -import hashlib +import shutil +import tarfile import urllib -from urllib.parse import urlparse from pathlib import Path +from urllib.parse import urlparse + import pandas as pd -import shutil +import yaml + from funmap.data_urls import misc_urls as urls from funmap.logger import setup_logger @@ -553,3 +555,19 @@ def check_extra_feature_file(file_path, missing_value="NA"): return False return True + + +def process_extra_feature(extra_feature_file) -> pd.DataFrame: + extra_feature_df = pd.read_csv(extra_feature_file, sep="\t") + extra_feature_df.columns.values[0] = "P1" + extra_feature_df.columns.values[1] = "P2" + extra_feature_df[["P1", "P2"]] = extra_feature_df.apply( + lambda row: sorted([row["P1"], row["P2"]]) + if row["P1"] > row["P2"] + else [row["P1"], row["P2"]], + axis=1, + result_type="expand", + ) + extra_feature_df = extra_feature_df.drop_duplicates( + subset=["P1", "P2"], keep="last" + ) From 2b1f63038467be6a1b82a1cc2fb98ae95556aab3 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 10:51:52 -0500 Subject: [PATCH 26/54] Migrate project structure to use maturin for rust file import --- .github/workflows/CI.yml | 181 +++++++++++++++++++++++++ .gitignore | 11 ++ Cargo.lock | 171 +++++++++++++++++++++++ Cargo.toml | 12 ++ pyproject.toml | 47 +++++++ pyproject_maturin.toml | 16 +++ {funmap => python/funmap}/__init__.py | 0 {funmap => python/funmap}/cli.py | 7 +- {funmap => python/funmap}/data_urls.py | 0 {funmap => python/funmap}/funmap.py | 0 {funmap => python/funmap}/logger.py | 0 {funmap => python/funmap}/plotting.py | 0 {funmap => python/funmap}/saving.py | 0 {funmap => python/funmap}/utils.py | 0 requirements-dev.lock | 116 ++++++++++++++++ requirements.lock | 115 ++++++++++++++++ src/lib.rs | 14 ++ 17 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/CI.yml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 pyproject.toml create mode 100644 pyproject_maturin.toml rename {funmap => python/funmap}/__init__.py (100%) rename {funmap => python/funmap}/cli.py (99%) rename {funmap => python/funmap}/data_urls.py (100%) rename {funmap => python/funmap}/funmap.py (100%) rename {funmap => python/funmap}/logger.py (100%) rename {funmap 
=> python/funmap}/plotting.py (100%) rename {funmap => python/funmap}/saving.py (100%) rename {funmap => python/funmap}/utils.py (100%) create mode 100644 requirements-dev.lock create mode 100644 requirements.lock create mode 100644 src/lib.rs diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 00000000..ad3ec43f --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,181 @@ +# This file is autogenerated by maturin v1.7.4 +# To update, run +# +# maturin generate-ci github +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + - runner: ubuntu-latest + target: s390x + - runner: ubuntu-latest + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-12 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: 
wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: "startsWith(github.ref, 'refs/tags/')" + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.gitignore b/.gitignore index 7ea9ea8f..eba216d4 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,14 @@ ENV/ .DS_Store results *old + +# python generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# venv +.venv diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..ded191fc --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,171 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "funmap_lib" +version = "0.1.0" +dependencies = [ + "pyo3", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "libc" +version = "0.2.159" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "portable-atomic" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d30538d42559de6b034bc76fd6dd4c38961b1ee5c6c56e3808c50128fdbc22ce" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + 
"portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..ecac49bc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "funmap_lib" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "funmap" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = "0.22.0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..d78d7f95 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +[project] +name = "funmap" +version = "0.2.0" +description = "generate gene co-function networks using omics data" +authors = [{ name = "Zhiao Shi", email = "zhiao.shi@gmail.com" }] +dependencies = [ + "PyPDF2==3.0.1", + "click==8.1.7", + "h5py==3.10.0", + "imbalanced-learn==0.11.0", + "joblib==1.3.2", + "matplotlib==3.7.3", + "matplotlib_venn==0.11.9", + "networkx==3.1", + "numpy==1.24.4", + "pandas==2.0.3", + "powerlaw==1.5", + "pyarrow==13.0.0", + "pyyaml==6.0.1", + "scikit-learn==1.3.2", + "scipy==1.10.1", + "seaborn==0.13.0", + "tables==3.8.0", + "tqdm==4.66.1", + "xgboost==2.0.0", +] +readme = "README.md" +license = { text = "MIT license" } +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: 
Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] + +[build-system] +requires = ["maturin>=1,<2"] +build-backend = "maturin" + +[tool.maturin] +features = ["pyo3/extension-module"] +python-source = "python" +module-name = "funmap._lib" + +[tool.rye] +managed = true +dev-dependencies = ["maturin>=1.7.4"] diff --git a/pyproject_maturin.toml b/pyproject_maturin.toml new file mode 100644 index 00000000..bb2585da --- /dev/null +++ b/pyproject_maturin.toml @@ -0,0 +1,16 @@ +[build-system] +requires = ["maturin>=1.7,<2.0"] +build-backend = "maturin" + +[project] +name = "funmap" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] +[tool.maturin] +features = ["pyo3/extension-module"] +python-source = "funmap" diff --git a/funmap/__init__.py b/python/funmap/__init__.py similarity index 100% rename from funmap/__init__.py rename to python/funmap/__init__.py diff --git a/funmap/cli.py b/python/funmap/cli.py similarity index 99% rename from funmap/cli.py rename to python/funmap/cli.py index 3560eeb4..09640f4f 100644 --- a/funmap/cli.py +++ b/python/funmap/cli.py @@ -6,7 +6,7 @@ import click import numpy as np import pandas as pd - +from funmap import _lib as funmap_lib from funmap import __version__ from funmap.data_urls import misc_urls as urls from funmap.funmap import ( @@ -86,6 +86,11 @@ def qc(config_file, force_rerun): log.info("QC complete") +@cli.command() +def rust(): + print(f"sum: {funmap_lib.sum_as_string(5, 10)}") + + @cli.command(help="run funmap") @click.option( "--config-file", diff --git a/funmap/data_urls.py b/python/funmap/data_urls.py similarity index 100% rename from funmap/data_urls.py rename to python/funmap/data_urls.py diff --git a/funmap/funmap.py b/python/funmap/funmap.py similarity index 100% rename from funmap/funmap.py rename to python/funmap/funmap.py diff --git a/funmap/logger.py b/python/funmap/logger.py similarity index 100% rename from funmap/logger.py rename to python/funmap/logger.py diff --git a/funmap/plotting.py b/python/funmap/plotting.py similarity index 100% rename from funmap/plotting.py rename to python/funmap/plotting.py diff --git a/funmap/saving.py b/python/funmap/saving.py similarity index 100% rename from funmap/saving.py rename to python/funmap/saving.py diff --git a/funmap/utils.py b/python/funmap/utils.py similarity index 100% rename from funmap/utils.py rename to python/funmap/utils.py diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 00000000..d3de5995 --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,116 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+blosc2==2.0.0 + # via tables +click==8.1.7 + # via funmap +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +cython==3.0.11 + # via tables +fonttools==4.54.1 + # via matplotlib +h5py==3.10.0 + # via funmap +imbalanced-learn==0.11.0 + # via funmap +joblib==1.3.2 + # via funmap + # via imbalanced-learn + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +matplotlib==3.7.3 + # via funmap + # via matplotlib-venn + # via powerlaw + # via seaborn +matplotlib-venn==0.11.9 + # via funmap +maturin==1.7.4 +mpmath==1.3.0 + # via powerlaw +msgpack==1.1.0 + # via blosc2 +networkx==3.1 + # via funmap +numexpr==2.10.1 + # via tables +numpy==1.24.4 + # via contourpy + # via funmap + # via h5py + # via imbalanced-learn + # via matplotlib + # via matplotlib-venn + # via numexpr + # via pandas + # via powerlaw + # via pyarrow + # via scikit-learn + # via scipy + # via seaborn + # via tables + # via xgboost +packaging==24.1 + # via matplotlib + # via tables +pandas==2.0.3 + # via funmap + # via seaborn +pillow==10.4.0 + # via matplotlib +powerlaw==1.5 + # via funmap +py-cpuinfo==9.0.0 + # via tables +pyarrow==13.0.0 + # via funmap +pyparsing==3.1.4 + # via matplotlib +pypdf2==3.0.1 + # via funmap +python-dateutil==2.9.0.post0 + # via matplotlib + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.1 + # via funmap +scikit-learn==1.3.2 + # via funmap + # via imbalanced-learn +scipy==1.10.1 + # via funmap + # via imbalanced-learn + # via matplotlib-venn + # via powerlaw + # via scikit-learn + # via xgboost +seaborn==0.13.0 + # via funmap +six==1.16.0 + # via python-dateutil +tables==3.8.0 + # via funmap +threadpoolctl==3.5.0 + # via imbalanced-learn + # via scikit-learn +tqdm==4.66.1 + # via funmap +tzdata==2024.2 + # via pandas +xgboost==2.0.0 + # via funmap diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 00000000..4ca03409 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,115 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+blosc2==2.0.0 + # via tables +click==8.1.7 + # via funmap +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +cython==3.0.11 + # via tables +fonttools==4.54.1 + # via matplotlib +h5py==3.10.0 + # via funmap +imbalanced-learn==0.11.0 + # via funmap +joblib==1.3.2 + # via funmap + # via imbalanced-learn + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +matplotlib==3.7.3 + # via funmap + # via matplotlib-venn + # via powerlaw + # via seaborn +matplotlib-venn==0.11.9 + # via funmap +mpmath==1.3.0 + # via powerlaw +msgpack==1.1.0 + # via blosc2 +networkx==3.1 + # via funmap +numexpr==2.10.1 + # via tables +numpy==1.24.4 + # via contourpy + # via funmap + # via h5py + # via imbalanced-learn + # via matplotlib + # via matplotlib-venn + # via numexpr + # via pandas + # via powerlaw + # via pyarrow + # via scikit-learn + # via scipy + # via seaborn + # via tables + # via xgboost +packaging==24.1 + # via matplotlib + # via tables +pandas==2.0.3 + # via funmap + # via seaborn +pillow==10.4.0 + # via matplotlib +powerlaw==1.5 + # via funmap +py-cpuinfo==9.0.0 + # via tables +pyarrow==13.0.0 + # via funmap +pyparsing==3.1.4 + # via matplotlib +pypdf2==3.0.1 + # via funmap +python-dateutil==2.9.0.post0 + # via matplotlib + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.1 + # via funmap +scikit-learn==1.3.2 + # via funmap + # via imbalanced-learn +scipy==1.10.1 + # via funmap + # via imbalanced-learn + # via matplotlib-venn + # via powerlaw + # via scikit-learn + # via xgboost +seaborn==0.13.0 + # via funmap +six==1.16.0 + # via python-dateutil +tables==3.8.0 + # via funmap +threadpoolctl==3.5.0 + # via imbalanced-learn + # via scikit-learn +tqdm==4.66.1 + # via funmap +tzdata==2024.2 + # via pandas +xgboost==2.0.0 + # via funmap diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..38f97a17 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,14 @@ +use pyo3::prelude::*; + +/// Formats the sum of two numbers as string. +#[pyfunction] +fn sum_as_string(a: usize, b: usize) -> PyResult { + Ok((a + b).to_string()) +} + +/// A Python module implemented in Rust. 
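+/// Exposed to Python as `funmap._lib` (the `module-name` configured for maturin in
+/// pyproject.toml); from Python, `from funmap import _lib` then
+/// `_lib.sum_as_string(5, 10)` returns the string `"15"`, as exercised by the
+/// `funmap rust` CLI command added in cli.py.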
+#[pymodule] +fn _lib(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; + Ok(()) +} From 1a0db667ae8f9cf3c21c3ddab5692dda6b95b89c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 12:55:17 -0500 Subject: [PATCH 27/54] Allow funmap to be run as cli --- pyproject.toml | 2 ++ pyproject_maturin.toml | 16 ---------------- src/lib.rs | 3 ++- 3 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 pyproject_maturin.toml diff --git a/pyproject.toml b/pyproject.toml index d78d7f95..2f5df509 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,8 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +[project.scripts] +funmap = "funmap.cli:cli" [build-system] requires = ["maturin>=1,<2"] diff --git a/pyproject_maturin.toml b/pyproject_maturin.toml deleted file mode 100644 index bb2585da..00000000 --- a/pyproject_maturin.toml +++ /dev/null @@ -1,16 +0,0 @@ -[build-system] -requires = ["maturin>=1.7,<2.0"] -build-backend = "maturin" - -[project] -name = "funmap" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -dynamic = ["version"] -[tool.maturin] -features = ["pyo3/extension-module"] -python-source = "funmap" diff --git a/src/lib.rs b/src/lib.rs index 38f97a17..69bb5572 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,8 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { /// A Python module implemented in Rust. #[pymodule] -fn _lib(m: &Bound<'_, PyModule>) -> PyResult<()> { +#[pyo3(name = "_lib")] +fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; Ok(()) } From 9e534788ccd3e3ae6fac594d1b744138ba4e1987 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 16:54:53 -0500 Subject: [PATCH 28/54] formatting changes --- python/funmap/data_urls.py | 23 ++++++++++++----------- python/funmap/logger.py | 9 ++++++--- python/funmap/saving.py | 5 ++++- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/funmap/data_urls.py b/python/funmap/data_urls.py index 222ce369..e0c6915c 100644 --- a/python/funmap/data_urls.py +++ b/python/funmap/data_urls.py @@ -1,17 +1,18 @@ misc_urls = { - 'reactome_gold_standard': 'https://figshare.com/ndownloader/files/38647601', - 'reactome_gold_standard_md5': '671942763b6a7c32506cba1ed9900fe6', - 'funmap_blacklist': 'https://figshare.com/ndownloader/files/39033977', - 'mapping_file': 'https://figshare.com/ndownloader/files/39033971' + "reactome_gold_standard": "https://figshare.com/ndownloader/files/38647601", + "reactome_gold_standard_md5": "671942763b6a7c32506cba1ed9900fe6", + "funmap_blacklist": "https://figshare.com/ndownloader/files/39033977", + "mapping_file": "https://figshare.com/ndownloader/files/39033971", } # the information about other networks is fixed for now network_info = { - 'name': ['BioGRID', 'BioPlex', 'HI-union', 'STRING'], - 'type': ['BioGRID', 'BioPlex', 'HI', 'STRING'], - 'url': ['https://figshare.com/ndownloader/files/39125054', - 'https://figshare.com/ndownloader/files/39125051', - 'https://figshare.com/ndownloader/files/39125093', - 'https://figshare.com/ndownloader/files/39125090' - ] + "name": ["BioGRID", "BioPlex", "HI-union", "STRING"], + "type": ["BioGRID", "BioPlex", "HI", "STRING"], + "url": [ + 
"https://figshare.com/ndownloader/files/39125054", + "https://figshare.com/ndownloader/files/39125051", + "https://figshare.com/ndownloader/files/39125093", + "https://figshare.com/ndownloader/files/39125090", + ], } diff --git a/python/funmap/logger.py b/python/funmap/logger.py index ef53b84f..d994cdf9 100644 --- a/python/funmap/logger.py +++ b/python/funmap/logger.py @@ -7,6 +7,7 @@ LOG_LEVEL = logging.INFO + def setup_logging(run_config, log_config="logging.yml") -> None: """ Setup ``logging.config`` @@ -44,7 +45,7 @@ def setup_logging(run_config, log_config="logging.yml") -> None: handler_critical.setLevel(logging.CRITICAL) # Create formatters (if needed) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") handler_debug.setFormatter(formatter) handler_info.setFormatter(formatter) handler_warning.setFormatter(formatter) @@ -69,7 +70,9 @@ def setup_logging(run_config, log_config="logging.yml") -> None: logger.addHandler(console_handler) # Log a warning message to the console - logger.warning(f'"{log_config}" not found. Using basicConfig with custom log files.') + logger.warning( + f'"{log_config}" not found. Using basicConfig with custom log files.' + ) return with open(log_config, "rt") as f: @@ -85,6 +88,6 @@ def setup_logging(run_config, log_config="logging.yml") -> None: def setup_logger(name): - log = logging.getLogger(f'funmap.{name}') + log = logging.getLogger(f"funmap.{name}") log.setLevel(LOG_LEVEL) return log diff --git a/python/funmap/saving.py b/python/funmap/saving.py index a92eac8c..fbbff116 100644 --- a/python/funmap/saving.py +++ b/python/funmap/saving.py @@ -3,6 +3,7 @@ LOG_DIR = "logs" + def ensure_exists(p: Path) -> Path: """ Helper to ensure a directory exists. @@ -11,14 +12,16 @@ def ensure_exists(p: Path) -> Path: p.mkdir(parents=True, exist_ok=True) return p + def arch_path(config_file) -> Path: """ Construct a path based on the name of a configuration file eg. 'saved/EfficientNet' """ config = yaml.safe_load(open(config_file)) - p = Path(config['results_dir']) / config['name'] + p = Path(config["results_dir"]) / config["name"] return ensure_exists(p) + def log_path(config_file) -> Path: p = arch_path(config_file) / LOG_DIR return ensure_exists(p) From bc18d7281e78fcb54a9bb94ef1d63f7ae39e58d9 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 27 Sep 2024 16:56:50 -0500 Subject: [PATCH 29/54] Add only_extra_features key to config --- python/funmap/cli.py | 2 +- python/funmap/funmap.py | 99 +++++++++++++++++----------------- python/funmap/utils.py | 117 ++++++++++++++++++++++------------------ 3 files changed, 113 insertions(+), 105 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 09640f4f..7635dcfe 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -222,7 +222,7 @@ def run(config_file, force_rerun): os.path.exists(file_path) for file_path in ml_model_file.values() ) if all_model_exist: - log.info(f"Trained model(s) exists. Loading model(s) ...") + log.info("Trained model(s) exists. 
Loading model(s) ...") ml_model_dict = {} # feature: ex or ei for feature in ml_model_file: diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index b0e405f7..78e0cf3c 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -147,53 +147,59 @@ def compute_features(cfg, feature_type, min_sample_count, output_dir): """Compute the pearson correlation coefficient for each edge in the list of edges and for each""" data_dict, all_valid_ids = get_data_dict(cfg, min_sample_count) cc_dict = {} - for i in data_dict: - cc_file = os.path.join(output_dir, f"cc_{i}.h5") - cc_dict[i] = cc_file - mr_dict = {} - for i in data_dict: - mr_file = os.path.join(output_dir, f"mr_{i}.h5") - mr_dict[i] = mr_file - - all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) - if all_cc_exist: - log.info("All cc files exist. Skipping feature computation.") - if feature_type == "cc": - return cc_dict, mr_dict, all_valid_ids + if not cfg["only_extra_features"]: + for i in data_dict: + cc_file = os.path.join(output_dir, f"cc_{i}.h5") + cc_dict[i] = cc_file + + mr_dict = {} + for i in data_dict: + mr_file = os.path.join(output_dir, f"mr_{i}.h5") + mr_dict[i] = mr_file + + all_cc_exist = all(os.path.exists(file_path) for file_path in cc_dict.values()) + if all_cc_exist: + log.info("All cc files exist. Skipping feature computation.") + if feature_type == "cc": + return cc_dict, mr_dict, all_valid_ids - if feature_type == "mr": - all_mr_exist = all(os.path.exists(file_path) for file_path in mr_dict.values()) - if all_cc_exist and all_mr_exist: - log.info("All mr files exist. Skipping feature computation.") - return cc_dict, mr_dict, all_valid_ids - - log.debug(f"Computing {feature_type} features") - for i in data_dict: - cc_file = cc_dict[i] - if os.path.exists(cc_file): - continue - log.info(f"Computing pearson correlation coefficient matrix for {i}") - x = data_dict[i].values.astype(np.float32) - df = pd.DataFrame(x) - corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) - arr = corr_matrix.values - upper_indices = np.triu_indices(arr.shape[0]) - with h5py.File(cc_dict[i], "w") as hf: - # only store the upper triangle part - hf.create_dataset("cc", data=arr[upper_indices]) - hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) - cc_dict[i] = cc_file - - # compute pairwise mutual rank features if feature_type == "mr": - log.info(f"Computing mutual rank matrix for {i}") - arr_mr = pairwise_mutual_rank(arr) - upper_indices = np.triu_indices(arr_mr.shape[0]) - with h5py.File(mr_dict[i], "w") as hf: + all_mr_exist = all( + os.path.exists(file_path) for file_path in mr_dict.values() + ) + if all_cc_exist and all_mr_exist: + log.info("All mr files exist. 
Skipping feature computation.") + return cc_dict, mr_dict, all_valid_ids + + log.debug(f"Computing {feature_type} features") + for i in data_dict: + cc_file = cc_dict[i] + if os.path.exists(cc_file): + continue + log.info(f"Computing pearson correlation coefficient matrix for {i}") + x = data_dict[i].values.astype(np.float32) + df = pd.DataFrame(x) + corr_matrix = df.corr(method="pearson", min_periods=min_sample_count) + arr = corr_matrix.values + upper_indices = np.triu_indices(arr.shape[0]) + with h5py.File(cc_dict[i], "w") as hf: # only store the upper triangle part - hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset("cc", data=arr[upper_indices]) hf.create_dataset("ids", data=data_dict[i].columns.values.astype("S")) + cc_dict[i] = cc_file + + # compute pairwise mutual rank features + if feature_type == "mr": + log.info(f"Computing mutual rank matrix for {i}") + arr_mr = pairwise_mutual_rank(arr) + upper_indices = np.triu_indices(arr_mr.shape[0]) + with h5py.File(mr_dict[i], "w") as hf: + # only store the upper triangle part + hf.create_dataset("mr", data=arr_mr[upper_indices]) + hf.create_dataset( + "ids", data=data_dict[i].columns.values.astype("S") + ) return cc_dict, mr_dict, all_valid_ids @@ -232,15 +238,6 @@ def assemble_feature_df(h5_file_mapping, df, dataset="cc"): file_keys = list(h5_file_mapping.keys()) feature_df = pd.DataFrame(columns=file_keys + ["label"]) - def get_1d_indices(i_array, j_array, n): - # Ensure i and j are within bounds - mask = (i_array < n) & (j_array < n) - i = np.minimum(i_array[mask], j_array[mask]) - j = np.maximum(i_array[mask], j_array[mask]) - - # Calculate 1D indices - return i * n - i * (i - 1) // 2 + (j - i) - def get_1d_indices(i_array, j_array, n): # Ensure i and j are within bounds mask = (i_array < n) & (j_array < n) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 81bb68cb..533d52ca 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -86,49 +86,50 @@ def get_data_dict(config, min_sample_count=15): the data from the corresponding file. """ - data_file = config["data_path"] + all_valid_ids = set() data_dict = {} - if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: - mapping = pd.read_csv(urls["mapping_file"], sep="\t") - # extract the data file from the tar.gz file - tmp_dir = "tmp_data" - if not os.path.exists(tmp_dir): - os.mkdir(tmp_dir) - os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") - # gene ids are gene symbols - for dt in config["data_files"]: - log.info(f"processing ... {dt['name']}") - cur_feature = dt["name"] - # cur_file = get_obj_from_tgz(data_file, dt['path']) + if not config["only_extra_features"]: + data_file = config["data_path"] + if "filter_noncoding_genes" in config and config["filter_noncoding_genes"]: + mapping = pd.read_csv(urls["mapping_file"], sep="\t") # extract the data file from the tar.gz file - # cur_data= get_obj_from_tgz(data_file, dt['path']) - cur_data = pd.read_csv( - os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 - ) - if cur_data.shape[1] < min_sample_count: - log.info(f"... {dt['name']} ... 
not enough samples, skipped") - continue - cur_data = cur_data.T - # exclude cohort with sample number < min_sample_count - # remove noncoding genes first - if config["filter_noncoding_genes"]: - coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] - coding_genes = list(set(coding["gene_name"].to_list())) - cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] - # duplicated columns, for now select the last column - cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] - data_dict[cur_feature] = cur_data - - shutil.rmtree(tmp_dir) - - log.info("filtering out non valid ids ...") - all_valid_ids = set() - for i in data_dict: - cur_data = data_dict[i] - is_valid = cur_data.notna().sum() >= min_sample_count - # valid_count = np.sum(is_valid) - valid_p = cur_data.columns[is_valid].values - all_valid_ids = all_valid_ids.union(set(valid_p)) + tmp_dir = "tmp_data" + if not os.path.exists(tmp_dir): + os.mkdir(tmp_dir) + os.system(f"tar -xzf {data_file} --strip-components=1 -C {tmp_dir}") + # gene ids are gene symbols + for dt in config["data_files"]: + log.info(f"processing ... {dt['name']}") + cur_feature = dt["name"] + # cur_file = get_obj_from_tgz(data_file, dt['path']) + # extract the data file from the tar.gz file + # cur_data= get_obj_from_tgz(data_file, dt['path']) + cur_data = pd.read_csv( + os.path.join(tmp_dir, dt["path"]), sep="\t", index_col=0, header=0 + ) + if cur_data.shape[1] < min_sample_count: + log.info(f"... {dt['name']} ... not enough samples, skipped") + continue + cur_data = cur_data.T + # exclude cohort with sample number < min_sample_count + # remove noncoding genes first + if config["filter_noncoding_genes"]: + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) + cur_data = cur_data[[c for c in cur_data.columns if c in coding_genes]] + # duplicated columns, for now select the last column + cur_data = cur_data.loc[:, ~cur_data.columns.duplicated(keep="last")] + data_dict[cur_feature] = cur_data + + shutil.rmtree(tmp_dir) + + log.info("filtering out non valid ids ...") + for i in data_dict: + cur_data = data_dict[i] + is_valid = cur_data.notna().sum() >= min_sample_count + # valid_count = np.sum(is_valid) + valid_p = cur_data.columns[is_valid].values + all_valid_ids = all_valid_ids.union(set(valid_p)) if config["extra_feature_file"] is not None: log.info("Importing extra feature file") # TODO: Import extra feature file @@ -342,6 +343,7 @@ def get_config(cfg_file: Path): "max_num_edges": 250000, "step_size": 1000, "lr_cutoff": 50, + "only_extra_features": False, } with open(cfg_file, "r") as fh: @@ -352,6 +354,12 @@ def get_config(cfg_file: Path): cfg["task"] = cfg_dict["task"] assert cfg["task"] in ["protein_func", "kinase_func"] + if "only_extra_features" in cfg_dict: + cfg["only_extra_features"] = cfg_dict["only_extra_features"] + if cfg["only_extra_features"] and "extra_feature_file" not in cfg_dict: + msg = "Extra feature file is not defined when only_extra_features = True" + print(msg) + raise ValueError(msg) if "seed" in cfg_dict: cfg["seed"] = cfg_dict["seed"] @@ -404,21 +412,24 @@ def get_config(cfg_file: Path): if "results_dir" in cfg_dict: cfg["results_dir"] = cfg_dict["results_dir"] + "/" + cfg_dict["name"] - if "data_files" not in cfg_dict: + if "data_files" not in cfg_dict and not cfg.get("only_extra_features", False): raise ValueError("data_files not specified in config file") # Check all files listed under data_files are also in the tar.gz 
file - data_files = cfg_dict["data_files"] - # List all the files in the tar.gz file - with tarfile.open(cfg["data_path"], "r:gz") as tar: - tar_files = {Path(file).name for file in tar.getnames()} - - # Check if all files in data_files are in tar_files - if not all(file["path"] in tar_files for file in data_files): - print("Files listed under data_files are not in the tar.gz file!") - raise ValueError("Files listed under data_files are not in the tar.gz file!") - - cfg["data_files"] = cfg_dict["data_files"] + if not cfg["only_extra_features"]: + data_files = cfg_dict["data_files"] + # List all the files in the tar.gz file + with tarfile.open(cfg["data_path"], "r:gz") as tar: + tar_files = {Path(file).name for file in tar.getnames()} + + # Check if all files in data_files are in tar_files + if not all(file["path"] in tar_files for file in data_files): + print("Files listed under data_files are not in the tar.gz file!") + raise ValueError( + "Files listed under data_files are not in the tar.gz file!" + ) + + cfg["data_files"] = cfg_dict["data_files"] if "rp_pairs" in cfg_dict: cfg["rp_pairs"] = cfg_dict["rp_pairs"] From 000ae50261b19ae8283de052cf24cf8436283c1e Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 30 Sep 2024 09:39:09 -0500 Subject: [PATCH 30/54] update to use latest packages --- pyproject.toml | 43 ++++++++++++++++++++++--------------------- requirements-dev.lock | 43 ++++++++++++++++++++++++------------------- requirements.lock | 43 ++++++++++++++++++++++++------------------- 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f5df509..043ed966 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,27 +3,6 @@ name = "funmap" version = "0.2.0" description = "generate gene co-function networks using omics data" authors = [{ name = "Zhiao Shi", email = "zhiao.shi@gmail.com" }] -dependencies = [ - "PyPDF2==3.0.1", - "click==8.1.7", - "h5py==3.10.0", - "imbalanced-learn==0.11.0", - "joblib==1.3.2", - "matplotlib==3.7.3", - "matplotlib_venn==0.11.9", - "networkx==3.1", - "numpy==1.24.4", - "pandas==2.0.3", - "powerlaw==1.5", - "pyarrow==13.0.0", - "pyyaml==6.0.1", - "scikit-learn==1.3.2", - "scipy==1.10.1", - "seaborn==0.13.0", - "tables==3.8.0", - "tqdm==4.66.1", - "xgboost==2.0.0", -] readme = "README.md" license = { text = "MIT license" } requires-python = ">=3.8" @@ -32,6 +11,28 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] +dependencies = [ + "h5py>=3.12.1", + "numpy>=2.1.1", + "seaborn>=0.13.2", + "tqdm>=4.66.5", + "xgboost>=2.1.1", + "pyyaml>=6.0.2", + "scipy>=1.14.1", + "pyarrow>=17.0.0", + "pandas>=2.2.3", + "joblib>=1.4.2", + "matplotlib>=3.9.2", + "scikit-learn>=1.5.2", + "imbalanced-learn>=0.12.3", + "pypdf2>=3.0.1", + "matplotlib-venn>=1.1.1", + "networkx>=3.3", + "powerlaw>=1.5", + "click>=8.1.7", + "tables>=3.10.1", +] + [project.scripts] funmap = "funmap.cli:cli" diff --git a/requirements-dev.lock b/requirements-dev.lock index d3de5995..8ff4123e 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -10,7 +10,7 @@ # universal: false -e file:. 
-blosc2==2.0.0 +blosc2==2.7.1 # via tables click==8.1.7 # via funmap @@ -18,37 +18,39 @@ contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 - # via tables fonttools==4.54.1 # via matplotlib -h5py==3.10.0 +h5py==3.12.1 # via funmap -imbalanced-learn==0.11.0 +imbalanced-learn==0.12.3 # via funmap -joblib==1.3.2 +joblib==1.4.2 # via funmap # via imbalanced-learn # via scikit-learn kiwisolver==1.4.7 # via matplotlib -matplotlib==3.7.3 +matplotlib==3.9.2 # via funmap # via matplotlib-venn # via powerlaw # via seaborn -matplotlib-venn==0.11.9 +matplotlib-venn==1.1.1 # via funmap maturin==1.7.4 mpmath==1.3.0 # via powerlaw msgpack==1.1.0 # via blosc2 -networkx==3.1 +ndindex==1.9.2 + # via blosc2 +networkx==3.3 # via funmap numexpr==2.10.1 + # via blosc2 # via tables -numpy==1.24.4 +numpy==2.1.1 + # via blosc2 # via contourpy # via funmap # via h5py @@ -67,7 +69,7 @@ numpy==1.24.4 packaging==24.1 # via matplotlib # via tables -pandas==2.0.3 +pandas==2.2.3 # via funmap # via seaborn pillow==10.4.0 @@ -75,8 +77,9 @@ pillow==10.4.0 powerlaw==1.5 # via funmap py-cpuinfo==9.0.0 + # via blosc2 # via tables -pyarrow==13.0.0 +pyarrow==17.0.0 # via funmap pyparsing==3.1.4 # via matplotlib @@ -87,30 +90,32 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via funmap -scikit-learn==1.3.2 +scikit-learn==1.5.2 # via funmap # via imbalanced-learn -scipy==1.10.1 +scipy==1.14.1 # via funmap # via imbalanced-learn # via matplotlib-venn # via powerlaw # via scikit-learn # via xgboost -seaborn==0.13.0 +seaborn==0.13.2 # via funmap six==1.16.0 # via python-dateutil -tables==3.8.0 +tables==3.10.1 # via funmap threadpoolctl==3.5.0 # via imbalanced-learn # via scikit-learn -tqdm==4.66.1 +tqdm==4.66.5 # via funmap +typing-extensions==4.12.2 + # via tables tzdata==2024.2 # via pandas -xgboost==2.0.0 +xgboost==2.1.1 # via funmap diff --git a/requirements.lock b/requirements.lock index 4ca03409..43e3816b 100644 --- a/requirements.lock +++ b/requirements.lock @@ -10,7 +10,7 @@ # universal: false -e file:. 
-blosc2==2.0.0 +blosc2==2.7.1 # via tables click==8.1.7 # via funmap @@ -18,36 +18,38 @@ contourpy==1.3.0 # via matplotlib cycler==0.12.1 # via matplotlib -cython==3.0.11 - # via tables fonttools==4.54.1 # via matplotlib -h5py==3.10.0 +h5py==3.12.1 # via funmap -imbalanced-learn==0.11.0 +imbalanced-learn==0.12.3 # via funmap -joblib==1.3.2 +joblib==1.4.2 # via funmap # via imbalanced-learn # via scikit-learn kiwisolver==1.4.7 # via matplotlib -matplotlib==3.7.3 +matplotlib==3.9.2 # via funmap # via matplotlib-venn # via powerlaw # via seaborn -matplotlib-venn==0.11.9 +matplotlib-venn==1.1.1 # via funmap mpmath==1.3.0 # via powerlaw msgpack==1.1.0 # via blosc2 -networkx==3.1 +ndindex==1.9.2 + # via blosc2 +networkx==3.3 # via funmap numexpr==2.10.1 + # via blosc2 # via tables -numpy==1.24.4 +numpy==2.1.1 + # via blosc2 # via contourpy # via funmap # via h5py @@ -66,7 +68,7 @@ numpy==1.24.4 packaging==24.1 # via matplotlib # via tables -pandas==2.0.3 +pandas==2.2.3 # via funmap # via seaborn pillow==10.4.0 @@ -74,8 +76,9 @@ pillow==10.4.0 powerlaw==1.5 # via funmap py-cpuinfo==9.0.0 + # via blosc2 # via tables -pyarrow==13.0.0 +pyarrow==17.0.0 # via funmap pyparsing==3.1.4 # via matplotlib @@ -86,30 +89,32 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via funmap -scikit-learn==1.3.2 +scikit-learn==1.5.2 # via funmap # via imbalanced-learn -scipy==1.10.1 +scipy==1.14.1 # via funmap # via imbalanced-learn # via matplotlib-venn # via powerlaw # via scikit-learn # via xgboost -seaborn==0.13.0 +seaborn==0.13.2 # via funmap six==1.16.0 # via python-dateutil -tables==3.8.0 +tables==3.10.1 # via funmap threadpoolctl==3.5.0 # via imbalanced-learn # via scikit-learn -tqdm==4.66.1 +tqdm==4.66.5 # via funmap +typing-extensions==4.12.2 + # via tables tzdata==2024.2 # via pandas -xgboost==2.0.0 +xgboost==2.1.1 # via funmap From 29e8c14f7a31a15c52318074ad91f23d78645b94 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 30 Sep 2024 09:57:07 -0500 Subject: [PATCH 31/54] fix processing of extra_feature_df --- python/funmap/cli.py | 7 ++----- python/funmap/utils.py | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 7635dcfe..2fec1579 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -126,13 +126,10 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) extra_feature_file = cfg["extra_feature_file"] + extra_feature_df = None if extra_feature_file is not None: log.info("Loading extra feature file into dataframe") extra_feature_df = process_extra_feature(extra_feature_file) - else: - extra_feature_df = None - # if (extra_feature_file is not None) and (not check_extra_feature_file(extra_feature_file)): - # return gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return @@ -367,7 +364,7 @@ def run(config_file, force_rerun): "feature_type": "cc", "gs_file": gs_file, # no extra feature for plotting - "extra_feature_df": None, + "extra_feature_df": extra_feature_df, "valid_id_list": all_valid_ids, "test_size": test_size, "seed": seed, diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 533d52ca..7293ff30 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -582,3 +582,4 @@ def process_extra_feature(extra_feature_file) -> pd.DataFrame: extra_feature_df = extra_feature_df.drop_duplicates( subset=["P1", "P2"], keep="last" ) + return extra_feature_df From 
46c5fa84c9222f6882cb959872cbae30c0894c11 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 15:04:52 -0500 Subject: [PATCH 32/54] Add documentation and fix messages --- python/funmap/cli.py | 2 +- python/funmap/funmap.py | 2 +- src/lib.rs | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 2fec1579..498b6bde 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -152,7 +152,7 @@ def run(config_file, force_rerun): max_num_edges = cfg["max_num_edges"] step_size = cfg["step_size"] start_edge_num = cfg["start_edge_num"] - + only_extra_features = cfg["only_extra_features"] results_dir = Path(cfg["results_dir"]) saved_data_dir = results_dir / cfg["subdirs"]["saved_data_dir"] model_dir = results_dir / cfg["subdirs"]["saved_model_dir"] diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 78e0cf3c..50d5a37a 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -643,7 +643,7 @@ def dataset_llr( cur_llr_res["dataset"] = f + "_EXTRAFEAT" llr_ds = pd.concat([llr_ds, cur_llr_res], axis=0, ignore_index=True) llr_ds.to_csv(llr_dataset_file, sep="\t", index=False) - log.info(f"Calculating llr for {dataset} ... done") + log.info(f"Calculating llr for {f} ... done") return llr_ds diff --git a/src/lib.rs b/src/lib.rs index 69bb5572..a829e04d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,23 @@ use pyo3::prelude::*; +/// funmap_lib +/// Supports the addition of extra features in the format of gene pairs +/// +/// Files can reach up to 10 GB, so Rust is used to speed up and optimize the merging process. +/// +/// # Step 1: Identify all unique genes +/// +/// This step identifies all the unique genes found in both the data and extra-feature files. +/// +/// This allows for a universal order to genes that will be referenced in other steps +/// +/// Input: All data files and extra feature files +/// Output: .pkl file containing the unique genes found in alphabetical order (A before B) +/// +/// # Step 2: Align extra features +/// +/// WARN: May need to re-align the regular data files +/// For each extra feature file, re-index the rows according to the order of genes in Step 1. /// Formats the sum of two numbers as string. #[pyfunction] fn sum_as_string(a: usize, b: usize) -> PyResult { @@ -8,7 +26,7 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { /// A Python module implemented in Rust. #[pymodule] -#[pyo3(name = "_lib")] +#[pyo3(name = "_lib")] // module name is _lib. (imports as funmap._lib). This is to hide these functions from regular users. fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; Ok(()) From d34fc2c9966dd8b0b1770951ee0c4aabeeb65596 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 16:17:56 -0500 Subject: [PATCH 33/54] add unique gene identification --- Cargo.lock | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ded191fc..075d2231 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -18,9 +31,21 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" name = "funmap_lib" version = "0.1.0" dependencies = [ + "ahash", "pyo3", ] +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "heck" version = "0.5.0" @@ -169,3 +194,35 @@ name = "unindent" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index ecac49bc..b5dc393a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,4 +9,5 @@ name = "funmap" crate-type = ["cdylib"] [dependencies] +ahash = "0.8.11" pyo3 = "0.22.0" diff --git a/src/lib.rs b/src/lib.rs index a829e04d..06a84168 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,12 @@ +use ahash::{HashSet, HashSetExt}; use pyo3::prelude::*; +use std::{ + fs::File, + io::{BufRead, BufReader}, +}; /// funmap_lib +/// /// Supports the addition of extra features in the format of gene pairs /// /// Files can reach up to 10 GB, so Rust is used to speed up and optimize the merging process. @@ -17,7 +23,60 @@ use pyo3::prelude::*; /// # Step 2: Align extra features /// /// WARN: May need to re-align the regular data files +/// /// For each extra feature file, re-index the rows according to the order of genes in Step 1. 
+#[pyfunction] +fn process_files( + expression_paths: Vec, + extra_feature_paths: Vec, +) -> PyResult { + // Step 1: Identify all unique genes + // Across both expression and extra_feature_paths + // Create final unique_gene pkl file + + let mut uniq_gene = HashSet::new(); + + // Read expression data where first column is gene information; + for file_path in expression_paths.iter() { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 1 { + uniq_gene.insert(row[0].to_string()); // add gene to set + } + } + } + + // Read extra feature data, where first and second column are genes + for file_path in extra_feature_paths.iter() { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 2 { + uniq_gene.insert(row[0].to_string()); // add first gene to set + uniq_gene.insert(row[1].to_string()); // add second gene to set + } + } + } + Ok(true) +} + /// Formats the sum of two numbers as string. #[pyfunction] fn sum_as_string(a: usize, b: usize) -> PyResult { @@ -29,5 +88,6 @@ fn sum_as_string(a: usize, b: usize) -> PyResult { #[pyo3(name = "_lib")] // module name is _lib. (imports as funmap._lib). This is to hide these functions from regular users. fn funmap_lib(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sum_as_string, m)?)?; + m.add_function(wrap_pyfunction!(process_files, m)?)?; Ok(()) } From 47dcd5e10c61a67f0a89d61986b915e7e1eb6411 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 1 Oct 2024 16:55:16 -0500 Subject: [PATCH 34/54] add format error checking --- src/lib.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 06a84168..06fe4056 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use ahash::{HashSet, HashSetExt}; -use pyo3::prelude::*; +use pyo3::{exceptions::PyValueError, prelude::*}; use std::{ fs::File, io::{BufRead, BufReader}, @@ -29,6 +29,7 @@ use std::{ fn process_files( expression_paths: Vec, extra_feature_paths: Vec, + output_folder: String, ) -> PyResult { // Step 1: Identify all unique genes // Across both expression and extra_feature_paths @@ -46,6 +47,13 @@ fn process_files( if has_header { // skip header has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 2 { + return Err(PyValueError::new_err(format!( + "Expression data file at {} does not have enough columns.", + file_path + ))); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -65,6 +73,13 @@ fn process_files( if has_header { // skip header has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 3 { + return Err(PyValueError::new_err(format!( + "Extra feature file at {} does not have enough columns.", + file_path + ))); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -74,6 +89,9 @@ fn process_files( } } } + + // Save to pickle + Ok(true) } From ca2d1f615fe8eb5d13b8fdf49479f442e31d8aad Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Wed, 2 Oct 2024 11:13:23 -0500 Subject: [PATCH 35/54] save uniq_gene to pkl file in 
output_folder --- Cargo.lock | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 13 +++++++++- 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 075d2231..6150eee1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,6 +21,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cfg-if" version = "1.0.0" @@ -33,6 +39,7 @@ version = "0.1.0" dependencies = [ "ahash", "pyo3", + "serde-pickle", ] [[package]] @@ -58,6 +65,12 @@ version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +[[package]] +name = "iter-read" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4" + [[package]] name = "libc" version = "0.2.159" @@ -73,6 +86,34 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -166,6 +207,39 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "serde" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-pickle" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c762ad136a26407c6a80825813600ceeab5e613660d93d79a41f0ec877171e71" +dependencies = [ + "byteorder", + "iter-read", + "num-bigint", + "num-traits", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.210" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "syn" version = "2.0.77" diff --git a/Cargo.toml b/Cargo.toml index b5dc393a..80d83754 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,4 @@ crate-type = ["cdylib"] [dependencies] ahash = "0.8.11" pyo3 = "0.22.0" +serde-pickle = "1.1.1" diff --git a/src/lib.rs b/src/lib.rs index 06fe4056..9c4ad424 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,13 @@ use ahash::{HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; +use serde_pickle::SerOptions; use std::{ fs::File, io::{BufRead, BufReader}, + path::Path, }; -/// funmap_lib +/// process_files /// /// Supports the addition of extra 
features in the format of gene pairs /// @@ -91,7 +93,16 @@ fn process_files( } // Save to pickle + // TODO: Look at other file formats + let mut uniq_gene: Vec = uniq_gene.iter().cloned().collect(); + + // Sort genes alphabetically + uniq_gene.sort(); + let folder_path = Path::new(&output_folder); + let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); + let mut w = File::create(uniq_gene_file_path)?; + serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); Ok(true) } From 4e850bce85124ac189271faab2a7301b85d917c7 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 4 Oct 2024 14:36:23 -0500 Subject: [PATCH 36/54] start align_file function --- src/lib.rs | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9c4ad424..60a11da3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,9 +24,15 @@ use std::{ /// /// # Step 2: Align extra features /// -/// WARN: May need to re-align the regular data files -/// /// For each extra feature file, re-index the rows according to the order of genes in Step 1. +/// +/// Then save each column as a separate pkl file +/// +/// Input: unique genes from Step 1, extra feature files. +/// Output: One pkl file for each feature across all extra feature files +/// +/// Function +/// Output: list of all output feature pkl files #[pyfunction] fn process_files( expression_paths: Vec, @@ -104,6 +110,33 @@ fn process_files( let mut w = File::create(uniq_gene_file_path)?; serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); Ok(true) + + // Re-align each file +} + +fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { + // Read extra feature data, where first and second column are genes + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut has_header = true; + for line in reader.lines() { + let line = line?; + if has_header { + // skip header + has_header = false; + let row: Vec<&str> = line.split('\t').collect(); + if row.len() < 3 { + return Err(PyValueError::new_err(format!( + "Extra feature file at {} does not have enough columns.", + file_path + ))); + } + continue; + } + let row: Vec<&str> = line.split('\t').collect(); + if row.len() > 2 {} + } + Ok(true) } /// Formats the sum of two numbers as string. 
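
Note on patches 36-40: align_file collapses an ordered gene pair into a single offset into the flattened upper triangle of an n x n matrix (n = number of unique genes) with i * n - i * (i - 1) / 2 + (j - i), the same formula used by get_1d_indices in python/funmap/funmap.py. The sketch below only illustrates that mapping and checks it against numpy's triu_indices ordering; the helper name pair_to_upper_index and the toy n = 5 are invented for the example and are not part of the patches.

    import numpy as np

    def pair_to_upper_index(i: int, j: int, n: int) -> int:
        # Order the pair so that i <= j, as align_file does before indexing.
        if i > j:
            i, j = j, i
        # Elements in rows 0..i-1 of the upper triangle (diagonal included),
        # plus the distance of j from the diagonal within row i.
        return i * n - i * (i - 1) // 2 + (j - i)

    # Check against numpy's row-major upper-triangle enumeration.
    n = 5
    rows, cols = np.triu_indices(n)
    for k, (i, j) in enumerate(zip(rows, cols)):
        assert pair_to_upper_index(int(i), int(j), n) == k

Because the correlation and mutual-rank matrices are stored as arr[np.triu_indices(...)] flattened arrays, an offset computed this way lets a pair's value be looked up without rebuilding the full matrix.
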
From 799f39b15493ed6236efae58a04297b71121b4d2 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 4 Oct 2024 14:38:39 -0500 Subject: [PATCH 37/54] fix rust compilation error --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 60a11da3..80ccaaca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -128,7 +128,7 @@ fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { if row.len() < 3 { return Err(PyValueError::new_err(format!( "Extra feature file at {} does not have enough columns.", - file_path + path ))); } continue; From c67e533a417073e12942fe243bb1ea6f9ff79a5e Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Mon, 7 Oct 2024 12:19:13 -0500 Subject: [PATCH 38/54] add code for calculating new_index for each feature file --- .gitignore | 1 + python/funmap/cli.py | 2 +- src/lib.rs | 27 ++++++++++++++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index eba216d4..4184eca0 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,4 @@ wheels/ # venv .venv +test_data diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 498b6bde..29a66390 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -88,7 +88,7 @@ def qc(config_file, force_rerun): @cli.command() def rust(): - print(f"sum: {funmap_lib.sum_as_string(5, 10)}") + funmap_lib.process_files(["test_data/dia.tsv"], [], "test_data") @cli.command(help="run funmap") diff --git a/src/lib.rs b/src/lib.rs index 80ccaaca..24566f35 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use ahash::{HashSet, HashSetExt}; +use ahash::{AHashMap, HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ @@ -109,16 +109,24 @@ fn process_files( let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); let mut w = File::create(uniq_gene_file_path)?; serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); - Ok(true) - + let n = uniq_gene.len(); // Re-align each file + // Create a HashMap to store the indices of each string + let mut gene_index_map: AHashMap<&String, usize> = AHashMap::new(); + for (index, gene) in uniq_gene.iter().enumerate() { + gene_index_map.insert(gene, index); + } + + // One column of indices, and one column of values. 
Separated by file + Ok(true) } -fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { +fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> PyResult { // Read extra feature data, where first and second column are genes let file = File::open(path)?; let reader = BufReader::new(file); let mut has_header = true; + let mut indices: Vec = Vec::new(); for line in reader.lines() { let line = line?; if has_header { @@ -134,7 +142,16 @@ fn align_file(path: &String, uniq_gene: &Vec) -> PyResult { continue; } let row: Vec<&str> = line.split('\t').collect(); - if row.len() > 2 {} + if row.len() > 2 { + let index1 = uniq_gene.get(&row[0].to_string()).unwrap(); + let index2 = uniq_gene.get(&row[1].to_string()).unwrap(); + let (i, j) = if index1 <= index2 { + (index1, index2) + } else { + (index2, index1) + }; + let new_index = i * n - i * (i - 1) / 2 + (j - i); + } } Ok(true) } From 1af42a1560375fd16358428dc7bec3fb4dfac744 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 11:08:40 -0500 Subject: [PATCH 39/54] File creation --- python/funmap/cli.py | 4 +++- src/lib.rs | 32 +++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 29a66390..26f3d0bb 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -88,7 +88,9 @@ def qc(config_file, force_rerun): @cli.command() def rust(): - funmap_lib.process_files(["test_data/dia.tsv"], [], "test_data") + funmap_lib.process_files( + ["test_data/dia.tsv"], ["test_data/all_no_methyl.tsv"], "test_data" + ) @cli.command(help="run funmap") diff --git a/src/lib.rs b/src/lib.rs index 24566f35..606025d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,7 +3,7 @@ use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ fs::File, - io::{BufRead, BufReader}, + io::{BufRead, BufReader, BufWriter, Write}, path::Path, }; @@ -117,16 +117,26 @@ fn process_files( gene_index_map.insert(gene, index); } + for file_path in extra_feature_paths { + align_file(&file_path, &gene_index_map, n as i32, folder_path)?; + } // One column of indices, and one column of values. 
Separated by file Ok(true) } -fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> PyResult { +fn align_file( + path: &String, + uniq_gene: &AHashMap<&String, usize>, + n: i32, + output_folder: &Path, +) -> PyResult { // Read extra feature data, where first and second column are genes let file = File::open(path)?; let reader = BufReader::new(file); let mut has_header = true; let mut indices: Vec = Vec::new(); + let mut feature_count = 0; + let mut writers = Vec::new(); for line in reader.lines() { let line = line?; if has_header { @@ -139,6 +149,13 @@ fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> path ))); } + feature_count = row.len() - 2; + for i in 0..feature_count { + let file_path = output_folder.join(format!("{}.col", row[i + 2])); + let f = File::create(file_path)?; + let bf = BufWriter::new(f); + writers.push(bf); + } continue; } let row: Vec<&str> = line.split('\t').collect(); @@ -146,11 +163,16 @@ fn align_file(path: &String, uniq_gene: &AHashMap<&String, usize>, n: usize) -> let index1 = uniq_gene.get(&row[0].to_string()).unwrap(); let index2 = uniq_gene.get(&row[1].to_string()).unwrap(); let (i, j) = if index1 <= index2 { - (index1, index2) + (*index1 as i32, *index2 as i32) } else { - (index2, index1) + (*index2 as i32, *index1 as i32) }; - let new_index = i * n - i * (i - 1) / 2 + (j - i); + let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize; + indices.push(new_index); + for i in 0..feature_count { + writers[i].write_all(row[i + 2].as_bytes())?; + writers[i].write_all(b"\n")?; + } } } Ok(true) From 6de0073b95a13241ddf2800684b7c2e540429fb1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 15:36:17 -0500 Subject: [PATCH 40/54] create index file --- src/lib.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 606025d7..a2954c01 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,6 +137,12 @@ fn align_file( let mut indices: Vec = Vec::new(); let mut feature_count = 0; let mut writers = Vec::new(); + let original_file = Path::new(path).file_name().unwrap().to_str().unwrap(); + let index_file_path = output_folder.join(format!("{}.index", original_file)); + let f = File::create(index_file_path)?; + let bf = BufWriter::new(f); + writers.push(bf); + for line in reader.lines() { let line = line?; if has_header { @@ -169,9 +175,11 @@ fn align_file( }; let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize; indices.push(new_index); + writers[0].write_all(new_index.to_string().as_bytes())?; + writers[0].write_all(b"\n")?; for i in 0..feature_count { - writers[i].write_all(row[i + 2].as_bytes())?; - writers[i].write_all(b"\n")?; + writers[i + 1].write_all(row[i + 2].as_bytes())?; + writers[i + 1].write_all(b"\n")?; } } } From 289185d233b2df1ca97ac32529644adf38b2bbe5 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Tue, 8 Oct 2024 16:29:10 -0500 Subject: [PATCH 41/54] Remove unnecessary file read --- python/funmap/funmap.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 50d5a37a..14b02454 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -540,8 +540,6 @@ def get_1d_indices(i_array, j_array, n): # Calculate 1D indices return i * n - i * (i - 1) // 2 + (j - i) - with h5py.File(feature_file, "r") as h5_file: - gene_ids = h5_file["ids"][:] df = pd.DataFrame(all_pairs, columns=["P1", "P2"]) with h5py.File(feature_file, "r") as h5_file: gene_ids = 
h5_file["ids"][:] From a49d5ecf520af0c3776fa1f3c8b49b1d8eb40345 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:09:30 -0500 Subject: [PATCH 42/54] Use new feature_df creation --- python/funmap/cli.py | 22 +++++++++++++++++----- python/funmap/funmap.py | 30 +++++++++++++++++++++--------- python/funmap/utils.py | 36 +++++++++++++++++++++++++++++++++++- src/lib.rs | 15 ++++++++++++--- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 26f3d0bb..29f50c77 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -6,8 +6,9 @@ import click import numpy as np import pandas as pd -from funmap import _lib as funmap_lib + from funmap import __version__ +from funmap import _lib as funmap_lib from funmap.data_urls import misc_urls as urls from funmap.funmap import ( compute_features, @@ -88,8 +89,14 @@ def qc(config_file, force_rerun): @cli.command() def rust(): + mapping = pd.read_csv(urls["mapping_file"], sep="\t") + coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] + coding_genes = list(set(coding["gene_name"].to_list())) funmap_lib.process_files( - ["test_data/dia.tsv"], ["test_data/all_no_methyl.tsv"], "test_data" + ["test_data/dia.tsv"], + ["test_data/all_no_methyl.tsv"], + "test_data", + coding_genes, ) @@ -127,11 +134,12 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg["extra_feature_file"] + extra_feature_file = cfg["extra_feature_folder"] extra_feature_df = None - if extra_feature_file is not None: + uniq_gene = None + if extra_feature_folder is not None: log.info("Loading extra feature file into dataframe") - extra_feature_df = process_extra_feature(extra_feature_file) + (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_file) gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return @@ -195,6 +203,10 @@ def run(config_file, force_rerun): cc_dict, mr_dict, all_valid_ids = compute_features( cfg, feature_type, min_sample_count, saved_data_dir ) + + if uniq_gene is not None: + all_valid_ids = uniq_gene + gs_args = { "task": task, "saved_data_dir": saved_data_dir, diff --git a/python/funmap/funmap.py b/python/funmap/funmap.py index 14b02454..fecb1108 100644 --- a/python/funmap/funmap.py +++ b/python/funmap/funmap.py @@ -286,7 +286,13 @@ def get_1d_indices(i_array, j_array, n): def extract_features( - df, feature_type, cc_dict, ppi_feature=None, extra_feature_df=None, mr_dict=None + df, + feature_type, + cc_dict, + ppi_feature=None, + extra_feature_df=None, + mr_dict=None, + chunk_data: tuple[int, int] | None = None, ): """ extract_features - creates the final feature `pandas` dataframe used by xgboost @@ -323,12 +329,17 @@ def extract_features( # TODO: add extra features if provided if extra_feature_df is not None: - df.reset_index(drop=True, inplace=True) - merged_df = pd.merge( - df[["P1", "P2"]], extra_feature_df, on=["P1", "P2"], how="left" - ) - merged_df = merged_df.drop(columns=["P1", "P2"]) - feature_df = pd.merge(feature_df, merged_df, left_index=True, right_index=True) + if chunk_data is not None: + (start_idx, chunk_size) = chunk_data + feature_df = pd.concat( + [feature_df, extra_feature_df[start_idx : start_idx + chunk_size]], + axis=1, + ) + else: + feature_df = pd.concat( + [feature_df, extra_feature_df], + axis=1, + ) # move 'label' column to the end of the dataframe if it exists if "label" in feature_df.columns: feature_df = 
feature_df[ @@ -513,10 +524,10 @@ def prepare_gs_data(**kwargs): else: ppi_feature = None gs_train_df = extract_features( - gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict + gs_train, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict, None ) gs_test_df = extract_features( - gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict + gs_test, feature_type, cc_dict, ppi_feature, extra_feature_df, mr_dict, None ) # store both the ids with gs_test_df for later use @@ -754,6 +765,7 @@ def process_and_save_chunk(start_idx, chunk_id): cur_ppi_feature, extra_feature_df, mr_dict, + (start_idx, chunk_size), ) predictions = model.predict_proba(feature_df) prediction_df = pd.DataFrame(predictions[:, 1], columns=["prediction"]) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 7293ff30..b296601f 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -7,7 +7,8 @@ import urllib from pathlib import Path from urllib.parse import urlparse - +import glob +import pickle import pandas as pd import yaml @@ -568,6 +569,39 @@ def check_extra_feature_file(file_path, missing_value="NA"): return True +def read_files_to_dataframe(file_paths): + # Read each file into a DataFrame and concatenate + dataframes = [] + for file_path in file_paths: + # Read file into DataFrame; each line becomes a row in a single column + df = pd.read_csv(file_path, header=None, names=["data"]) + dataframes.append(df) + + # Concatenate all DataFrames into one + final_df = pd.concat(dataframes, ignore_index=True) + return final_df + + +def reorder_dataframe(final_df, index_file_path): + # Read the index file + with open(index_file_path, "r") as f: + indices = [int(line.strip()) for line in f.readlines()] + + # Reorder the DataFrame using the indices + reordered_df = final_df.iloc[indices].reset_index(drop=True) + return reordered_df + + +def new_extra_feature(extra_feature_folder): + index_file = list(glob.glob(f"{extra_feature_folder}/*.index"))[0] + features = list(glob.glob(f"{extra_feature_folder}/*.col")) + with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: + uniq_gene = pickle.load(r) + df = read_files_to_dataframe(features) + df = read_files_to_dataframe(df, index_file) + return (uniq_gene, index_file) + + def process_extra_feature(extra_feature_file) -> pd.DataFrame: extra_feature_df = pd.read_csv(extra_feature_file, sep="\t") extra_feature_df.columns.values[0] = "P1" diff --git a/src/lib.rs b/src/lib.rs index a2954c01..ecd46da7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use ahash::{AHashMap, HashSet, HashSetExt}; +use ahash::{AHashMap, AHashSet, HashSet, HashSetExt}; use pyo3::{exceptions::PyValueError, prelude::*}; use serde_pickle::SerOptions; use std::{ @@ -33,11 +33,15 @@ use std::{ /// /// Function /// Output: list of all output feature pkl files +/// +/// TODO: Protein-coding gene filtering #[pyfunction] +#[pyo3(signature = (expression_paths, extra_feature_paths, output_folder, valid_ids=None))] fn process_files( expression_paths: Vec, extra_feature_paths: Vec, output_folder: String, + valid_ids: Option>, ) -> PyResult { // Step 1: Identify all unique genes // Across both expression and extra_feature_paths @@ -98,11 +102,16 @@ fn process_files( } } + let mut uniq_gene: Vec = if let Some(valid_ids) = valid_ids { + let valid_ids = AHashSet::from_iter(valid_ids); + uniq_gene.union(&valid_ids).cloned().collect() + } else { + uniq_gene.iter().cloned().collect() + }; + // Save to pickle // TODO: Look at other file formats - let 
mut uniq_gene: Vec = uniq_gene.iter().cloned().collect(); - // Sort genes alphabetically uniq_gene.sort(); let folder_path = Path::new(&output_folder); From 3cab234448bd9bbe14f647681f894e8509b30e05 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:12:14 -0500 Subject: [PATCH 43/54] temporarily hard code paths --- python/funmap/cli.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 29f50c77..95b1338e 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -93,8 +93,13 @@ def rust(): coding = mapping.loc[mapping["coding"] == "coding", ["gene_name"]] coding_genes = list(set(coding["gene_name"].to_list())) funmap_lib.process_files( - ["test_data/dia.tsv"], - ["test_data/all_no_methyl.tsv"], + [ + "dummy/dia.tsv", + "dummy/methyl.tsv", + "dummy/RNAseq.tsv", + "dummy/tmt_abundance", + ], + ["dummy/all_no_methyl.tsv"], "test_data", coding_genes, ) From 457a0b11b66a9013e7f8d6fe803422024e1ee872 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:26:27 -0500 Subject: [PATCH 44/54] better error messaging --- src/lib.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ecd46da7..fa4a87e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,7 +51,7 @@ fn process_files( // Read expression data where first column is gene information; for file_path in expression_paths.iter() { - let file = File::open(file_path)?; + let file = File::open(file_path).expect("Could not read expression data"); let reader = BufReader::new(file); let mut has_header = true; for line in reader.lines() { @@ -77,7 +77,7 @@ fn process_files( // Read extra feature data, where first and second column are genes for file_path in extra_feature_paths.iter() { - let file = File::open(file_path)?; + let file = File::open(file_path).expect("Could not read extra feature file"); let reader = BufReader::new(file); let mut has_header = true; for line in reader.lines() { @@ -116,7 +116,7 @@ fn process_files( uniq_gene.sort(); let folder_path = Path::new(&output_folder); let uniq_gene_file_path = folder_path.join("uniq_gene.pkl"); - let mut w = File::create(uniq_gene_file_path)?; + let mut w = File::create(uniq_gene_file_path).expect("Could not cread uniq_gene.pkl"); serde_pickle::to_writer(&mut w, &uniq_gene, SerOptions::default()).unwrap(); let n = uniq_gene.len(); // Re-align each file @@ -127,7 +127,8 @@ fn process_files( } for file_path in extra_feature_paths { - align_file(&file_path, &gene_index_map, n as i32, folder_path)?; + align_file(&file_path, &gene_index_map, n as i32, folder_path) + .expect("Error aligning file"); } // One column of indices, and one column of values. 
Separated by file Ok(true) From a4f0bc2763ef4800c8da850e4fa64ffb9ec6d1e0 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:32:11 -0500 Subject: [PATCH 45/54] add extra_feature_folder to config creation --- python/funmap/cli.py | 2 +- python/funmap/utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 95b1338e..745e02f9 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -97,7 +97,7 @@ def rust(): "dummy/dia.tsv", "dummy/methyl.tsv", "dummy/RNAseq.tsv", - "dummy/tmt_abundance", + "dummy/tmt_abundance.tsv", ], ["dummy/all_no_methyl.tsv"], "test_data", diff --git a/python/funmap/utils.py b/python/funmap/utils.py index b296601f..2babc5b3 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -345,6 +345,7 @@ def get_config(cfg_file: Path): "step_size": 1000, "lr_cutoff": 50, "only_extra_features": False, + "extra_feature_folder": None, } with open(cfg_file, "r") as fh: @@ -370,6 +371,8 @@ def get_config(cfg_file: Path): if "extra_feature_file" in cfg_dict: cfg["extra_feature_file"] = cfg_dict["extra_feature_file"] + if "extra_feature_folder" in cfg_dict: + cfg["extra_feature_folder"] = cfg_dict["extra_feature_folder"] if "gs_file" in cfg_dict: cfg["gs_file"] = cfg_dict["gs_file"] From 0a478d354280e2f5868ef590742b2f68c29ab94d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:33:35 -0500 Subject: [PATCH 46/54] rename variables --- python/funmap/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 745e02f9..49abc19a 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -139,12 +139,12 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_file = cfg["extra_feature_folder"] + extra_feature_foler = cfg["extra_feature_folder"] extra_feature_df = None uniq_gene = None if extra_feature_folder is not None: log.info("Loading extra feature file into dataframe") - (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_file) + (uniq_gene, extra_feature_df) = new_extra_feature(extra_feature_folder) gs_file = cfg["gs_file"] if (gs_file is not None) and (not check_gold_standard_file(gs_file)): return From 7fa7c0beb4020603a748667e0898a540ddc82c8c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:33:49 -0500 Subject: [PATCH 47/54] fix naming --- python/funmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 49abc19a..53060df4 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -139,7 +139,7 @@ def run(config_file, force_rerun): setup_logging(config_file) cfg = setup_experiment(config_file) - extra_feature_foler = cfg["extra_feature_folder"] + extra_feature_folder = cfg["extra_feature_folder"] extra_feature_df = None uniq_gene = None if extra_feature_folder is not None: From ac7eb2faee99de811901a335ac79f352127e3e9c Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:34:45 -0500 Subject: [PATCH 48/54] add missing imports --- python/funmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/cli.py b/python/funmap/cli.py index 53060df4..9b762b24 100644 --- a/python/funmap/cli.py +++ b/python/funmap/cli.py @@ -27,7 +27,7 @@ check_extra_feature_file, check_gold_standard_file, cleanup_experiment, - process_extra_feature, + new_extra_feature, 
setup_experiment, ) From 36ca663e9bc7be6554b3b97fcbe631fad34cf469 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:37:09 -0500 Subject: [PATCH 49/54] fix function call --- python/funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 2babc5b3..1f77ce27 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -601,7 +601,7 @@ def new_extra_feature(extra_feature_folder): with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: uniq_gene = pickle.load(r) df = read_files_to_dataframe(features) - df = read_files_to_dataframe(df, index_file) + df = reorder_dataframe(df, index_file) return (uniq_gene, index_file) From 61a80a0be1ed669f1b32d5a659a8462f9e5a9bc0 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:47:31 -0500 Subject: [PATCH 50/54] fix reordering dataframe --- python/funmap/utils.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 1f77ce27..2019cad6 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -585,13 +585,21 @@ def read_files_to_dataframe(file_paths): return final_df -def reorder_dataframe(final_df, index_file_path): +def reorder_dataframe(final_df, index_file_path, max_length): + reordered_df = pd.DataFrame(index=range(max_length), columns=final_df.columns) # Read the index file with open(index_file_path, "r") as f: indices = [int(line.strip()) for line in f.readlines()] - # Reorder the DataFrame using the indices - reordered_df = final_df.iloc[indices].reset_index(drop=True) + # Ensure the length of the indices is less than or equal to max_length + if len(indices) > max_length: + raise ValueError("More indices provided than the max_length.") + + # Fill in the reordered DataFrame based on the provided indices + for i, index in enumerate(indices): + if index < max_length: + reordered_df.iloc[index] = final_df.iloc[i] + return reordered_df @@ -600,8 +608,12 @@ def new_extra_feature(extra_feature_folder): features = list(glob.glob(f"{extra_feature_folder}/*.col")) with open(f"{extra_feature_folder}/uniq_gene.pkl", "rb") as r: uniq_gene = pickle.load(r) + i = j = len(uniq_gene) - 1 + n = i + 1 + max_len = i * n - i * (i - 1) // 2 + (j - i) + df = read_files_to_dataframe(features) - df = reorder_dataframe(df, index_file) + df = reorder_dataframe(df, index_file, max_len) return (uniq_gene, index_file) From 8279aecb3cf822530c6a9b14394e8ea3bebe0a01 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:48:06 -0500 Subject: [PATCH 51/54] logging --- python/funmap/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 2019cad6..3ba69fcf 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -613,6 +613,7 @@ def new_extra_feature(extra_feature_folder): max_len = i * n - i * (i - 1) // 2 + (j - i) df = read_files_to_dataframe(features) + log.info(df.head()) df = reorder_dataframe(df, index_file, max_len) return (uniq_gene, index_file) From a177b404cc1e412d2aee2d61b99771213a118dc1 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 14:50:21 -0500 Subject: [PATCH 52/54] fix column names --- python/funmap/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index 3ba69fcf..ba3a9e2f 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -577,7 
+577,11 @@ def read_files_to_dataframe(file_paths): dataframes = [] for file_path in file_paths: # Read file into DataFrame; each line becomes a row in a single column - df = pd.read_csv(file_path, header=None, names=["data"]) + df = pd.read_csv( + file_path, + header=None, + names=[os.path.basename(file_path).replace(".col", "")], + ) dataframes.append(df) # Concatenate all DataFrames into one From e90285d4a6fab858f0bcd0f62a193b7afbe86a24 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 10 Oct 2024 15:35:32 -0500 Subject: [PATCH 53/54] fix return type --- python/funmap/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/funmap/utils.py b/python/funmap/utils.py index ba3a9e2f..9a843796 100644 --- a/python/funmap/utils.py +++ b/python/funmap/utils.py @@ -619,7 +619,7 @@ def new_extra_feature(extra_feature_folder): df = read_files_to_dataframe(features) log.info(df.head()) df = reorder_dataframe(df, index_file, max_len) - return (uniq_gene, index_file) + return (uniq_gene, df) def process_extra_feature(extra_feature_file) -> pd.DataFrame: From 4be7af8692ce8ad2cbf700a2b014085c39851e6d Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Fri, 11 Oct 2024 12:06:58 -0500 Subject: [PATCH 54/54] build sqlite db for extra_features --- Cargo.lock | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + src/lib.rs | 142 +++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 263 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6150eee1..aa9638c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,24 +21,74 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "cc" +version = "1.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" +dependencies = [ + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "funmap_lib" version = "0.1.0" 
 dependencies = [
  "ahash",
+ "csv",
  "pyo3",
+ "rusqlite",
  "serde-pickle",
 ]
@@ -53,6 +103,24 @@ dependencies = [
  "wasi",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "hashlink"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
+dependencies = [
+ "hashbrown",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -71,12 +139,35 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4"
 
+[[package]]
+name = "itoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+
 [[package]]
 name = "libc"
 version = "0.2.159"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
 
+[[package]]
+name = "libsqlite3-sys"
+version = "0.30.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
+dependencies = [
+ "cc",
+ "pkg-config",
+ "vcpkg",
+]
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
 [[package]]
 name = "memoffset"
 version = "0.9.1"
@@ -120,6 +211,12 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "pkg-config"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
+
 [[package]]
 name = "portable-atomic"
 version = "1.8.0"
@@ -207,6 +304,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rusqlite"
+version = "0.32.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
+dependencies = [
+ "bitflags",
+ "fallible-iterator",
+ "fallible-streaming-iterator",
+ "hashlink",
+ "libsqlite3-sys",
+ "smallvec",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
 [[package]]
 name = "serde"
 version = "1.0.210"
@@ -240,6 +357,18 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+
 [[package]]
 name = "syn"
 version = "2.0.77"
@@ -269,6 +398,12 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
 
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
 [[package]]
 name = "version_check"
 version = "0.9.5"
diff --git a/Cargo.toml b/Cargo.toml
index 80d83754..e3c902b7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,5 +10,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 ahash = "0.8.11"
+csv = "1.3.0"
 pyo3 = "0.22.0"
+rusqlite = { version = "0.32.1", features = ["bundled"] }
 serde-pickle = "1.1.1"
diff --git a/src/lib.rs b/src/lib.rs
index fa4a87e5..14090ecb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 use ahash::{AHashMap, AHashSet, HashSet, HashSetExt};
+use csv::ReaderBuilder;
 use pyo3::{exceptions::PyValueError, prelude::*};
+use rusqlite::{params, params_from_iter, Connection, Result};
 use serde_pickle::SerOptions;
 use std::{
     fs::File,
@@ -127,12 +129,118 @@ fn process_files(
     }
 
     for file_path in extra_feature_paths {
-        align_file(&file_path, &gene_index_map, n as i32, folder_path)
-            .expect("Error aligning file");
+        new(&file_path, &gene_index_map, n as i32, folder_path).expect("Error aligning file");
     }
     // One column of indices, and one column of values. Separated by file
     Ok(true)
 }
+// Function to parse a string into a float, returning None for invalid or NaN values
+fn safe_parse_float(s: &str) -> Option<f64> {
+    match s.parse::<f64>() {
+        Ok(f) if f.is_nan() => None, // Explicitly keep NaNs as None
+        Ok(f) => Some(f),
+        Err(_) => None, // If it can't be parsed, treat as None (null)
+    }
+}
+fn new(
+    path: &String,
+    uniq_gene: &AHashMap<&String, usize>,
+    n: i32,
+    output_folder: &Path,
+) -> PyResult<bool> {
+    // Create SQLite connection
+    let conn = Connection::open("db.sqlite").unwrap();
+
+    // Open the TSV file
+    let file_path = path;
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .from_path(file_path)
+        .unwrap();
+
+    // Get the headers to determine feature names dynamically
+    let headers = rdr.headers().unwrap().clone();
+
+    // Features are all columns after the first two columns
+    let feature_names: Vec<&str> = headers
+        .iter()
+        .skip(2) // Skip the first two columns
+        .collect();
+
+    // Dynamically create the SQL table with the appropriate number of features, all as FLOAT
+    let feature_columns: Vec<String> = feature_names
+        .iter()
+        .map(|name| format!("{} FLOAT", name))
+        .collect();
+
+    let create_table_query = format!(
+        "CREATE TABLE IF NOT EXISTS gene_data (
+            index_id TEXT PRIMARY KEY,
+            {}
+        )",
+        feature_columns.join(", ")
+    );
+    conn.execute(&create_table_query, []).unwrap();
+
+    // Prepare to write features to a separate file (feature names only)
+    let feature_file = File::create("features.txt")?;
+    let mut feature_writer = BufWriter::new(feature_file);
+
+    // Write feature names to the file
+    for feature in &feature_names {
+        writeln!(feature_writer, "{}", feature)?;
+    }
+
+    // Read each record from TSV and insert it into the database
+    for result in rdr.records() {
+        let record = result.unwrap();
+        let column1 = &record[0];
+        let column2 = &record[1];
+        let index1 = uniq_gene.get(&column1.to_string());
+        let index2 = uniq_gene.get(&column2.to_string());
+        if let (Some(index1), Some(index2)) = (index1, index2) {
+            let (i, j) = if index1 <= index2 {
+                (*index1 as i32, *index2 as i32)
+            } else {
+                (*index2 as i32, *index1 as i32)
+            };
+            let index_id = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
+
+            // Extract feature values, converting them to Option<f64> to handle nulls/NaNs
+            let feature_values: Vec<Option<f64>> = record
+                .iter()
+                .skip(2) // Skip the first two columns
+                .map(safe_parse_float) // Handle nulls and invalid values
+                .collect();
+
+            // Insert into SQLite database using dynamic query
+            let insert_query = format!(
+                "INSERT OR REPLACE INTO gene_data (index_id, {})
+                VALUES (?1, {})",
+                feature_names.join(", "),
+                feature_values
+                    .iter()
+                    .enumerate()
+                    .map(|(i, _)| format!("?{}", i + 2))
+                    .collect::<Vec<String>>()
+                    .join(", ")
+            );
+
+            // Collect the parameters for the query, using None for null/NaN values
+            let mut params_vec: Vec<&(dyn rusqlite::ToSql + Sync)> = vec![&index_id];
+            for val in &feature_values {
+                match val {
+                    Some(v) => params_vec.push(v),
+                    None => params_vec.push(&rusqlite::types::Null),
+                }
+            }
+
+            conn.execute(&insert_query, params_from_iter(params_vec.iter()))
+                .unwrap();
+        }
+    }
+    Ok(true)
+}
 
 fn align_file(
     path: &String,
@@ -176,20 +284,22 @@ fn align_file(
         }
         let row: Vec<&str> = line.split('\t').collect();
         if row.len() > 2 {
-            let index1 = uniq_gene.get(&row[0].to_string()).unwrap();
-            let index2 = uniq_gene.get(&row[1].to_string()).unwrap();
-            let (i, j) = if index1 <= index2 {
-                (*index1 as i32, *index2 as i32)
-            } else {
-                (*index2 as i32, *index1 as i32)
-            };
-            let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
-            indices.push(new_index);
-            writers[0].write_all(new_index.to_string().as_bytes())?;
-            writers[0].write_all(b"\n")?;
-            for i in 0..feature_count {
-                writers[i + 1].write_all(row[i + 2].as_bytes())?;
-                writers[i + 1].write_all(b"\n")?;
+            let index1 = uniq_gene.get(&row[0].to_string());
+            let index2 = uniq_gene.get(&row[1].to_string());
+            if let (Some(index1), Some(index2)) = (index1, index2) {
+                let (i, j) = if index1 <= index2 {
+                    (*index1 as i32, *index2 as i32)
+                } else {
+                    (*index2 as i32, *index1 as i32)
+                };
+                let new_index = (i * n - i * (i - 1) / 2 + (j - i)) as usize;
+                indices.push(new_index);
+                writers[0].write_all(new_index.to_string().as_bytes())?;
+                writers[0].write_all(b"\n")?;
+                for i in 0..feature_count {
+                    writers[i + 1].write_all(row[i + 2].as_bytes())?;
+                    writers[i + 1].write_all(b"\n")?;
+                }
             }
         }
     }
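Both the Python max_len computation in patch 50 and the Rust index_id above rely on the same condensed indexing of gene pairs. A minimal Python sketch of that mapping follows; the helper names pair_index and total_pairs are illustrative only and do not appear in the repository. Note that with i = j = n - 1 the expression evaluates to n * (n + 1) // 2 - 1, the largest index, so a buffer meant to hold every pair needs n * (n + 1) // 2 rows, one more than that value.

# Sketch only: condensed pair index assumed by the patches above.
def pair_index(i: int, j: int, n: int) -> int:
    """Map a gene pair (i, j) with 0 <= i <= j < n to a linear index over the
    upper triangle of an n x n matrix, diagonal included, enumerated row by row."""
    if i > j:
        i, j = j, i
    return i * n - i * (i - 1) // 2 + (j - i)

def total_pairs(n: int) -> int:
    # One more than the largest index, pair_index(n - 1, n - 1, n).
    return n * (n + 1) // 2

# Quick self-check for a small n: every pair maps to a distinct in-range index.
n = 5
indices = [pair_index(i, j, n) for i in range(n) for j in range(i, n)]
assert sorted(indices) == list(range(total_pairs(n)))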
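The rusqlite insertion above builds its INSERT statement dynamically, with one ? placeholder per feature column, and binds rusqlite::types::Null wherever safe_parse_float returned None. The Python fragment below, using only the standard sqlite3 module, is a rough equivalent for inspecting or reproducing that behavior; the gene_data table and db.sqlite file name follow the patch, while the insert_row helper and the column names f1 and f2 are assumptions made for illustration.

import math
import sqlite3

def insert_row(conn, index_id, feature_names, values):
    # NaN and missing entries become None, which sqlite3 binds as SQL NULL.
    cleaned = [None if v is None or math.isnan(v) else v for v in values]
    placeholders = ", ".join("?" for _ in feature_names)
    sql = (
        f"INSERT OR REPLACE INTO gene_data (index_id, {', '.join(feature_names)}) "
        f"VALUES (?, {placeholders})"
    )
    conn.execute(sql, [index_id, *cleaned])

conn = sqlite3.connect("db.sqlite")  # same database file name the Rust code opens
conn.execute(
    "CREATE TABLE IF NOT EXISTS gene_data (index_id TEXT PRIMARY KEY, f1 FLOAT, f2 FLOAT)"
)
insert_row(conn, 7, ["f1", "f2"], [0.42, float("nan")])  # the NaN is stored as NULL
conn.commit()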