Fix FRoGS compatibility on modern Python / TF 2.x / macOS #2
demo/smoke_test.py (new file, +60 lines):

"""Smoke test for the modernized FRoGS utilities.

Exercises:
* ``src/utils/parallel.py``: sequential + parallel paths of ``parallel.map``.
* ``src/utils/io_utils.py``: transparent ``.gz`` fallback in ``read_csv_auto``.

Run it from the ``demo/`` directory::

    python smoke_test.py

No model weights or large datasets are required; we only read the first
few rows of the shipped L1000 file to prove the code path works.
"""
from __future__ import annotations

import os
import sys
import time

HERE = os.path.dirname(os.path.abspath(__file__))
REPO_ROOT = os.path.dirname(HERE)
SRC = os.path.join(REPO_ROOT, "src")
sys.path.insert(0, SRC)

from utils import parallel  # noqa: E402 (import after sys.path tweak)
from utils.io_utils import read_csv_auto, resolve_data_path  # noqa: E402


def square(x: int) -> int:
    time.sleep(0.01)
    return x * x


def main() -> int:
    print("[1/3] Sequential parallel.map...")
    out = parallel.map(square, list(range(8)), n_CPU=1, progress=True)
    assert out == [x * x for x in range(8)], out
    print(" OK", out)

    print("[2/3] Parallel parallel.map (n_CPU=2)...")
    out = parallel.map(square, list(range(8)), n_CPU=2, progress=True)
    assert out == [x * x for x in range(8)], out
    print(" OK", out)

    print("[3/3] read_csv_auto with transparent .gz fallback...")
    # The shipped repo has `L1000_PhaseI_and_II.csv.gz` but scripts default
    # to the `.csv` name. resolve_data_path should find the .gz sibling.
    requested = os.path.join(REPO_ROOT, "data", "L1000_PhaseI_and_II.csv")
    resolved = resolve_data_path(requested)
    print(f" resolved {requested!r} -> {resolved!r}")
    df = read_csv_auto(requested, nrows=3)
    print(" OK, head:")
    print(df.head(3).to_string(index=False))

    print("All smoke checks passed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
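Neither ``resolve_data_path`` nor ``read_csv_auto`` appears in this diff (they live in ``src/utils/io_utils.py``). As orientation, here is a minimal sketch of the contract the smoke test relies on; this is an illustration inferred from the call sites, not the PR's actual implementation:

    # Hypothetical sketch of the io_utils contract exercised above.
    import os
    import pandas as pd

    def resolve_data_path(path):
        # Return `path` if it exists; otherwise its `.gz` sibling; otherwise raise.
        if os.path.exists(path):
            return path
        gz = path + ".gz"
        if os.path.exists(gz):
            return gz
        raise FileNotFoundError(f"neither {path!r} nor {gz!r} exists")

    def read_csv_auto(path, **kwargs):
        # pandas infers gzip compression from a `.gz` suffix, so resolving
        # the file name is the only extra step needed.
        return pd.read_csv(resolve_data_path(path), **kwargs)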
requirements.txt (new file, +16 lines):

# FRoGS runtime dependencies.
#
# The ranges below reflect versions that are known to work with the
# modernized code in this repo (Python 3.9–3.12, TensorFlow 2.x,
# macOS / Linux). Loosen them cautiously.

numpy>=1.21,<3
pandas>=1.3,<3
scikit-learn>=1.0
scipy>=1.7
tensorflow>=2.8,<3
tqdm>=4.60
matplotlib>=3.4
requests>=2.28
# goatools is used by src/utils/random_walk.py to traverse GO term parents.
goatools>=1.2
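A typical setup with these pins, assuming a Python 3.9+ interpreter on PATH:

    python -m venv .venv
    source .venv/bin/activate
    pip install -r requirements.txt
    cd demo && python smoke_test.py   # quick post-install sanity check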
Training script for the l1000 model (modified):

@@ -5,16 +5,24 @@
 import glob
 import math
 import csv
+import warnings
 import tensorflow as tf
 from utils import parallel
 from tensorflow.python.keras import backend as K
+from utils.io_utils import read_csv_auto, validate_required_files, ensure_dir
 from tensorflow.keras import layers, losses
 from tensorflow import keras
 from tensorflow.keras.models import Model
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
 import argparse

+if int(tf.__version__.split('.')[0]) < 2:
+    warnings.warn(
+        f"FRoGS requires TensorFlow >= 2.x (installed: {tf.__version__}); "
+        "behaviour is undefined on older versions.",
+        RuntimeWarning,
+    )

 def parse_args():
     parser = argparse.ArgumentParser(description='Train l1000 model')
     parser.add_argument('--cpdlist_file', default='../data/compound_list_shRNA.txt',
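One import the hunk above leaves untouched is worth a note: ``from tensorflow.python.keras import backend as K`` goes through TensorFlow's private ``tensorflow.python`` namespace, for which newer releases make no stability promises. If ``K`` is still needed, the public spelling is:

    from tensorflow.keras import backend as K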
@@ -52,7 +60,7 @@ def get_model(fp_dim, hid_dim = 2048):
     merge = layers.Multiply()([denseout_cpd, denseout_target])
     denseclassifier = keras.Sequential([
-        layers.Dense(hid_dim/4),
+        layers.Dense(hid_dim // 4),
         layers.BatchNormalization(),
         layers.ReLU(),
         layers.Dense(1)
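Context for the one-character change above: in Python 3, ``/`` is true division, so ``hid_dim/4`` yields a float, while ``Dense(units=...)`` expects an integer (recent Keras versions warn about or reject float ``units``). ``//`` keeps the arithmetic in ``int``:

    hid_dim = 2048
    hid_dim / 4    # 512.0, a float: brittle as a `units` argument
    hid_dim // 4   # 512, an int: what Dense expects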
@@ -289,6 +297,20 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 args = parse_args()
 cpdlist_file, target_file, sig_file, perttype, emb_go, emb_archs4, epochs, outdir, modeldir = args.cpdlist_file, args.target_file, args.sig_file, args.perttype, args.emb_go, args.emb_archs4, args.epochs, args.outdir, args.modeldir

+# Validate all required inputs up-front so the user learns about every
+# missing file at once rather than one failure at a time.
+ok, missing = validate_required_files([cpdlist_file, target_file, sig_file, emb_go, emb_archs4])
+if not ok:
+    sys.stderr.write(
+        "ERROR: the following required input files could not be found\n"
+        "(a .gz counterpart was also tried for each):\n - "
+        + "\n - ".join(missing)
+        + "\n"
+    )
+    sys.exit(2)
+ensure_dir(outdir)
+ensure_dir(modeldir)

 with open(emb_archs4, mode='r') as infile:
     reader = csv.reader(infile)
     archs4_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}

[A review comment is attached to lines +300 to +312, the validation block above; see the end of the page.]
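``validate_required_files`` and ``ensure_dir`` also come from the new ``io_utils`` module, which this excerpt does not include. Judging purely from the call sites above, plausible sketches (hypothetical) are:

    import os

    def validate_required_files(paths):
        # A path counts as present if it or its .gz sibling exists.
        missing = [p for p in paths
                   if not (os.path.exists(p) or os.path.exists(p + ".gz"))]
        return (not missing, missing)

    def ensure_dir(path):
        os.makedirs(path, exist_ok=True)  # no-op when the directory exists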
@@ -298,7 +320,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
     go_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}

 mean_std_dict_go, mean_std_dict_archs4 = gene_sim_mean_std()
-t_target=pd.read_csv(target_file)
+t_target = read_csv_auto(target_file)
 cpd2target={}
 target_gene_id = t_target.Broad_target_gene_id.tolist()
@@ -330,7 +352,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 #read gene signature file
 id2sig = {}
-t_sig=pd.read_csv(sig_file)
+t_sig = read_csv_auto(sig_file)
 pert_sig = []
 tasks = []
 cnt = 0
@@ -354,7 +376,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
     tasks.append((l1k, S_hit))
     all_cl.add(cl)

-id_map=pd.read_csv('../data/term2gene_id.csv')
+id_map = read_csv_auto('../data/term2gene_id.csv')
 term2geneid = {}
 for idx in id_map.index:
     term2geneid[id_map.loc[idx, 'term_name']] = str(id_map.loc[idx, 'gene_id'])
@@ -384,7 +406,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 print("Compute embeddings of gene signatures...")
 sig2vec_dic = {}
-rslt=parallel.map(get_vec, tasks, n_CPU=10, progress=False)
+rslt = parallel.map(get_vec, tasks, n_CPU=10, progress=True)
 for dic in rslt:
     for k in dic:
         sig2vec_dic[k] = dic[k]
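``utils/parallel.py`` is the other helper module not shown here. The call above, together with the smoke test, pins down its contract: order-preserving results, an ``n_CPU`` switch between sequential and pooled execution, and an optional progress bar. A hypothetical sketch consistent with that contract:

    import multiprocessing
    from tqdm import tqdm

    def map(func, tasks, n_CPU=1, progress=False):  # intentionally shadows builtins.map
        # Order-preserving map over `tasks`; `func` must be picklable when n_CPU > 1.
        if n_CPU <= 1:
            it = (func(t) for t in tasks)
            if progress:
                it = tqdm(it, total=len(tasks))
            return list(it)
        with multiprocessing.Pool(n_CPU) as pool:
            results = pool.imap(func, tasks)  # imap preserves input order
            if progress:
                results = tqdm(results, total=len(tasks))
            return list(results)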
Review comment (on lines +300 to +312):

Same issue as in l1000_model.py: validate_required_files() will accept *.gz alternates, but the code later calls open(emb_archs4, ...), open(emb_go, ...), and open(cpdlist_file) using the original (possibly missing) paths. To avoid “validated but still FileNotFoundError”, resolve and use the actual existing path(s) after validation.
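A minimal sketch of the fix the reviewer is asking for (hypothetical; it assumes the ``resolve_data_path`` helper from this PR). The subtle part is that a resolved path may end in ``.gz``, so the later ``open(...)`` calls must become gzip-aware as well, otherwise ``csv.reader`` would receive compressed bytes:

    import gzip

    def open_text_auto(path):
        # Open a validated input as text, whether plain or gzip-compressed.
        resolved = resolve_data_path(path)  # original path or its .gz sibling
        if resolved.endswith(".gz"):
            return gzip.open(resolved, mode="rt")
        return open(resolved, mode="r")

    # e.g. instead of `with open(emb_archs4, mode='r') as infile:`
    with open_text_auto(emb_archs4) as infile:
        reader = csv.reader(infile)
        archs4_emb = {rows[0]: np.array(rows[1:], dtype=np.float32) for rows in reader}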