Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions demo/smoke_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Smoke test for the modernized FRoGS utilities.

Exercises:
* ``src/utils/parallel.py``: sequential + parallel paths of ``parallel.map``.
* ``src/utils/io_utils.py``: transparent ``.gz`` fallback in ``read_csv_auto``.

Run it from the ``demo/`` directory::

python smoke_test.py

No model weights or large datasets are required; we only read the first
few rows of the shipped L1000 file to prove the code path works.
"""
from __future__ import annotations

import os
import sys
import time

# Make `src/` importable without installing the package: resolve this file's
# directory, step up to the repo root, and prepend `<root>/src` to sys.path.
HERE = os.path.dirname(os.path.abspath(__file__))
REPO_ROOT = os.path.dirname(HERE)
SRC = os.path.join(REPO_ROOT, "src")
sys.path.insert(0, SRC)

# These imports must come *after* the sys.path tweak above, hence the noqa.
from utils import parallel  # noqa: E402 (import after sys.path tweak)
from utils.io_utils import read_csv_auto, resolve_data_path  # noqa: E402


def square(x: int) -> int:
    """Return ``x`` squared, after a short pause that simulates real work.

    The delay makes the parallel path in ``parallel.map`` measurably
    distinct from the sequential one during the smoke test.
    """
    time.sleep(0.01)
    return x ** 2


def main() -> int:
    """Run the three smoke checks in order.

    Returns:
        0 on success; a failed check aborts via ``AssertionError`` instead.
    """
    print("[1/3] Sequential parallel.map...")
    # n_CPU=1 should exercise the sequential code path in utils.parallel.
    out = parallel.map(square, list(range(8)), n_CPU=1, progress=True)
    assert out == [x * x for x in range(8)], out
    print(" OK", out)

    print("[2/3] Parallel parallel.map (n_CPU=2)...")
    # Same inputs through the multiprocess path; results must match the
    # sequential run exactly (and in the same order).
    out = parallel.map(square, list(range(8)), n_CPU=2, progress=True)
    assert out == [x * x for x in range(8)], out
    print(" OK", out)

    print("[3/3] read_csv_auto with transparent .gz fallback...")
    # The shipped repo has `L1000_PhaseI_and_II.csv.gz` but scripts default
    # to the `.csv` name. resolve_data_path should find the .gz sibling.
    requested = os.path.join(REPO_ROOT, "data", "L1000_PhaseI_and_II.csv")
    resolved = resolve_data_path(requested)
    print(f" resolved {requested!r} -> {resolved!r}")
    # nrows=3 keeps this cheap: we only need proof that the file opens
    # and parses, not the full dataset.
    df = read_csv_auto(requested, nrows=3)
    print(" OK, head:")
    print(df.head(3).to_string(index=False))

    print("All smoke checks passed.")
    return 0


if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, identical to
    # `raise SystemExit(main())` but in the more conventional spelling.
    sys.exit(main())
16 changes: 16 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# FRoGS runtime dependencies.
#
# The ranges below reflect versions that are known to work with the
# modernized code in this repo (Python 3.9–3.12, TensorFlow 2.x,
# macOS / Linux). Loosen them cautiously.

numpy>=1.21,<3
pandas>=1.3,<3
scikit-learn>=1.0
scipy>=1.7
tensorflow>=2.8,<3
tqdm>=4.60
matplotlib>=3.4
requests>=2.28
# goatools is used by src/utils/random_walk.py to traverse GO term parents.
goatools>=1.2
1 change: 0 additions & 1 deletion src/gene_vec_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import gc
from utils.random_walk import random_walk_w_restart as rwr
from utils.sampling_util import rw_sampling
from tensorflow.python.keras import backend as K
from tensorflow.keras import layers, losses
from tensorflow import keras
from tensorflow.keras.models import Model
Expand Down
29 changes: 24 additions & 5 deletions src/l1000_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,24 @@
import glob
import math
import csv
import warnings
import tensorflow as tf
from utils import parallel
from tensorflow.python.keras import backend as K
from utils.io_utils import read_csv_auto, validate_required_files, ensure_dir
from tensorflow.keras import layers, losses
from tensorflow import keras
from tensorflow.keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import argparse

if int(tf.__version__.split('.')[0]) < 2:
warnings.warn(
f"FRoGS requires TensorFlow >= 2.x (installed: {tf.__version__}); "
"behaviour is undefined on older versions.",
RuntimeWarning,
)

def parse_args():
parser = argparse.ArgumentParser(description='Train l1000 model')
parser.add_argument('--cpdlist_file', default='../data/compound_list_test.txt',
Expand Down Expand Up @@ -46,7 +54,7 @@ def get_model(fp_dim, hid_dim = 2048):

merge = layers.Multiply()([denseout_cpd, denseout_target])
denseclassifier = keras.Sequential([
layers.Dense(hid_dim/4),
layers.Dense(hid_dim // 4),
layers.BatchNormalization(),
layers.ReLU(),
layers.Dense(1)
Expand Down Expand Up @@ -202,6 +210,17 @@ def compute_gene_weight(mat, sim_mean, sim_std):
args = parse_args()
cpdlist_file, sig_file, perttype, emb_go, emb_archs4, outdir = args.cpdlist_file, args.sig_file, args.perttype, args.emb_go, args.emb_archs4, args.outdir

ok, missing = validate_required_files([cpdlist_file, sig_file, emb_go, emb_archs4])
if not ok:
sys.stderr.write(
"ERROR: the following required input files could not be found\n"
"(a .gz counterpart was also tried for each):\n - "
+ "\n - ".join(missing)
+ "\n"
)
sys.exit(2)
ensure_dir(outdir)

Comment on lines +213 to +223
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as in l1000_model.py: validate_required_files() will accept *.gz alternates, but the code later calls open(emb_archs4, ...), open(emb_go, ...), and open(cpdlist_file) using the original (possibly missing) paths. To avoid “validated but still FileNotFoundError”, resolve and use the actual existing path(s) after validation.

Copilot uses AI. Check for mistakes.
with open(emb_archs4, mode='r') as infile:
reader = csv.reader(infile)
archs4_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}
Expand All @@ -214,7 +233,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):

#read gene signature file
id2sig = {}
t_sig=pd.read_csv(sig_file)
t_sig = read_csv_auto(sig_file)
pert_sig = []
tasks = []
cnt = 0
Expand All @@ -237,7 +256,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
tasks.append((l1k, S_hit))
all_cl.add(cl)

id_map=pd.read_csv('../data/term2gene_id.csv')
id_map = read_csv_auto('../data/term2gene_id.csv')
term2geneid = {}
for idx in id_map.index:
term2geneid[id_map.loc[idx, 'term_name']] = str(id_map.loc[idx, 'gene_id'])
Expand Down Expand Up @@ -268,7 +287,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):

print("Compute embeddings of gene signatures...")
sig2vec_dic = {}
rslt=parallel.map(get_vec, tasks, n_CPU=10, progress=False)
rslt = parallel.map(get_vec, tasks, n_CPU=10, progress=True)
for dic in rslt:
for k in dic:
sig2vec_dic[k] = dic[k]
Expand Down
34 changes: 28 additions & 6 deletions src/l1000_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,24 @@
import glob
import math
import csv
import warnings
import tensorflow as tf
from utils import parallel
from tensorflow.python.keras import backend as K
from utils.io_utils import read_csv_auto, validate_required_files, ensure_dir
from tensorflow.keras import layers, losses
from tensorflow import keras
from tensorflow.keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import argparse

if int(tf.__version__.split('.')[0]) < 2:
warnings.warn(
f"FRoGS requires TensorFlow >= 2.x (installed: {tf.__version__}); "
"behaviour is undefined on older versions.",
RuntimeWarning,
)

def parse_args():
parser = argparse.ArgumentParser(description='Train l1000 model')
parser.add_argument('--cpdlist_file', default='../data/compound_list_shRNA.txt',
Expand Down Expand Up @@ -52,7 +60,7 @@ def get_model(fp_dim, hid_dim = 2048):

merge = layers.Multiply()([denseout_cpd, denseout_target])
denseclassifier = keras.Sequential([
layers.Dense(hid_dim/4),
layers.Dense(hid_dim // 4),
layers.BatchNormalization(),
layers.ReLU(),
layers.Dense(1)
Expand Down Expand Up @@ -289,6 +297,20 @@ def compute_gene_weight(mat, sim_mean, sim_std):
args = parse_args()
cpdlist_file, target_file, sig_file, perttype, emb_go, emb_archs4, epochs, outdir, modeldir = args.cpdlist_file, args.target_file, args.sig_file, args.perttype, args.emb_go, args.emb_archs4, args.epochs, args.outdir, args.modeldir

# Validate all required inputs up-front so the user learns about every
# missing file at once rather than one failure at a time.
ok, missing = validate_required_files([cpdlist_file, target_file, sig_file, emb_go, emb_archs4])
if not ok:
sys.stderr.write(
"ERROR: the following required input files could not be found\n"
"(a .gz counterpart was also tried for each):\n - "
+ "\n - ".join(missing)
+ "\n"
)
sys.exit(2)
ensure_dir(outdir)
ensure_dir(modeldir)
Comment on lines +300 to +312
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

validate_required_files() treats a .gz sibling as acceptable, but the script continues using the original paths (e.g., open(emb_archs4, ...) and open(cpdlist_file) below). This means validation can succeed when only *.gz exists, and then the script still fails at the first open(...). Consider resolving each validated path (e.g., via resolve_data_path) and reassigning cpdlist_file/emb_go/emb_archs4/etc. to the resolved filename (and using gzip.open when appropriate), or limiting validation to the exact filenames you later open directly.

Copilot uses AI. Check for mistakes.

with open(emb_archs4, mode='r') as infile:
reader = csv.reader(infile)
archs4_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}
Expand All @@ -298,7 +320,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
go_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}

mean_std_dict_go, mean_std_dict_archs4 = gene_sim_mean_std()
t_target=pd.read_csv(target_file)
t_target = read_csv_auto(target_file)
cpd2target={}
target_gene_id = t_target.Broad_target_gene_id.tolist()

Expand Down Expand Up @@ -330,7 +352,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):

#read gene signature file
id2sig = {}
t_sig=pd.read_csv(sig_file)
t_sig = read_csv_auto(sig_file)
pert_sig = []
tasks = []
cnt = 0
Expand All @@ -354,7 +376,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
tasks.append((l1k, S_hit))
all_cl.add(cl)

id_map=pd.read_csv('../data/term2gene_id.csv')
id_map = read_csv_auto('../data/term2gene_id.csv')
term2geneid = {}
for idx in id_map.index:
term2geneid[id_map.loc[idx, 'term_name']] = str(id_map.loc[idx, 'gene_id'])
Expand Down Expand Up @@ -384,7 +406,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):

print("Compute embeddings of gene signatures...")
sig2vec_dic = {}
rslt=parallel.map(get_vec, tasks, n_CPU=10, progress=False)
rslt = parallel.map(get_vec, tasks, n_CPU=10, progress=True)
for dic in rslt:
for k in dic:
sig2vec_dic[k] = dic[k]
Expand Down
Loading
Loading