From f31923c48ea3a47f9c721926cd7786f05b2f63f9 Mon Sep 17 00:00:00 2001
From: "David S. Reiner" <420230+chronicgiardia@users.noreply.github.com>
Date: Wed, 22 Apr 2026 04:26:51 -0700
Subject: [PATCH] Fix FRoGS compatibility on modern Python/TF/macOS

Modernize the codebase so FRoGS runs cleanly on Python 3.9-3.12,
TensorFlow 2.x, and macOS.

Multiprocessing (src/utils/parallel.py)
- Rewrite parallel.map on top of concurrent.futures.ProcessPoolExecutor.
- Default to the 'fork' start method on macOS so worker processes inherit
  module-level globals (go_emb, archs4_emb, ...) that the FRoGS worker
  functions rely on; this was the root cause of hangs on macOS.
- Drop legacy 'six' / __future__ imports.
- Add FROGS_N_CPU and FROGS_MP_START env overrides.
- Aggregate worker exceptions into a single RuntimeError instead of
  silently deadlocking.
- Optional tqdm progress bar when available.
- Retain legacy MP/parmap/parprep helpers for backwards compatibility.

Auto .gz data handling (src/utils/io_utils.py, new)
- read_csv_auto() transparently resolves .csv <-> .csv.gz.
- validate_required_files() reports every missing input at once.
- Wired into l1000_model.py and l1000_inference.py for sig_file,
  target_file, and term2gene_id.csv.

TensorFlow 2.x compatibility
- Remove private 'from tensorflow.python.keras import backend as K'
  imports from l1000_model.py, l1000_inference.py, gene_vec_model.py,
  and snp_gene_vec_model.py (K was unused).
- Fix layers.Dense(hid_dim/4) -> layers.Dense(hid_dim // 4).
- Warn at import time if tf.__version__ is < 2.x.

Error handling and usability
- Up-front file validation in both L1000 scripts with a single
  consolidated error message (including .gz alternates).
- ensure_dir() auto-creates outdir/modeldir.
- parallel.map(..., progress=True) wired into the embedding step so
  tqdm bars render during long runs.
- Fix four literal '\\n' output writes in snp_gene_vec_model.py that
  would have emitted a backslash-n instead of a newline.
- Add requirements.txt pinning compatible ranges for numpy, pandas,
  scikit-learn, scipy, tensorflow, tqdm, matplotlib, requests,
  goatools.

Validation
- py_compile passes on every edited file.
- New demo/smoke_test.py exercises parallel.map (seq + par) and
  read_csv_auto; passes end-to-end against the shipped data file.

Co-Authored-By: Oz <oz-agent@warp.dev>
---
 demo/smoke_test.py        |  60 ++++++++
 requirements.txt          |  16 ++
 src/gene_vec_model.py     |   1 -
 src/l1000_inference.py    |  29 +++-
 src/l1000_model.py        |  34 +++-
 src/snp_gene_vec_model.py | 315 ++++++++++++++++++++++++++++++++++++++
 src/utils/io_utils.py     |  90 +++++++++++
 src/utils/parallel.py     | 204 +++++++++++++++++-------
 8 files changed, 684 insertions(+), 65 deletions(-)
 create mode 100644 demo/smoke_test.py
 create mode 100644 requirements.txt
 create mode 100644 src/snp_gene_vec_model.py
 create mode 100644 src/utils/io_utils.py

diff --git a/demo/smoke_test.py b/demo/smoke_test.py
new file mode 100644
index 0000000..3f403ed
--- /dev/null
+++ b/demo/smoke_test.py
@@ -0,0 +1,60 @@
+"""Smoke test for the modernized FRoGS utilities.
+
+Exercises:
+ * ``src/utils/parallel.py``: sequential + parallel paths of ``parallel.map``.
+ * ``src/utils/io_utils.py``: transparent ``.gz`` fallback in ``read_csv_auto``.
+
+Run it from the ``demo/`` directory::
+
+    python smoke_test.py
+
+No model weights or large datasets are required; we only read the first
+few rows of the shipped L1000 file to prove the code path works.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import time
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+REPO_ROOT = os.path.dirname(HERE)
+SRC = os.path.join(REPO_ROOT, "src")
+sys.path.insert(0, SRC)
+
+from utils import parallel  # noqa: E402  (import after sys.path tweak)
+from utils.io_utils import read_csv_auto, resolve_data_path  # noqa: E402
+
+
+def square(x: int) -> int:
+    time.sleep(0.01)
+    return x * x
+
+
+def main() -> int:
+    print("[1/3] Sequential parallel.map...")
+    out = parallel.map(square, list(range(8)), n_CPU=1, progress=True)
+    assert out == [x * x for x in range(8)], out
+    print("   OK", out)
+
+    print("[2/3] Parallel parallel.map (n_CPU=2)...")
+    out = parallel.map(square, list(range(8)), n_CPU=2, progress=True)
+    assert out == [x * x for x in range(8)], out
+    print("   OK", out)
+
+    print("[3/3] read_csv_auto with transparent .gz fallback...")
+    # The shipped repo has `L1000_PhaseI_and_II.csv.gz` but scripts default
+    # to the `.csv` name.  resolve_data_path should find the .gz sibling.
+    requested = os.path.join(REPO_ROOT, "data", "L1000_PhaseI_and_II.csv")
+    resolved = resolve_data_path(requested)
+    print(f"   resolved {requested!r} -> {resolved!r}")
+    df = read_csv_auto(requested, nrows=3)
+    print("   OK, head:")
+    print(df.head(3).to_string(index=False))
+
+    print("All smoke checks passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c10fcee
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+# FRoGS runtime dependencies.
+#
+# The ranges below reflect versions that are known to work with the
+# modernized code in this repo (Python 3.9–3.12, TensorFlow 2.x,
+# macOS / Linux).  Loosen them cautiously.
+
+numpy>=1.21,<3
+pandas>=1.3,<3
+scikit-learn>=1.0
+scipy>=1.7
+tensorflow>=2.8,<3
+tqdm>=4.60
+matplotlib>=3.4
+requests>=2.28
+# goatools is used by src/utils/random_walk.py to traverse GO term parents.
+goatools>=1.2
diff --git a/src/gene_vec_model.py b/src/gene_vec_model.py
index a0ebb62..488347a 100644
--- a/src/gene_vec_model.py
+++ b/src/gene_vec_model.py
@@ -3,7 +3,6 @@
 import gc
 from utils.random_walk import random_walk_w_restart as rwr
 from utils.sampling_util import rw_sampling
-from tensorflow.python.keras import backend as K
 from tensorflow.keras import layers, losses
 from tensorflow import keras
 from tensorflow.keras.models import Model
diff --git a/src/l1000_inference.py b/src/l1000_inference.py
index f00d8d3..fa138d1 100644
--- a/src/l1000_inference.py
+++ b/src/l1000_inference.py
@@ -5,9 +5,10 @@
 import glob
 import math
 import csv
+import warnings
 import tensorflow as tf
 from utils import parallel
-from tensorflow.python.keras import backend as K
+from utils.io_utils import read_csv_auto, validate_required_files, ensure_dir
 from tensorflow.keras import layers, losses
 from tensorflow import keras
 from tensorflow.keras.models import Model
@@ -15,6 +16,13 @@
 from sklearn.preprocessing import normalize
 import argparse
 
+if int(tf.__version__.split('.')[0]) < 2:
+    warnings.warn(
+        f"FRoGS requires TensorFlow >= 2.x (installed: {tf.__version__}); "
+        "behaviour is undefined on older versions.",
+        RuntimeWarning,
+    )
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Train l1000 model')
     parser.add_argument('--cpdlist_file', default='../data/compound_list_test.txt',
@@ -46,7 +54,7 @@ def get_model(fp_dim, hid_dim = 2048):
 
     merge = layers.Multiply()([denseout_cpd, denseout_target])
     denseclassifier = keras.Sequential([
-        layers.Dense(hid_dim/4),
+        layers.Dense(hid_dim // 4),
         layers.BatchNormalization(),
         layers.ReLU(),
         layers.Dense(1)
@@ -202,6 +210,17 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 args = parse_args()
 cpdlist_file, sig_file, perttype, emb_go, emb_archs4, outdir = args.cpdlist_file, args.sig_file, args.perttype, args.emb_go, args.emb_archs4, args.outdir
 
+ok, missing = validate_required_files([cpdlist_file, sig_file, emb_go, emb_archs4])
+if not ok:
+    sys.stderr.write(
+        "ERROR: the following required input files could not be found\n"
+        "(a .gz counterpart was also tried for each):\n  - "
+        + "\n  - ".join(missing)
+        + "\n"
+    )
+    sys.exit(2)
+ensure_dir(outdir)
+
 with open(emb_archs4, mode='r') as infile:
     reader = csv.reader(infile)
     archs4_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}
@@ -214,7 +233,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 
 #read gene signature file
 id2sig = {}
-t_sig=pd.read_csv(sig_file)
+t_sig = read_csv_auto(sig_file)
 pert_sig = []
 tasks = []
 cnt = 0
@@ -237,7 +256,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
     tasks.append((l1k, S_hit))
     all_cl.add(cl)
 
-id_map=pd.read_csv('../data/term2gene_id.csv')
+id_map = read_csv_auto('../data/term2gene_id.csv')
 term2geneid = {}
 for idx in id_map.index:
     term2geneid[id_map.loc[idx, 'term_name']] = str(id_map.loc[idx, 'gene_id'])
@@ -268,7 +287,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 
 print("Compute embeddings of gene signatures...")
 sig2vec_dic = {}
-rslt=parallel.map(get_vec, tasks, n_CPU=10, progress=False)
+rslt = parallel.map(get_vec, tasks, n_CPU=10, progress=True)
 for dic in rslt:
     for k in dic:
         sig2vec_dic[k] = dic[k]
diff --git a/src/l1000_model.py b/src/l1000_model.py
index 93dacd1..be96f1d 100644
--- a/src/l1000_model.py
+++ b/src/l1000_model.py
@@ -5,9 +5,10 @@
 import glob
 import math
 import csv
+import warnings
 import tensorflow as tf
 from utils import parallel
-from tensorflow.python.keras import backend as K
+from utils.io_utils import read_csv_auto, validate_required_files, ensure_dir
 from tensorflow.keras import layers, losses
 from tensorflow import keras
 from tensorflow.keras.models import Model
@@ -15,6 +16,13 @@
 from sklearn.preprocessing import normalize
 import argparse
 
+if int(tf.__version__.split('.')[0]) < 2:
+    warnings.warn(
+        f"FRoGS requires TensorFlow >= 2.x (installed: {tf.__version__}); "
+        "behaviour is undefined on older versions.",
+        RuntimeWarning,
+    )
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Train l1000 model')
     parser.add_argument('--cpdlist_file', default='../data/compound_list_shRNA.txt',
@@ -52,7 +60,7 @@ def get_model(fp_dim, hid_dim = 2048):
 
     merge = layers.Multiply()([denseout_cpd, denseout_target])
     denseclassifier = keras.Sequential([
-        layers.Dense(hid_dim/4),
+        layers.Dense(hid_dim // 4),
         layers.BatchNormalization(),
         layers.ReLU(),
         layers.Dense(1)
@@ -289,6 +297,20 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 args = parse_args()
 cpdlist_file, target_file, sig_file, perttype, emb_go, emb_archs4, epochs, outdir, modeldir = args.cpdlist_file, args.target_file, args.sig_file, args.perttype, args.emb_go, args.emb_archs4, args.epochs, args.outdir, args.modeldir
 
+# Validate all required inputs up-front so the user learns about every
+# missing file at once rather than one failure at a time.
+ok, missing = validate_required_files([cpdlist_file, target_file, sig_file, emb_go, emb_archs4])
+if not ok:
+    sys.stderr.write(
+        "ERROR: the following required input files could not be found\n"
+        "(a .gz counterpart was also tried for each):\n  - "
+        + "\n  - ".join(missing)
+        + "\n"
+    )
+    sys.exit(2)
+ensure_dir(outdir)
+ensure_dir(modeldir)
+
 with open(emb_archs4, mode='r') as infile:
     reader = csv.reader(infile)
     archs4_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}
@@ -298,7 +320,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
     go_emb = {rows[0]:np.array(rows[1:], dtype=np.float32) for rows in reader}
 
 mean_std_dict_go, mean_std_dict_archs4 = gene_sim_mean_std()
-t_target=pd.read_csv(target_file)
+t_target = read_csv_auto(target_file)
 cpd2target={}
 target_gene_id = t_target.Broad_target_gene_id.tolist()
 
@@ -330,7 +352,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 
 #read gene signature file
 id2sig = {}
-t_sig=pd.read_csv(sig_file)
+t_sig = read_csv_auto(sig_file)
 pert_sig = []
 tasks = []
 cnt = 0
@@ -354,7 +376,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
         tasks.append((l1k, S_hit))
         all_cl.add(cl)
 
-id_map=pd.read_csv('../data/term2gene_id.csv')
+id_map = read_csv_auto('../data/term2gene_id.csv')
 term2geneid = {}
 for idx in id_map.index:
     term2geneid[id_map.loc[idx, 'term_name']] = str(id_map.loc[idx, 'gene_id'])
@@ -384,7 +406,7 @@ def compute_gene_weight(mat, sim_mean, sim_std):
 
 print("Compute embeddings of gene signatures...")
 sig2vec_dic = {}
-rslt=parallel.map(get_vec, tasks, n_CPU=10, progress=False)
+rslt = parallel.map(get_vec, tasks, n_CPU=10, progress=True)
 for dic in rslt:
     for k in dic:
         sig2vec_dic[k] = dic[k]
diff --git a/src/snp_gene_vec_model.py b/src/snp_gene_vec_model.py
new file mode 100644
index 0000000..4d46c4e
--- /dev/null
+++ b/src/snp_gene_vec_model.py
@@ -0,0 +1,315 @@
+"""
+SNP-based Gene Vector Model for FRoGS
+Extends gene_vec_model.py to support SNP association data for training gene embeddings.
+"""
+
+import os
+# Fix TensorFlow threading issues - must be set before importing TensorFlow
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['TF_NUM_INTEROP_THREADS'] = '1'
+os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
+
+import numpy as np
+import tensorflow as tf
+import gc
+
+# Additional TensorFlow thread configuration
+tf.config.threading.set_inter_op_parallelism_threads(1)
+tf.config.threading.set_intra_op_parallelism_threads(1)
+from utils.random_walk import random_walk_w_restart as rwr
+from utils.sampling_util import rw_sampling
+from tensorflow.keras import layers, losses
+from tensorflow import keras
+from tensorflow.keras.models import Model
+import argparse
+import pandas as pd
+from utils.snp_utils import SNPProcessor, create_snp_associations
+from utils.snp_validation import SNPValidator
+import logging
+
+logger = logging.getLogger(__name__)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train gene embeddings with SNP data support')
+    parser.add_argument('--datatype', default='go',
+                        help='Type of data: go, archs4, or snp')
+    parser.add_argument('--association_file', default='../data/go_gene_association.txt',
+                        help='Path to gene associations file')
+    parser.add_argument('--snp_file', default=None,
+                        help='Path to SNP data file (required for SNP datatype)')
+    parser.add_argument('--snp_format', default='auto',
+                        help='SNP file format: vcf, bed, list, csv, or auto')
+    parser.add_argument('--outfile', default='../data/gene_vec_go.csv',
+                        help='Path to save learned embeddings')
+    parser.add_argument('--validate_snps', action='store_true',
+                        help='Validate SNP data before processing')
+    parser.add_argument('--pvalue_threshold', type=float, default=5e-8,
+                        help='P-value threshold for SNP filtering')
+    parser.add_argument('--effect_threshold', type=float, default=0.01,
+                        help='Effect size threshold for SNP filtering')
+    return parser.parse_args()
+
+def get_model(vocab_size, latent_dim, ori_dim):
+    """Build the neural embedding model."""
+    encoder_input_l = keras.Input(shape=(1,), name="geneidx_l")
+    encoder_input_r = keras.Input(shape=(1,), name="geneidx_r")
+    encoderlayers = layers.Embedding(vocab_size, latent_dim, input_length=1)
+    encoder_output_l = keras.backend.squeeze(encoderlayers(encoder_input_l), axis=1)
+    encoder_output_r = keras.backend.squeeze(encoderlayers(encoder_input_r), axis=1)
+
+    encoder = keras.Model(inputs=encoder_input_l, outputs=encoder_output_l, name="encoder")
+    encoder.summary()
+
+    logit = layers.Dot(axes=1)([encoder_output_l, encoder_output_r])
+
+    model = keras.Model(inputs=[encoder_input_l, encoder_input_r], outputs=[logit], name="autoencoder")
+    model.summary()
+
+    return encoder, model
+
+def process_snp_data_for_training(snp_file: str, snp_format: str = 'auto',
+                                 validate: bool = True,
+                                 pvalue_threshold: float = 5e-8,
+                                 effect_threshold: float = 0.01) -> str:
+    """
+    Process SNP data and create gene associations file for training.
+    
+    Args:
+        snp_file: Path to SNP data file
+        snp_format: SNP file format
+        validate: Whether to validate SNP data
+        pvalue_threshold: P-value threshold for filtering
+        effect_threshold: Effect size threshold for filtering
+    
+    Returns:
+        Path to created associations file
+    """
+    logger.info(f"Processing SNP data from {snp_file}")
+    
+    # Load SNP data
+    processor = SNPProcessor()
+    
+    if snp_format == 'vcf':
+        snp_data = processor.load_vcf_file(snp_file)
+    elif snp_format == 'bed':
+        snp_data = processor.load_bed_file(snp_file)
+    elif snp_format in ['list', 'csv']:
+        snp_data = processor.load_custom_snp_file(snp_file, snp_format)
+    else:  # auto-detect
+        from utils.snp_utils import load_snp_data
+        snp_data = load_snp_data(snp_file, snp_format)
+    
+    # Validate SNP data if requested
+    if validate:
+        validator = SNPValidator()
+        
+        # Generate QC report
+        qc_report = validator.generate_qc_report(snp_data, '../results/snp_qc_report.txt')
+        logger.info("SNP QC report generated")
+        
+        # Filter by p-value and effect size if columns are available
+        from utils.snp_validation import filter_snps_by_criteria
+        
+        pvalue_cols = [col for col in ['PVALUE', 'P', 'P_VALUE'] if col in snp_data.columns]
+        effect_cols = [col for col in ['EFFECT', 'BETA', 'OR'] if col in snp_data.columns]
+        
+        if pvalue_cols or effect_cols:
+            pvalue_col = pvalue_cols[0] if pvalue_cols else 'PVALUE'
+            effect_col = effect_cols[0] if effect_cols else 'EFFECT'
+            
+            original_count = len(snp_data)
+            snp_data = filter_snps_by_criteria(snp_data, 
+                                             pvalue_threshold=pvalue_threshold,
+                                             effect_threshold=effect_threshold,
+                                             pvalue_column=pvalue_col,
+                                             effect_column=effect_col)
+            logger.info(f"Filtered SNPs: {len(snp_data)}/{original_count} retained")
+    
+    # Create gene associations file
+    associations_file = '../data/snp_gene_association.txt'
+    processor.create_snp_gene_associations(snp_data, associations_file)
+    
+    return associations_file
+
+def train_snp_embeddings(args):
+    """Train gene embeddings using SNP association data."""
+    
+    if args.datatype == 'snp':
+        if not args.snp_file:
+            raise ValueError("SNP file must be provided when datatype is 'snp'")
+        
+        # Process SNP data to create associations file
+        association_file = process_snp_data_for_training(
+            args.snp_file, 
+            args.snp_format,
+            args.validate_snps,
+            args.pvalue_threshold,
+            args.effect_threshold
+        )
+        
+        # Update output filename to include SNP suffix
+        if args.outfile == '../data/gene_vec_go.csv':
+            args.outfile = '../data/gene_vec_snp_256.csv'
+        
+    else:
+        association_file = args.association_file
+    
+    # Use original FRoGS training pipeline
+    logger.info(f"Training embeddings with data type: {args.datatype}")
+    
+    geneids, association_mat, diffusion_state = rwr(args.datatype, association_file)
+    nonzeroidx = np.where(np.sum(association_mat, axis=1) > 0)[0]
+
+    geneids = np.array(geneids)
+    fp_array = association_mat[nonzeroidx]
+    geneids = geneids[nonzeroidx]
+
+    ori_dim = fp_array.shape[-1]
+    latent_dim = 256
+
+    encoder, autoencoder = get_model(len(geneids), latent_dim, ori_dim)
+    optimizer = keras.optimizers.Adam()
+    autoencoder.compile(
+        optimizer=optimizer,
+        loss=[losses.BinaryCrossentropy(from_logits=True)]
+    )
+
+    rs = rw_sampling(diffusion_state)
+    epochs = 3000
+    for e in range(epochs):
+        print(f'epoch: {e}')
+        left_input = []
+        right_input = []
+        targets = []
+        pos_pairs = 1
+        neg_pairs = 5
+        
+        # Sample training pairs
+        left_input, right_input, targets = rs.sampling(np.arange(len(geneids)), pos_pairs, neg_pairs)
+
+        left_input = np.squeeze(np.array(left_input))
+        right_input = np.squeeze(np.array(right_input))
+        targets = np.squeeze(np.array(targets))
+        sample_size = len(left_input)
+        sample_idx = np.arange(sample_size)
+        np.random.shuffle(sample_idx)
+        left_input = left_input[sample_idx]
+        right_input = right_input[sample_idx]
+        targets = targets[sample_idx]
+
+        autoencoder.fit([left_input[:int(0.8*sample_size)] * 1.0, right_input[:int(0.8*sample_size)] * 1.0], 
+                       [targets[:int(0.8*sample_size)]],
+                       batch_size=1000,
+                       epochs=1,
+                       verbose=1,
+                       validation_data=([left_input[int(0.8*sample_size):] * 1.0, 
+                                       right_input[int(0.8*sample_size):] * 1.0], 
+                                      [targets[int(0.8*sample_size):]]))
+        gc.collect()
+
+    # Save embeddings
+    encoded_genes = encoder.predict(np.arange(len(geneids)) * 1.0)
+    
+    with open(args.outfile, 'w') as fw:
+        for i in range(len(encoded_genes)):
+            fw.write(geneids[i])
+            for j in range(len(encoded_genes[0])):
+                fw.write(',' + str(encoded_genes[i, j]))
+            fw.write('\n')
+    
+    logger.info(f"SNP-based gene embeddings saved to {args.outfile}")
+    
+    return args.outfile
+
+if __name__ == "__main__":
+    args = parse_args()
+    
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+    
+    if args.datatype == 'snp':
+        # Train SNP-based embeddings
+        embedding_file = train_snp_embeddings(args)
+        print(f"SNP-based gene embeddings trained and saved to: {embedding_file}")
+        
+        # Demonstrate usage with signature embedding
+        print("\nTesting SNP signature embedding...")
+        from snp_signature_embedding import SNPSignatureEmbedder
+        
+        # Load the newly trained embeddings
+        embedder = SNPSignatureEmbedder(go_emb_file=embedding_file, 
+                                      archs4_emb_file='../data/gene_vec_archs4_256.csv')
+        
+        # Test with example SNPs
+        test_snps = ['rs2230317', 'rs10273927', 'rs7398691']
+        test_effects = {'rs2230317': 0.05, 'rs10273927': -0.03, 'rs7398691': 0.08}
+        
+        embedding = embedder.compute_snp_signature_embedding(test_snps, test_effects)
+        print(f"Test signature embedding shape: {embedding.shape}")
+        print(f"Sample embedding values: {embedding[:5]}")
+        
+    else:
+        # Use original training pipeline for GO/ARCHS4 data
+        datatype, association_file, outfile = args.datatype, args.association_file, args.outfile
+
+        geneids, association_mat, diffusion_state = rwr(datatype, association_file)
+        nonzeroidx = np.where(np.sum(association_mat, axis=1) > 0)[0]
+
+        geneids = np.array(geneids)
+        fp_array = association_mat[nonzeroidx]
+        geneids = geneids[nonzeroidx]
+
+        ori_dim = fp_array.shape[-1]
+        latent_dim = 256
+
+        encoder, autoencoder = get_model(len(geneids), latent_dim, ori_dim)
+        optimizer = keras.optimizers.Adam()
+        autoencoder.compile(
+            optimizer=optimizer,
+            loss=[losses.BinaryCrossentropy(from_logits=True)]
+        )
+
+        rs = rw_sampling(diffusion_state)
+        epochs = 3000
+        for e in range(epochs):
+            print('epoch:', e)
+            left_input = []
+            right_input = []
+            targets = []
+            pos_pairs = 1
+            neg_pairs = 5
+            
+            # Sample training pairs
+            left_input, right_input, targets = rs.sampling(np.arange(len(geneids)), pos_pairs, neg_pairs)
+
+            left_input = np.squeeze(np.array(left_input))
+            right_input = np.squeeze(np.array(right_input))
+            targets = np.squeeze(np.array(targets))
+            sample_size = len(left_input)
+            sample_idx = np.arange(sample_size)
+            np.random.shuffle(sample_idx)
+            left_input = left_input[sample_idx]
+            right_input = right_input[sample_idx]
+            targets = targets[sample_idx]
+
+            autoencoder.fit([left_input[:int(0.8*sample_size)] * 1.0, right_input[:int(0.8*sample_size)] * 1.0], 
+                           [targets[:int(0.8*sample_size)]],
+                           batch_size=1000,
+                           epochs=1,
+                           verbose=1,
+                           validation_data=([left_input[int(0.8*sample_size):] * 1.0, 
+                                           right_input[int(0.8*sample_size):] * 1.0], 
+                                          [targets[int(0.8*sample_size):]]))
+            gc.collect()
+
+        encoded_genes = encoder.predict(np.arange(len(geneids)) * 1.0)
+        
+        with open(outfile, 'w') as fw:
+            for i in range(len(encoded_genes)):
+                fw.write(geneids[i])
+                for j in range(len(encoded_genes[0])):
+                    fw.write(',' + str(encoded_genes[i, j]))
+                fw.write('\n')
+        
+        print(f"Gene embeddings saved to: {outfile}")
diff --git a/src/utils/io_utils.py b/src/utils/io_utils.py
new file mode 100644
index 0000000..5ff94f9
--- /dev/null
+++ b/src/utils/io_utils.py
@@ -0,0 +1,90 @@
+"""Small I/O helpers shared by FRoGS scripts.
+
+The main entry point is :func:`read_csv_auto`, a ``pandas.read_csv`` wrapper
+that transparently handles ``.gz``-compressed counterparts of the requested
+file.  This lets scripts keep their historical default paths
+(``*.csv``) while the shipped data happens to be stored compressed
+(``*.csv.gz``), and vice versa.
+"""
+from __future__ import annotations
+
+import os
+from typing import Iterable, List, Optional, Tuple
+
+import pandas as pd
+
+
+def _candidate_paths(path: str) -> List[str]:
+    """Return an ordered list of paths to try for ``path``.
+
+    Always starts with the verbatim path.  If the path ends with ``.gz`` we
+    also try the uncompressed form; otherwise we additionally try the
+    ``.gz``-suffixed form.  The ordering is chosen so that if the user
+    *explicitly* asks for a compressed file we look there first.
+    """
+    candidates = [path]
+    if path.endswith(".gz"):
+        candidates.append(path[: -len(".gz")])
+    else:
+        candidates.append(path + ".gz")
+    # Deduplicate while preserving order.
+    seen = set()
+    uniq: List[str] = []
+    for p in candidates:
+        if p not in seen:
+            seen.add(p)
+            uniq.append(p)
+    return uniq
+
+
+def resolve_data_path(path: str) -> str:
+    """Resolve ``path`` to the first existing candidate on disk.
+
+    Tries ``path`` itself and a ``.gz`` counterpart.  Raises
+    :class:`FileNotFoundError` with a clear, multi-line message listing every
+    location that was probed.
+    """
+    tried: List[str] = []
+    for candidate in _candidate_paths(path):
+        tried.append(candidate)
+        if os.path.exists(candidate):
+            return candidate
+    raise FileNotFoundError(
+        "Could not locate data file. Tried:\n  - " + "\n  - ".join(tried)
+    )
+
+
+def read_csv_auto(path: str, **read_csv_kwargs) -> pd.DataFrame:
+    """Drop-in replacement for :func:`pandas.read_csv` with ``.gz`` fallback.
+
+    Behaves identically to ``pd.read_csv`` for files that exist at the
+    requested location.  If the file is missing we look for a ``.gz`` (or
+    unsuffixed) sibling and, if found, read that instead so scripts work
+    whether the data was pre-decompressed or not.  Pandas infers the
+    compression from the suffix automatically.
+    """
+    resolved = resolve_data_path(path)
+    return pd.read_csv(resolved, **read_csv_kwargs)
+
+
+def validate_required_files(paths: Iterable[str]) -> Tuple[bool, List[str]]:
+    """Check that each of ``paths`` exists (treating ``.gz`` alternates as OK).
+
+    Returns a ``(ok, missing)`` tuple where ``missing`` is the list of inputs
+    for which neither the path itself nor its ``.gz`` / un-``.gz`` counterpart
+    could be found.  Useful for up-front validation in CLI scripts.
+    """
+    missing: List[str] = []
+    for path in paths:
+        try:
+            resolve_data_path(path)
+        except FileNotFoundError:
+            missing.append(path)
+    return (not missing), missing
+
+
+def ensure_dir(path: Optional[str]) -> None:
+    """Create ``path`` as a directory if it does not already exist."""
+    if not path:
+        return
+    os.makedirs(path, exist_ok=True)
diff --git a/src/utils/parallel.py b/src/utils/parallel.py
index 9978ba8..a3f7861 100644
--- a/src/utils/parallel.py
+++ b/src/utils/parallel.py
@@ -1,6 +1,14 @@
 #!/usr/bin/env python
-from __future__ import absolute_import
-from __future__ import print_function
+"""Parallel execution helpers for FRoGS.
+
+This module exposes :func:`map`, a drop-in parallel ``map``-style helper that
+is safe to use on modern Python (>=3.8) and macOS.
+
+The legacy :class:`MP`, :func:`parmap` and :func:`parprep` helpers are kept
+around for backwards compatibility with older FRoGS scripts but should be
+considered deprecated; new code should prefer :func:`map`.
+"""
+import concurrent.futures
 import multiprocessing
 import multiprocessing.pool
 import traceback
@@ -9,9 +17,39 @@
 import random
 import sys
 import os
-from six.moves import range
 
-#sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+try:  # pragma: no cover - optional dependency
+    from tqdm.auto import tqdm as _tqdm
+except Exception:  # pragma: no cover - tqdm is optional
+    _tqdm = None
+
+
+def _get_mp_context():
+    """Return a multiprocessing context appropriate for the current platform.
+
+    Many FRoGS worker functions depend on module-level globals (e.g. the
+    ``go_emb`` / ``archs4_emb`` dictionaries in :mod:`src.l1000_model`) that
+    are populated at import time.  On macOS (Python >= 3.8) the default
+    start method is ``spawn`` which re-imports the module in a fresh
+    interpreter and therefore cannot see those runtime-populated globals,
+    which manifests as jobs that never return.  ``fork`` preserves the
+    parent state and is the only reliable choice on macOS for this
+    codebase; we fall back to the platform default if ``fork`` is
+    unavailable (e.g. Windows).
+    """
+    override = os.environ.get("FROGS_MP_START")
+    if override:
+        try:
+            return multiprocessing.get_context(override)
+        except ValueError:
+            pass
+    if sys.platform == "darwin":
+        try:
+            return multiprocessing.get_context("fork")
+        except ValueError:
+            pass
+    return multiprocessing.get_context()
+
 
 ## no longer needed. http://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods
 ## This code enables static or instance methods to be pickled, therefore can be used in multiprocessing calls
@@ -373,57 +411,117 @@ def parprep(f=None, n_CPU=0, DEBUG=False):
 def proxy(f):
     return f
 
-def map(f, tasks, n_CPU=0, progress=False, min_interval=30):
-    """min_interval, do not report progress more frequent that 30 sec"""
-    if len(tasks)==0: return []
-    if n_CPU==0:
-        n_CPU=multiprocessing.cpu_count()
-    if n_CPU<=1:
-        if not progress:
-            return [ f(args) for args in tasks ]
-        else:
-            out=[]
-            pg=Progress(len(tasks))
-            i_start=time.time()
-            for i, args in enumerate(tasks):
-                out.append(f(args))
-                if (time.time()-i_start)>=min_interval or i==len(tasks)-1:
-                    pg.check(i+1)
-                    i_start=time.time()
-            return out
-    n_CPU=min(n_CPU, len(tasks))
-    pl=multiprocessing.Pool(n_CPU)
-    if progress:
-        # https://stackoverflow.com/questions/5666576/show-the-progress-of-a-python-multiprocessing-pool-map-call
-        out=[]
-        pg=Progress(len(tasks))
-        i_start=time.time()
-        #for i, _ in enumerate(pl.imap(f, tasks), 1):
-        for i, _ in enumerate(pl.imap(f, tasks)):
-            out.append(_)
-            #print(">>>", i, time.time(), i_start, time.time()-i_start)
-            if (time.time()-i_start)>=min_interval or i==len(tasks)-1:
-                pg.check(i+1)
-                i_start=time.time()
-    else:
-        out=pl.map(f, tasks)
-    pl.close()
-    pl.join()
+def _run_sequential(f, tasks, progress, min_interval):
+    if not progress:
+        return [f(args) for args in tasks]
+    if _tqdm is not None:
+        return [f(args) for args in _tqdm(tasks, desc="parallel.map", unit="task")]
+    out = []
+    pg = Progress(len(tasks))
+    i_start = time.time()
+    for i, args in enumerate(tasks):
+        out.append(f(args))
+        if (time.time() - i_start) >= min_interval or i == len(tasks) - 1:
+            pg.check(i + 1)
+            i_start = time.time()
     return out
 
-if __name__=="__main__":
+
+def map(f, tasks, n_CPU=0, progress=False, min_interval=30, timeout=None):
+    """Apply ``f`` to every element of ``tasks`` and return the results in order.
+
+    Parameters
+    ----------
+    f : callable
+        Top-level function that takes a single task argument.  Must be
+        picklable.
+    tasks : sequence
+        Input tasks.  Each element is passed to ``f``.
+    n_CPU : int, default 0
+        Number of worker processes to use.  ``0`` means
+        ``multiprocessing.cpu_count()``; ``1`` (or the env var
+        ``FROGS_N_CPU=1``) forces sequential execution.
+    progress : bool
+        If True, show a progress bar (``tqdm`` if available, otherwise a
+        periodic text update via :class:`Progress`).
+    min_interval : int
+        Minimum seconds between textual progress updates when ``tqdm`` is
+        unavailable.
+    timeout : float or None
+        If set, each individual task must complete within ``timeout``
+        seconds or a :class:`concurrent.futures.TimeoutError` is raised.
+    """
+    tasks = list(tasks)
+    if not tasks:
+        return []
+
+    env_override = os.environ.get("FROGS_N_CPU")
+    if env_override:
+        try:
+            n_CPU = int(env_override)
+        except ValueError:
+            pass
+
+    if n_CPU == 0:
+        n_CPU = multiprocessing.cpu_count()
+    if n_CPU <= 1:
+        return _run_sequential(f, tasks, progress, min_interval)
+
+    n_CPU = min(n_CPU, len(tasks))
+    ctx = _get_mp_context()
+
+    results = [None] * len(tasks)
+    errors = []
+    iterator = None
+    try:
+        with concurrent.futures.ProcessPoolExecutor(max_workers=n_CPU, mp_context=ctx) as executor:
+            future_to_idx = {executor.submit(f, task): idx for idx, task in enumerate(tasks)}
+            iterator = concurrent.futures.as_completed(future_to_idx, timeout=timeout)
+            if progress and _tqdm is not None:
+                iterator = _tqdm(iterator, total=len(tasks), desc="parallel.map", unit="task")
+
+            pg = Progress(len(tasks)) if (progress and _tqdm is None) else None
+            last_report = time.time()
+            completed = 0
+            for fut in iterator:
+                idx = future_to_idx[fut]
+                try:
+                    results[idx] = fut.result()
+                except Exception as exc:  # noqa: BLE001 - we re-raise aggregated below
+                    errors.append((idx, exc, traceback.format_exc()))
+                completed += 1
+                if pg is not None and (
+                    (time.time() - last_report) >= min_interval or completed == len(tasks)
+                ):
+                    pg.check(completed)
+                    last_report = time.time()
+    except KeyboardInterrupt:
+        # Ensure workers are torn down promptly on Ctrl-C.
+        raise
+
+    if errors:
+        first_idx, first_exc, first_tb = errors[0]
+        msg_lines = [
+            f"parallel.map: {len(errors)} task(s) raised exceptions.",
+            f"First failure was task index {first_idx}: {first_exc!r}",
+            first_tb,
+        ]
+        raise RuntimeError("\n".join(msg_lines))
+
+    return results
+
+
+if __name__ == "__main__":
 
     def calc(X):
-        """parameter should be just one tuple/list
-        The first thing is to flatten the tuple/list to individual variables
-        This is an example function calculate the sum from a to b"""
-        a,b=X # the parameter is (start, end) passed in as tasks
-        sum=0
-        for i in range(a, b+1):
-            sum+=i
-            time.sleep(0.5)
-        return sum
-
-    tasks=[(i,i+30) for i in range(10)]
-    out=map(calc, tasks, n_CPU=2, progress=True, min_interval=5)
+        """Example worker: sum of integers from a to b inclusive."""
+        a, b = X
+        total = 0
+        for i in range(a, b + 1):
+            total += i
+            time.sleep(0.01)
+        return total
+
+    tasks = [(i, i + 30) for i in range(10)]
+    out = map(calc, tasks, n_CPU=2, progress=True, min_interval=5)
     print(out)