diff --git a/EMT_data_analysis/analysis_scripts/Metric_computation.py b/EMT_data_analysis/analysis_scripts/Metric_computation.py
index 50d75c3..2331ec9 100755
--- a/EMT_data_analysis/analysis_scripts/Metric_computation.py
+++ b/EMT_data_analysis/analysis_scripts/Metric_computation.py
@@ -5,6 +5,7 @@
 the files from feature_extraction.py file are stored.
 '''
 import warnings
+import time
 import numpy as np
 import pandas as pd
 import scipy.ndimage
@@ -13,15 +14,70 @@
 from scipy.signal import savgol_filter
 from EMT_data_analysis.tools import io
 from pathlib import Path
+from joblib import Parallel, delayed

 warnings.filterwarnings("ignore")
+
+
+def _process_single_movie_area(data_id, z_bottom, mask_url, timepoints, max_retries=5):
+    """
+    Helper function to compute area at the glass for a single movie.
+    Designed to be called in parallel. Includes retry logic for network errors.
+    """
+    if pd.isna(mask_url):
+        return [], None
+
+    results = []
+
+    # Retry logic for loading BioImage
+    img_seg = None
+    last_error = None
+    for attempt in range(max_retries):
+        try:
+            img_seg = BioImage(mask_url)
+            break  # Success, exit retry loop
+        except Exception as e:
+            last_error = e
+            if attempt < max_retries - 1:
+                wait_time = min(2 ** attempt, 30)  # Exponential backoff: 1, 2, 4, 8, 16 (max 30)
+                time.sleep(wait_time)
+            continue
+
+    if img_seg is None:
+        print(f"Error loading {data_id} after {max_retries} retries: {last_error}")
+        return [], data_id
+
+    try:
+        # Check actual T dimension of the image
+        actual_t_size = img_seg.dims.T
+        is_fixed_cell = (actual_t_size == 1)
+
+        for t in timepoints:
+            # For fixed cells (single timepoint), always use T=0 regardless of computed timepoint
+            t_index = 0 if is_fixed_cell else min(int(t), actual_t_size - 1)
+            img_seg_tl = img_seg.get_image_dask_data("ZYX", T=t_index)
+            img_z = img_seg_tl[z_bottom:z_bottom+2]
+            z_max_proj = np.max(img_z, axis=0)
+            img_fh = scipy.ndimage.binary_fill_holes(z_max_proj).astype(int)
+
+            area_pixels = np.count_nonzero(img_fh)
+            results.append({
+                'Data ID': data_id,
+                'Timepoint': t,
+                'Area at the glass (pixels)': area_pixels,
+                'Area at the glass(square micrometer)': area_pixels * (0.271 * 0.271)
+            })
+        return results, None
+    except Exception as e:
+        print(f"Error processing {data_id}: {e}")
+        return [], data_id
+
+
 # %% [markdown]
 def add_bottom_z(df):
     """
     This function adds bottom Z - Zplane corresponding to the glass -
     defined from the sum of area over all time points vs Z
-    
+
     Parameters
     ----------
     df: DataFrame
@@ -31,19 +87,19 @@ def add_bottom_z(df):
     -------
     df_merged: DataFrame
        Returns the input DataFrame with'Normalized Z plane' and 'Bottom Z plane' columns"""
-    
+
     df['Area of all cells mask per Z (square micrometer)']=df['Area of all cells mask per Z (pixels)']*(0.271*0.271)
     area_time=df.groupby(['Data ID','Z plane'])['Area of all cells mask per Z (square micrometer)'].agg('sum').reset_index()

     file_id, z_bottom=[],[]
     for id, df_id in tqdm(area_time.groupby('Data ID')):
         file_id.append(id)
         df_id=df_id.reset_index()
-        
+
         raw_values=df_id['Area of all cells mask per Z (square micrometer)'].values
         dy=np.diff(raw_values)
         max_dy=np.max(dy)
         idx_max= np.where(dy== max_dy)[0]
-        
+
         zo=df_id['Z plane'][idx_max].values[0]+1
         z_bottom.append(zo)
@@ -52,53 +108,108 @@ def add_bottom_z(df):

     df_normalized_z=pd.merge(df,df_bottom_z, on=['Data ID'])
-    df_normalized_z['Normalized Z plane']=df_normalized_z.apply(lambda x: x['Z plane']-x['Bottom Z plane'], axis=1)
+    # Vectorized subtraction (much faster than apply with lambda)
+    df_normalized_z['Normalized Z plane'] = df_normalized_z['Z plane'] - df_normalized_z['Bottom Z plane']

     return df_normalized_z

-def add_bottom_mip_migration(df_merged):
+def add_bottom_mip_migration(df_merged, n_jobs=32):
     '''
     This adds area of MIP of bottom 2Z planes to get area at the glass and compute migration time from that.
-    
+
     Parameters
     ----------
     df_merged: DataFrame
-        Dataframe with Bottom Z plane column and All-cells mask paths for each movie (merging df_normalized_z with Imaging_and_segmentation_data.csv)
+        Dataframe with 'Data ID', 'Timepoint', 'Bottom Z plane', and 'All Cells Mask URL' columns.
+    n_jobs: int
+        Number of parallel jobs. -1 uses all available cores. Default: 32.

     Returns
     -------
     df_mm: DataFrame
-        Returns the input DataFrame with 'Area at the glass (pixels)','Area at the glass(square micrometer)' and 'Migration time (h)' columns
-    '''
-    
-    df_mm=pd.DataFrame()
-    for id, df_id in tqdm(df_merged.groupby('Data ID')):
-        ar_v,tp=[],[]
-        
-        l = df_id['Timepoint'].max()
-        for t, df_tp in df_id.groupby('Timepoint'):
-            if t > l:
-                break
-            z_bottom=df_tp['Bottom Z plane'].values[0]
-            img_seg = BioImage(df_tp['All Cells Mask URL'].values[0])
-            img_seg_tl = img_seg.get_image_dask_data("ZYX")
-            img_z=img_seg_tl[z_bottom:z_bottom+2]
-            z_max_proj = np.max(img_z,axis=0)
-            img_fh=scipy.ndimage.binary_fill_holes(z_max_proj).astype(int)
-            
-            ar2=np.count_nonzero(img_fh)
-            ar_v.append(ar2)
-            tp.append(t)
-        df_area=pd.DataFrame(zip(tp,ar_v), columns=['Timepoint','Area at the glass (pixels)'])
-        
-        df_area['Area at the glass(square micrometer)']=df_area['Area at the glass (pixels)']*(0.271*0.271)
-        df_area['Data ID']=id
-        df_merged_area=pd.merge(df_id,df_area, on=['Data ID','Timepoint'])
-        df_mm=pd.concat([df_mm,df_merged_area])
+        Returns DataFrame with 'Data ID', 'Timepoint', 'Area at the glass (pixels)',
+        'Area at the glass(square micrometer)' columns
+    '''
+    # Prepare arguments for parallel processing
+    movie_args = []
+    for data_id, df_id in df_merged.groupby('Data ID'):
+        z_bottom = df_id['Bottom Z plane'].values[0]
+        mask_url = df_id['All Cells Mask URL'].values[0]
+        timepoints = sorted(df_id['Timepoint'].unique())
+        movie_args.append((data_id, z_bottom, mask_url, timepoints))
+
+    print(f"Processing {len(movie_args)} movies with {n_jobs} parallel jobs...")
+
+    # Process movies in parallel
+    all_results = Parallel(n_jobs=n_jobs, verbose=10)(
+        delayed(_process_single_movie_area)(data_id, z_bottom, mask_url, timepoints)
+        for data_id, z_bottom, mask_url, timepoints in movie_args
+    )
+
+    # Flatten results list and collect failed movies
+    results = []
+    failed_movies = []
+    for movie_results, failed_id in all_results:
+        results.extend(movie_results)
+        if failed_id is not None:
+            failed_movies.append(failed_id)
+
+    if failed_movies:
+        print(f"\nWARNING: {len(failed_movies)} movies failed after retries:")
+        for movie_id in failed_movies:
+            print(f"  - {movie_id}")
+
+    # Single DataFrame creation at the end
+    df_mm = pd.DataFrame(results)

     return df_mm
-    
+
+
+def add_metadata_only_movies(df_features, Imaging_and_segmentation_data):
+    """
+    Add metadata-only movies (those without All Cells Mask) to the feature manifest.
+    These movies have metadata but couldn't be processed by feature extraction
+    because they lack segmentation masks.
+
+    Parameters
+    ----------
+    df_features: DataFrame
+        The main feature dataframe with processed movies
+    Imaging_and_segmentation_data: DataFrame
+        Full imaging and segmentation metadata
+
+    Returns
+    -------
+    df_combined: DataFrame
+        Combined dataframe with both processed and metadata-only movies
+    """
+    # Find movies without All Cells Mask URL (metadata-only)
+    df_no_mask = Imaging_and_segmentation_data[
+        Imaging_and_segmentation_data['All Cells Mask URL'].isna()
+    ]
+    metadata_only_ids = set(df_no_mask['Data ID'].unique())
+
+    # Exclude any that might already be in features (shouldn't happen, but safe check)
+    processed_ids = set(df_features['Data ID'].unique())
+    metadata_only_ids = metadata_only_ids - processed_ids
+
+    if len(metadata_only_ids) == 0:
+        print("No metadata-only movies to add")
+        return df_features
+
+    print(f"Adding {len(metadata_only_ids)} metadata-only movies (no All Cells Mask)")
+
+    # Get metadata for these movies (one row per movie)
+    df_metadata_only = Imaging_and_segmentation_data[
+        Imaging_and_segmentation_data['Data ID'].isin(metadata_only_ids)
+    ].drop_duplicates('Data ID')
+
+    # Combine with existing features
+    df_combined = pd.concat([df_features, df_metadata_only], ignore_index=True, sort=False)
+
+    return df_combined
+

 def add_gene_metrics(df_features):
     '''
@@ -114,7 +225,7 @@ def add_gene_metrics(df_features):
     Returns
     -------
     df_features_addons: DataFrame
-    
+
     '''
     #filtering to 10 z-slices over which the mean intensity is calculated
     df_z=df_features[(df_features['Normalized Z plane']>=0) & (df_features['Normalized Z plane']<10)]
@@ -129,7 +240,7 @@ def add_gene_metrics(df_features):
     for id, df_id in df_eomes.groupby('Data ID'):
         df_id=df_id.sort_values('Timepoint')
         #smoothing the mean intensity curve
-        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10) 
+        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10)
         int_max=max(df_id.int_smooth)
         t_max=df_id['Timepoint'][df_id.int_smooth==int_max].values[0]
         Movie_ids_eomes.append(id)
@@ -143,7 +254,7 @@ def add_gene_metrics(df_features):
     for id, df_id in df_tbxt.groupby('Data ID'):
         df_id=df_id.sort_values('Timepoint')
         #smoothing the mean intensity curve
-        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10) 
+        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10)
         int_max=max(df_id.int_smooth)
         t_max=df_id['Timepoint'][df_id.int_smooth==int_max].values[0]
         Movie_ids_tbxt.append(id)
@@ -170,11 +281,12 @@ def add_gene_metrics(df_features):
     Movie_ids_sox, time_half_maximal_sox=[],[]
     for id, df_id in df_sox.groupby('Data ID'):
         df_id=df_id.sort_values('Timepoint')
-        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10) 
-        int_50=(max(df_id.int_smooth.values[0])+min(df_id.int_smooth))/2
+        df_id['int_smooth']=savgol_filter(df_id.mean_intensity.values,polyorder=2, window_length=10)
+        # Use the first timepoint intensity value - biologically meaningful
+        int_50=(df_id.int_smooth.values[0]+min(df_id.int_smooth))/2
         t_50=min(df_id['Timepoint'][(df_id.int_smooth<=int_50)])
         Movie_ids_sox.append(id)
-        time_half_maximal_sox.append(t_50)
+        time_half_maximal_sox.append(t_50*(30/60))
     df_sox_metrics=pd.DataFrame(zip(Movie_ids_sox, time_half_maximal_sox), columns=['Data ID','Time of half-maximal SOX2 expression (h)'])
     #merging eomes metrics with feature manifest
@@ -185,20 +297,23 @@
 # %% [markdown]
 ## master function to implement the pipeline
-def compute_metrics(output_folder):
+def compute_metrics(output_folder, load_from_aws: bool = True, local_imaging_csv: str = None):
     '''
     This is a master function that implements every function and post processing to save a compiled final manifest to be used with analysis_plots.py

     Parameters
     ----------
-    Imaging_and_segmentation_data: DataFrame
-        Dataframe with imaging and segmentation information for each movie
+    output_folder: Path
+        Path to the folder to save the final feature manifest

-    all_cells_feature_csvs_folder: Folder path
-        Path to the folder where csvs per movie for the features extracted from all-cells masks is stored
+    load_from_aws: bool, default True
+        If True, load imaging_and_segmentation_data from AWS S3.
+        If False, load from local file.
+
+    local_imaging_csv: str, optional
+        Path to local imaging_and_segmentation_data.csv file.
+        Only used when load_from_aws=False. If not provided, uses default local path.

-    final_feature_folder: folder path
-        Path to the folder to save the final feature manifest
     Returns
     -------
     df_features_final: DataFrame
@@ -207,47 +322,113 @@ def compute_metrics(output_folder):
     print('compiling intensity and z features into a single dataframe')
     df = io.load_bf_colony_features()
+    #df = df[df['Data ID'] == '3500005824_35']

     print('computing glass information for normalized z position')
     df_all_z=add_bottom_z(df)
     print(len(df_all_z.index))

     print('merging the bottom z information with the colony mask path csv')
-    df_z = df_all_z.groupby('Data ID')['Bottom Z plane'].agg('first').reset_index()
-    Imaging_and_segmentation_data = io.load_imaging_and_segmentation_dataset()
-    df_merged = pd.merge(df_z,Imaging_and_segmentation_data, how='left', on=['Data ID'])
+    Imaging_and_segmentation_data = io.load_imaging_and_segmentation_dataset(
+        load_from_aws=load_from_aws,
+        local_path=local_imaging_csv
+    )
+
+    # Pass only needed columns to add_bottom_mip_migration (reduces memory and speeds up groupby)
+    df_for_area = df_all_z[['Data ID', 'Timepoint', 'Bottom Z plane']].drop_duplicates()
+    df_for_area = pd.merge(
+        df_for_area,
+        Imaging_and_segmentation_data[['Data ID', 'All Cells Mask URL']],
+        on='Data ID',
+        how='left'
+    )

     print('computing area at the glass (bottom 2 z MIP) and migration time')
-    df_mm=add_bottom_mip_migration(df_merged)
+    df_mm = add_bottom_mip_migration(df_for_area)
     print(len(df_mm.index))

     print('merging everything into a single feature manifest')
-    df_features=pd.merge(df_all_z,df_mm, on=['Data ID','Timepoint','Z plane'], suffixes=("","_remove"), how='left')
-    df_features.drop([i for i in df_features.columns if 'remove' in i], axis=1, inplace=True)
+    # Merge area results back to full dataframe
+    df_features = pd.merge(df_all_z, df_mm, on=['Data ID', 'Timepoint'], how='left')
+    # Merge imaging metadata (only columns not already in df_features to avoid duplicates)
+    existing_cols = set(df_features.columns)
+    new_cols = ['Data ID'] + [col for col in Imaging_and_segmentation_data.columns
+                              if col not in existing_cols]
+    df_features = pd.merge(df_features, Imaging_and_segmentation_data[new_cols], on=['Data ID'], how='left')
     print(len(df_features.index))

+    # Round migration onset times to nearest 0.5 hour to align with Time hr grid
+    footprint_col = 'Migration Onset Time (Footprint Area Based)'
+    if footprint_col in df_features.columns:
+        df_features[footprint_col] = (df_features[footprint_col] * 2).round() / 2
+        print(f'Rounded {footprint_col} to nearest 0.5 hour')
+
+    io_col = 'Migration Onset Time (Inside/Outside Basement Membrane Based)'
+    if io_col in df_features.columns:
+        df_features[io_col] = (df_features[io_col] * 2).round() / 2
+        print(f'Rounded {io_col} to nearest 0.5 hour')
+
+    manual_col = 'Migration Onset Time (Manual First Cell Detection)'
+    if manual_col in df_features.columns:
+        df_features[manual_col] = (df_features[manual_col] * 2).round() / 2
+        print(f'Rounded {manual_col} to nearest 0.5 hour')
+
     print('adding gene specific metrics...')
     df_features_addons=add_gene_metrics(df_features)
-    #only including the columns of interest
-    features = ['Data ID', 'Experimental Condition', 'Gene',
+
+    # Reorder columns to put key analysis columns first, then metadata
+    priority_columns = ['Data ID', 'Experimental Condition', 'Gene',
                 'Single Colony Or Lumenoid At Time of Migration',
                 'Absence Of Migrating Cells Coming From Colony Out Of FOV At Time Of Migration',
                 'Timelapse Interval', 'Timepoint', 'Z plane',
                 'Area of all cells mask per Z (pixels)',
                 'Area of all cells mask per Z (square micrometer)',
-                'Mean intensity per Z', 'Total intensity per Z', 'Bottom Z plane',
+                'Mean intensity per Z', 'Total intensity per Z',
+                'Mean intensity per Z (Channel 2)', 'Total intensity per Z (Channel 2)',
+                'Mean intensity per Z (Channel 3)', 'Total intensity per Z (Channel 3)',
+                'Bottom Z plane',
                 'Normalized Z plane', 'Area at the glass (pixels)',
-                'Area at the glass(square micrometer)']
-    features = [feat for feat in features if feat in df_features_addons.columns]
-    df_features_final=df_features_addons[features]
+                'Area at the glass(square micrometer)',
+                'Time of max EOMES expression (h)',
+                'Time of max TBXT expression (h)',
+                'Time of inflection of E-cad expression (h)',
+                'Time of half-maximal SOX2 expression (h)',
+                'Migration Onset Time (Inside/Outside Basement Membrane Based)',
+                'Migration Onset Time (Manual First Cell Detection)']
+
+    # Get priority columns that exist, then add remaining columns
+    existing_priority = [col for col in priority_columns if col in df_features_addons.columns]
+    other_columns = [col for col in df_features_addons.columns if col not in priority_columns]
+    all_columns = existing_priority + other_columns
+
+    df_features_final = df_features_addons[all_columns]
+    print(f"Final columns: {len(df_features_final.columns)}")
     print(len(df_features_final.index))

+    # Add metadata-only movies (those without image data but with metadata)
+    print('adding metadata-only movies...')
+    df_features_final = add_metadata_only_movies(df_features_final, Imaging_and_segmentation_data)
+    print(f"Total Data IDs after adding metadata-only movies: {df_features_final['Data ID'].nunique()}")
+
     print('saving the final feature file')
-    df_features_final.to_csv(output_folder / f"Image_analysis_extracted_features.csv")
+    df_features_final.to_csv(output_folder / f"Image_analysis_extracted_features.csv", index=False)

 # %% [markdown]
 if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Compute metrics for EMT analysis')
+    parser.add_argument('--local', action='store_true',
+                        help='Load imaging_and_segmentation_data from local file instead of AWS')
+    parser.add_argument('--local-csv', type=str, default=None,
+                        help='Path to local imaging_and_segmentation_data.csv (only used with --local)')
+    args = parser.parse_args()
+
     base_results_dir = io.setup_base_directory_name("metric_computation")
-    df_features_all = compute_metrics(output_folder=base_results_dir)
\ No newline at end of file
+    df_features_all = compute_metrics(
+        output_folder=base_results_dir,
+        load_from_aws=not args.local,
+        local_imaging_csv=args.local_csv
+    )
diff --git a/README.md b/README.md
index 6333251..7ea05d2 100755
--- a/README.md
+++ b/README.md
@@ -36,6 +36,8 @@ Extracts per-Z-plane features from each movie: colony mask area and fluorescence

 **Dual-camera alignment**: The imaging system uses two cameras (Camera 1: brightfield + 638 nm; Camera 2: 488 nm + 561 nm). Since the all-cells segmentation mask is derived from brightfield (Camera 1), the mask is aligned to Camera 2 coordinates using the dual-camera calibration matrix before extracting intensity from 488/561 nm channels. Channels on the same camera as the mask do not require alignment.

+Output: `EMT_data_analysis/results/feature_extraction/Features_bf_colony_mask_*Data-ID*.csv`
+
 ## Step 2 — Metric computation

 ```bash
@@ -48,6 +50,12 @@ Compiles per-movie CSVs from Step 1 into a single manifest and computes gene-spe
 - **EOMES**: Time of maximum expression (peak of smoothed intensity curve)
 - **CDH1**: Time of inflection of E-cadherin expression (minimum of second derivative of smoothed intensity)

+Mean intensity is computed as total intensity divided by all-cells mask area, averaged over the bottom 10 Z-slices above the glass. Intensity curves are smoothed using a Savitzky-Golay filter (polynomial order 2, window length 10). Movies are processed in parallel using `joblib`.
+
+To load the imaging manifest from a local file instead of AWS:
+```bash
+python EMT_data_analysis/analysis_scripts/Metric_computation.py --local [--local-csv path/to/file.csv]
+```
 Output: `EMT_data_analysis/results/metric_computation/Image_analysis_extracted_features.csv`

 ## Step 3 — Nuclei localization
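For reference, the gene-expression metrics described in the README addition above reduce to a small per-movie pattern: sort the trace by timepoint, smooth the mean intensity with a Savitzky-Golay filter, locate the feature of interest (here the peak, as used for EOMES and TBXT), and convert the timepoint index to hours. The snippet below is an illustrative sketch only and is not part of the patch; the column names (`Data ID`, `Timepoint`, `mean_intensity`), the `polyorder=2, window_length=10` filter settings, and the 30-minute frame interval implied by the `t_50*(30/60)` conversion are taken from the diff, while the helper name `time_of_peak_expression` and the toy data are invented for this example.

```python
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter


def time_of_peak_expression(df_movie: pd.DataFrame, minutes_per_frame: float = 30.0) -> float:
    """Time (in hours) at which the smoothed mean-intensity trace of one movie peaks.

    Illustrative sketch of the peak-based metric (EOMES/TBXT style): smooth,
    find the maximum, then convert the timepoint index to hours.
    """
    df_movie = df_movie.sort_values("Timepoint")
    # Same filter settings as the diff; note older SciPy releases require an odd window_length.
    smooth = savgol_filter(df_movie["mean_intensity"].values, polyorder=2, window_length=10)
    t_peak = df_movie["Timepoint"].values[int(np.argmax(smooth))]
    return t_peak * (minutes_per_frame / 60.0)


# Toy trace peaking near timepoint 40 (about 20 h at 30-minute intervals)
rng = np.random.default_rng(0)
timepoints = np.arange(97)
trace = np.exp(-((timepoints - 40) / 15.0) ** 2) + rng.normal(0, 0.02, timepoints.size)
demo = pd.DataFrame({"Data ID": "demo", "Timepoint": timepoints, "mean_intensity": trace})
print(time_of_peak_expression(demo))  # ~20.0
```

The half-maximal SOX2 metric in the patch follows the same skeleton, differing only in the reference value (midpoint between the first-timepoint intensity and the curve minimum) and in searching for the first timepoint at or below that reference rather than the peak.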