From abe7d049baec6cb17c7062585f4a52a777cf9151 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 27 May 2025 05:23:46 +0000 Subject: [PATCH] wandb --- .gitignore | 3 + requirements.txt | 159 ++++++++++++++++------- results_metrics_finetune.json | 32 +++-- results_metrics_linear_probe.json | 33 +++-- src/models/model_comparison_baseline.py | 161 ++++++++++++++++++++---- 5 files changed, 289 insertions(+), 99 deletions(-) diff --git a/.gitignore b/.gitignore index e82aa1e..cfcf652 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,9 @@ simclr_linear_probe/ **/rng_state.pth **/training_args.bin +# Weights & Biases +wandb/ + # Python __pycache__/ *.py[cod] diff --git a/requirements.txt b/requirements.txt index d842fa4..6a2ce92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,60 +1,125 @@ -asttokens -colorama -comm +accelerate==1.7.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.0 +aiosignal==1.3.2 +annotated-types==0.7.0 +asttokens==3.0.0 +async-timeout==5.0.1 +attrs==25.3.0 +certifi==2025.4.26 +charset-normalizer==3.4.2 +click==8.1.8 +colorama==0.4.6 +comm==0.2.2 contourpy==1.3.0 cycler==0.12.1 -debugpy -decorator -exceptiongroup -executing +datasets==3.6.0 +debugpy==1.8.14 +decorator==5.2.1 +dill==0.3.8 +docker-pycreds==0.4.0 +eval_type_backport==0.2.2 +exceptiongroup==1.3.0 +executing==2.2.0 +filelock==3.18.0 fonttools==4.54.1 -importlib_metadata -ipykernel -ipython -jedi +frozenlist==1.6.0 +fsspec==2025.3.0 +gitdb==4.0.12 +GitPython==3.1.44 +hf-xet==1.1.2 +huggingface-hub==0.32.0 +idna==3.10 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +iniconfig==2.1.0 +ipykernel==6.29.5 +ipython==8.18.1 +jedi==0.19.2 +Jinja2==3.1.6 joblib==1.4.2 -jupyter_client -jupyter_core +jupyter_client==8.6.3 +jupyter_core==5.7.2 kiwisolver==1.4.7 +MarkupSafe==3.0.2 matplotlib==3.9.2 -matplotlib-inline -nest_asyncio -numpy -packaging +matplotlib-inline==0.1.7 +mpmath==1.3.0 +multidict==6.4.4 +multiprocess==0.70.16 +nest-asyncio==1.6.0 +networkx==3.2.1 +numpy==2.0.2 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +packaging==25.0 pandas==2.2.3 -parso -pickleshare +parso==0.8.4 +pexpect==4.9.0 +pickleshare==0.7.5 pillow==11.0.0 -platformdirs -prompt_toolkit -psutil -pure_eval -Pygments +platformdirs==4.3.8 +pluggy==1.6.0 +prompt_toolkit==3.0.51 +propcache==0.3.1 +protobuf==6.31.0 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==20.0.0 +pydantic==2.11.5 +pydantic_core==2.33.2 +Pygments==2.19.1 +pynvml==12.0.0 pyparsing==3.2.0 -python-dateutil +pytest==8.3.5 +python-dateutil==2.9.0.post0 pytz==2024.2 -pyzmq +PyYAML==6.0.2 +pyzmq==26.4.0 +regex==2024.11.6 +requests==2.32.3 +safetensors==0.5.3 scikit-learn==1.5.2 -scipy -setuptools==75.1.0 -six -stack-data +scipy==1.13.1 +seaborn==0.13.2 +sentry-sdk==2.29.1 +setproctitle==1.3.6 +six==1.17.0 +smmap==5.0.2 +stack-data==0.6.3 +sympy==1.14.0 +thop==0.1.1.post2209072238 threadpoolctl==3.5.0 -tornado -traitlets -typing_extensions +timm==1.0.15 +tokenizers==0.21.1 +tomli==2.2.1 +torch==2.7.0 +torchvision==0.22.0 +tornado==6.5.1 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.52.3 +triton==3.3.0 +typing-inspection==0.4.1 +typing_extensions==4.13.2 tzdata==2024.2 -wcwidth -wheel==0.44.0 -zipp -# External/ML packages -torch>=2.0.0 -torchvision>=0.15.0 -transformers>=4.36.0 -datasets>=2.14.0 -timm>=0.9.0 -seaborn>=0.12.0 -pynvml>=11.5.0 -thop>=0.1.1 -Pillow>=9.0.0 # included again for compatibility -accelerate \ No newline at end of file +urllib3==2.4.0 +wandb==0.19.11 +wcwidth==0.2.13 +xxhash==3.5.0 +yarl==1.20.0 +zipp==3.21.0 diff --git a/results_metrics_finetune.json b/results_metrics_finetune.json index 7b44e37..46f9038 100644 --- a/results_metrics_finetune.json +++ b/results_metrics_finetune.json @@ -1,18 +1,24 @@ { - "simclr": { - "peak_memory_mb": 989.1875, - "flops_giga": 4.131698688, - "train_time_seconds": 417.5733857154846, - "eval_time_seconds": 25.519327640533447, + "vit": { + "model_name": "vit", + "model_type": "vit", + "peak_memory_mb": 3645.1875, + "flops_giga": 16.862863872, + "train_time_seconds": 454.5931091308594, + "eval_time_seconds": 33.34872579574585, "eval_metrics": { - "eval_loss": 0.6880730390548706, - "eval_accuracy": 0.6410256410256411, - "eval_f1": 0.6350267379679144, - "eval_auc": 0.6370808678500987, - "eval_runtime": 25.5184, - "eval_samples_per_second": 3.057, - "eval_steps_per_second": 0.196, - "epoch": 3.0 + "eval_loss": 0.6044296026229858, + "eval_accuracy": 0.7435897435897436, + "eval_f1": 0.7420634920634921, + "eval_auc": 0.8060486522024982, + "eval_runtime": 33.3459, + "eval_samples_per_second": 2.339, + "eval_steps_per_second": 0.15, + "epoch": 3.0, + "model": "vit", + "phase": "finetune", + "gpu_memory_mb": 3645.1875, + "best_accuracy": 0.7435897435897436 } } } \ No newline at end of file diff --git a/results_metrics_linear_probe.json b/results_metrics_linear_probe.json index 8b7c417..f093390 100644 --- a/results_metrics_linear_probe.json +++ b/results_metrics_linear_probe.json @@ -1,18 +1,25 @@ { - "simclr": { - "peak_memory_mb": 1003.1875, - "flops_giga": 4.131698688, - "train_time_seconds": 128.30324125289917, - "eval_time_seconds": 25.598750591278076, + "vit": { + "model_name": "vit", + "model_type": "vit", + "phase": "linear_probe", + "peak_memory_mb": 3645.1875, + "flops_giga": 16.862863872, + "train_time_seconds": 186.49300956726074, + "eval_time_seconds": 40.83929514884949, "eval_metrics": { - "eval_loss": 0.6896047592163086, - "eval_accuracy": 0.5512820512820513, - "eval_f1": 0.49019607843137253, - "eval_auc": 0.5746219592373438, - "eval_runtime": 25.5978, - "eval_samples_per_second": 3.047, - "eval_steps_per_second": 0.195, - "epoch": 1.0 + "eval_loss": 0.7638523578643799, + "eval_accuracy": 0.46153846153846156, + "eval_f1": 0.415, + "eval_auc": 0.38067061143984227, + "eval_runtime": 40.8364, + "eval_samples_per_second": 1.91, + "eval_steps_per_second": 0.122, + "epoch": 1.0, + "model": "vit", + "phase": "linear_probe", + "gpu_memory_mb": 1165.1875, + "best_accuracy": 0.46153846153846156 } } } \ No newline at end of file diff --git a/src/models/model_comparison_baseline.py b/src/models/model_comparison_baseline.py index e3b2962..e11ea11 100644 --- a/src/models/model_comparison_baseline.py +++ b/src/models/model_comparison_baseline.py @@ -8,8 +8,6 @@ # Environment Setup import os -os.environ["WANDB_DISABLED"] = "true" # Disable Weights & Biases logging - # Standard Libraries import io import json @@ -40,6 +38,8 @@ ) from datasets import load_dataset, ClassLabel +# Weights & Biases +import wandb # Metrics from sklearn.metrics import ( @@ -75,7 +75,6 @@ # GPU Memory Monitoring (optional) try: import pynvml - pynvml.nvmlInit() GPU_AVAILABLE = True except ImportError: @@ -91,13 +90,63 @@ ) os.environ["HF_HOME"] = os.getenv("HF_HOME", "~/.cache/huggingface") +class WandbCallback(TrainerCallback): + def __init__(self, model_name, phase): + self.model_name = model_name + self.phase = phase + self.best_accuracy = 0.0 + + def on_log(self, args, state, control, logs=None, **kwargs): + if logs is not None: + # Add model name and phase to logs + logs["model"] = self.model_name + logs["phase"] = self.phase + + # Track GPU memory if available + if GPU_AVAILABLE: + logs["gpu_memory_mb"] = get_gpu_memory() + + # Log to wandb + wandb.log(logs) + + def on_evaluate(self, args, state, control, metrics=None, **kwargs): + if metrics is not None: + # Track best accuracy + if "eval_accuracy" in metrics: + self.best_accuracy = max(self.best_accuracy, metrics["eval_accuracy"]) + metrics["best_accuracy"] = self.best_accuracy + + # Log evaluation metrics + wandb.log(metrics) + +def main(num_train_images=5000, proportion_per_transform=0.2, resolution=224): + + # Simple batch size configuration + batch_size = 64 + # Initialize wandb + wandb_config = { + "num_train_images": num_train_images, + "proportion_per_transform": proportion_per_transform, + "resolution": resolution, + "batch_size": batch_size, + "num_epochs": 3, + "warmup_steps": 500, + "weight_decay": 0.01, + "gpu_available": GPU_AVAILABLE, + } + + wandb.init( + entity="ericcui-use-stanford-university", + project="CS231N Test", + config=wandb_config, + tags=["baseline", "model-comparison"] + ) -def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): models = [ - # {"name": "vit", "model_id": "google/vit-base-patch16-224", "type": "vit"}, + {"name": "vit", "model_id": "google/vit-base-patch16-224", "type": "vit"}, # {"name": "dinov2", "model_id": "facebook/dinov2-base", "type": "dinov2"}, - {"name": "simclr", "model_id": "resnet50", "type": "simclr"}, + # {"name": "simclr", "model_id": "resnet50", "type": "simclr"}, ] results = {m["name"]: {} for m in models} @@ -116,6 +165,7 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): i for i, label in enumerate(dataset["label"]) if str(label) in FILTERED_CLASSES # Convert to string for comparison ] + # Select only those indices dataset = dataset.select(filtered_indices) print(f"Number of images after filtering for classes {FILTERED_CLASSES}: {len(dataset)}") @@ -270,8 +320,8 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): train_args = TrainingArguments( output_dir=os.path.join(env_path("TRAIN_OUTPUT_DIR", "."), f"{name}"), num_train_epochs=3, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, warmup_steps=500, weight_decay=0.01, logging_dir=os.path.join(env_path("LOG_DIR", "."), f"{name}"), @@ -294,7 +344,8 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): log_dir=env_path("LOG_DIR", "./logs"), phase="finetune", model_name=name, - ) + ), + WandbCallback(name, "finetune"), ], ) @@ -302,6 +353,12 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): start_time = time.time() peak_memory = get_gpu_memory() if GPU_AVAILABLE else -1 + # Log model architecture + if typ in HF_MODELS: + wandb.watch(model, log="all", log_freq=100) + elif typ == SSL_MODEL: + wandb.watch(model.backbone, log="all", log_freq=100) + trainer.train() current_memory = get_gpu_memory() if GPU_AVAILABLE else -1 @@ -312,6 +369,18 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): eval_time = time.time() - eval_start_time train_time = time.time() - start_time - eval_time + # Log model-specific metrics + model_metrics = { + "model_name": name, + "model_type": typ, + "peak_memory_mb": peak_memory, + "flops_giga": flops, + "train_time_seconds": train_time, + "eval_time_seconds": eval_time, + "eval_metrics": eval_results, + } + wandb.log(model_metrics) + model_dir = os.path.join( env_path("MODEL_DIR", "."), f"{name}" ) @@ -334,19 +403,30 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): f, ) - results[name] = { - "peak_memory_mb": peak_memory, - "flops_giga": flops, - "train_time_seconds": train_time, - "eval_time_seconds": eval_time, - "eval_metrics": eval_results, - } + # Save model as wandb artifact + artifact = wandb.Artifact( + name=f"{name}_model", + type="model", + description=f"Trained {name} model with {typ} architecture" + ) + artifact.add_dir(model_dir) + wandb.log_artifact(artifact) + + results[name] = model_metrics print( f"[Finetune] {name}: {results[name]}" ) # ---- LINEAR PROBE PHASE ---- + # Create a new wandb run for linear probe + wandb.init( + project="model-comparison-baseline", + config=wandb_config, + tags=["baseline", "model-comparison", "linear-probe"], + name=f"{name}_linear_probe" + ) + if typ == "vit": model = ViTForImageClassification.from_pretrained( model_id, num_labels=NUM_FILTERED_CLASSES, ignore_mismatched_sizes=True @@ -362,14 +442,20 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): model.to(device) freeze_backbone(model, typ) + # Log model architecture for linear probe + if typ in HF_MODELS: + wandb.watch(model, log="all", log_freq=100) + elif typ == SSL_MODEL: + wandb.watch(model.backbone, log="all", log_freq=100) + linear_args = TrainingArguments( output_dir=os.path.join( env_path("TRAIN_OUTPUT_DIR", "."), f"{name}_linear_probe", ), num_train_epochs=1, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, warmup_steps=100, weight_decay=0.01, logging_dir=os.path.join( @@ -394,7 +480,8 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): log_dir=env_path("LOG_DIR", "./logs"), phase="linear_probe", model_name=name, - ) + ), + WandbCallback(name, "linear_probe"), ], ) @@ -409,6 +496,19 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): eval_time = time.time() - eval_start_time train_time = time.time() - start_time - eval_time + # Log linear probe metrics + linear_probe_metrics = { + "model_name": name, + "model_type": typ, + "phase": "linear_probe", + "peak_memory_mb": peak_memory, + "flops_giga": flops, + "train_time_seconds": train_time, + "eval_time_seconds": eval_time, + "eval_metrics": eval_results, + } + wandb.log(linear_probe_metrics) + model_dir = os.path.join( env_path("MODEL_DIR", "."), f"{name}_linear_probe" ) @@ -431,18 +531,27 @@ def main(num_train_images=1000, proportion_per_transform=0.2, resolution=224): f, ) - results_linear_probe[name] = { - "peak_memory_mb": peak_memory, - "flops_giga": flops, - "train_time_seconds": train_time, - "eval_time_seconds": eval_time, - "eval_metrics": eval_results, - } + # Save linear probe model as wandb artifact + artifact = wandb.Artifact( + name=f"{name}_linear_probe_model", + type="model", + description=f"Linear probe {name} model with {typ} architecture" + ) + artifact.add_dir(model_dir) + wandb.log_artifact(artifact) + + results_linear_probe[name] = linear_probe_metrics print( f"[LinearProbe] {name}: {results_linear_probe[name]}" ) + # Close the wandb run for linear probe + wandb.finish() + + # Close the main wandb run + wandb.finish() + with open( os.path.join( env_path("TRAIN_OUTPUT_DIR", "."), "results_metrics_finetune.json"