example.py
import os
import subprocess
import zipfile
from pathlib import Path
from src.data_loading import load_data, MultiChannelDataset
from src.masks import MultiBlockMaskCollator
from src.transforms import make_transforms_rgb, make_transforms


def download_dataset_from_kaggle(dataset_name, download_path):
    """Download dataset from Kaggle if it doesn't exist."""
    print(f"Dataset not found. Downloading from Kaggle: {dataset_name}")

    # Create download directory if it doesn't exist
    os.makedirs(download_path, exist_ok=True)

    # Download dataset using the kaggle CLI
    try:
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_name, "-p", download_path],
            check=True
        )
        print(f"Dataset downloaded to {download_path}")

        # Unzip the dataset
        zip_files = list(Path(download_path).glob("*.zip"))
        for zip_file in zip_files:
            print(f"Extracting {zip_file}...")
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(download_path)
            # Remove zip file after extraction
            zip_file.unlink()

        print("Extraction complete!")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset: {e}")
        raise
    except Exception as e:
        print(f"Error processing dataset: {e}")
        raise


def main():
    # Define paths
    dataset_root = '/teamspace/studios/this_studio/JEPA/data/BEN_14k'
    download_path = '/teamspace/studios/this_studio/JEPA/data'
    kaggle_dataset = "narendraaironi/bigearthnet-14k"

    # Check if dataset exists; if not, download it
    if not os.path.exists(dataset_root):
        download_dataset_from_kaggle(kaggle_dataset, download_path)
    else:
        print(f"Dataset already exists at {dataset_root}")
    # BigEarthNet-S1 (Sentinel-1) patches carry two channels (VV/VH),
    # hence num_channels=2 in the transforms below.
    bigearthnet_s1_root = os.path.join(dataset_root, 'BigEarthNet-S1')

    train_ds = MultiChannelDataset(
        root=bigearthnet_s1_root,
        split='train',
        transform=make_transforms(num_channels=2),
    )
    print(f"Dataset length: {len(train_ds)}")

    sample = train_ds[0]
    print(train_ds.metadata[0])
    print(f"Sample shape: {sample.shape}")

    data_loader = load_data(
        root=bigearthnet_s1_root,
        split='train',
        batch_size=4,
        shuffle=True,
        num_workers=2,
        collate_fn=MultiBlockMaskCollator()
    )
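
    # Hypothetical usage sketch (not part of the original script): pull one
    # batch from the loader. The exact structure yielded when using
    # MultiBlockMaskCollator is an assumption here -- I-JEPA-style collators
    # typically return (images, encoder_masks, predictor_masks); check
    # src/masks.py for the actual contract before relying on this.
    #
    # for images, enc_masks, pred_masks in data_loader:
    #     print(f"Batch shape: {images.shape}")
    #     print(f"Encoder masks per sample: {len(enc_masks)}")
    #     print(f"Predictor masks per sample: {len(pred_masks)}")
    #     break
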

if __name__ == "__main__":
    main()