Skip to content

Commit d8e68cc

Browse files
committed
argparser + fixes
1 parent a87dd35 commit d8e68cc

File tree

1 file changed

+60
-32
lines changed

1 file changed

+60
-32
lines changed

chebai/preprocessing/migration/chebi_data_migration.py

Lines changed: 60 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -13,51 +13,70 @@ class ChebiDataMigration:
1313
__MODULE_PATH: str = "chebai.preprocessing.datasets.chebi"
1414
__DATA_ROOT_DIR: str = "data"
1515

16-
def __init__(self, chebi_version, class_name: str):
17-
self._chebi_version: int = chebi_version
16+
def __init__(self, class_name: str, chebi_version: int, single_class: int = None):
1817
# Chebi class instance according to new data structure
1918
self._chebi_cls: Type[_ChEBIDataExtractor] = self._dynamic_import_chebi_cls(
20-
class_name, chebi_version
21-
)
22-
self._class_path: str = class_name
23-
24-
def _get_old_dir_structure(self):
25-
base_dir = os.path.join(
26-
self.__DATA_ROOT_DIR,
27-
self._chebi_cls._name,
28-
f"chebi_v{self._chebi_cls.chebi_version}",
19+
class_name, chebi_version, single_class
2920
)
21+
self._chebi_version: int = chebi_version
22+
self._single_class: int = single_class
23+
self._class_name: str = class_name
3024

3125
@classmethod
32-
def _dynamic_import_chebi_cls(cls, class_name: str, chebi_version: int):
26+
def _dynamic_import_chebi_cls(
27+
cls, class_name: str, chebi_version: int, single_class: int
28+
) -> Type[_ChEBIDataExtractor]:
3329
class_name = class_name.strip()
3430
module = __import__(cls.__MODULE_PATH, fromlist=[class_name])
3531
_class = getattr(module, class_name)
36-
return _class({"chebi_version": chebi_version})
32+
return _class(**{"chebi_version": chebi_version, "single_class": single_class})
3733

3834
def migrate(self):
3935
os.makedirs(self._chebi_cls.base_dir, exist_ok=True)
36+
print("Migration started..................")
37+
self._migrate_old_raw_data()
4038
self._migrate_old_processed_data()
39+
print("Migration completed..................")
4140

4241
def _migrate_old_raw_data(self):
42+
print("-" * 50)
43+
print("Migrating old raw Data.....................")
44+
4345
self._copy_file(self._old_raw_dir, self._chebi_cls.raw_dir, "chebi.obo")
4446
self._copy_file(
4547
self._old_raw_dir, self._chebi_cls.processed_dir_main, "classes.txt"
4648
)
49+
4750
old_splits_file_names = {
4851
"train": "train.pkl",
4952
"validation": "validation.pkl",
5053
"test": "test.pkl",
5154
}
52-
data_df, split_ass_df = self._combine_splits(
55+
data_file_path = os.path.join(self._chebi_cls.processed_dir_main, "data.pkl")
56+
if os.path.isfile(data_file_path):
57+
print(f"File {data_file_path} already exists in new data-folder structure")
58+
return
59+
60+
data_df, split_ass_df = self._combine_pkl_splits(
5361
self._old_raw_dir, old_splits_file_names
5462
)
55-
data_df.to_pickle(os.path.join(self._chebi_cls.processed_dir_main, "data.pkl"))
56-
split_ass_df.to_csv(
57-
os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
58-
)
63+
64+
data_df.to_pickle(data_file_path)
65+
print(f"File {data_file_path} saved to new data-folder structure")
66+
67+
split_file = os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
68+
split_ass_df.to_csv(split_file)
69+
print(f"File {split_file} saved to new data-folder structure")
5970

6071
def _migrate_old_processed_data(self):
72+
print("-" * 50)
73+
print("Migrating old processed data.....................")
74+
75+
data_file_path = os.path.join(self._chebi_cls.processed_dir, "data.pt")
76+
if os.path.isfile(data_file_path):
77+
print(f"File {data_file_path} already exists in new data-folder structure")
78+
return
79+
6180
old_splits_file_names = {
6281
"train": "train.pt",
6382
"validation": "validation.pt",
@@ -67,13 +86,16 @@ def _migrate_old_processed_data(self):
6786
data_df = self._combine_pt_splits(
6887
self._old_processed_dir, old_splits_file_names
6988
)
70-
torch.save(data_df, self._chebi_cls.processed_dir)
89+
90+
torch.save(data_df, data_file_path)
91+
print(f"File {data_file_path} saved to new data-folder structure")
7192

7293
def _combine_pt_splits(
7394
self, old_dir: str, old_splits_file_names: Dict[str, str]
7495
) -> pd.DataFrame:
7596
self._check_if_old_splits_exists(old_dir, old_splits_file_names)
7697

98+
print("Combinig `.pt` splits...")
7799
df_list: List[pd.DataFrame] = []
78100
for split, file_name in old_splits_file_names.items():
79101
file_path = os.path.join(old_dir, file_name)
@@ -90,14 +112,15 @@ def _combine_pkl_splits(
90112
df_list: List[pd.DataFrame] = []
91113
split_assignment_list: List[pd.DataFrame] = []
92114

115+
print("Combining `.pkl` splits...")
93116
for split, file_name in old_splits_file_names.items():
94117
file_path = os.path.join(old_dir, file_name)
95-
file_df = pd.DataFrame(self._chebi_cls._load_data_from_file(file_path))
118+
file_df = pd.DataFrame(self._chebi_cls._load_data_from_file(path=file_path))
96119
file_df["split"] = split # Assign the split label to the DataFrame
97120
df_list.append(file_df)
98121

99122
# Create split assignment for the current DataFrame
100-
split_assignment = pd.DataFrame({"id": file_df["id"], "split": split})
123+
split_assignment = pd.DataFrame({"id": file_df["ident"], "split": split})
101124
split_assignment_list.append(split_assignment)
102125

103126
# Concatenate all dataframes and split assignments
@@ -137,7 +160,9 @@ def _copy_file(old_file_dir, new_file_dir, file_name):
137160
@property
138161
def _old_base_dir(self):
139162
return os.path.join(
140-
"data", self._chebi_cls._name, f"chebi_v{self._chebi_cls.chebi_version}"
163+
self.__DATA_ROOT_DIR,
164+
self._chebi_cls._name,
165+
f"chebi_v{self._chebi_cls.chebi_version}",
141166
)
142167

143168
@property
@@ -163,21 +188,24 @@ def _old_raw_dir(self):
163188
description="Migrate ChEBI dataset to new structure and handle splits."
164189
)
165190
parser.add_argument(
166-
"old_directory", type=str, help="Path to the old directory structure"
191+
"--chebi_class",
192+
type=str,
193+
required=True,
194+
help="Chebi class name from the `chebai/preprocessing/datasets/chebi.py`",
167195
)
168196
parser.add_argument(
169-
"new_directory", type=str, help="Path to the new directory structure"
197+
"--chebi_version", type=int, required=True, help="Chebi data version"
170198
)
171199
parser.add_argument(
172-
"--split_file_path",
173-
type=str,
174-
help="Path to the CSV file with split configuration",
200+
"--single_class",
201+
type=int,
202+
help="The ID of the single class to predict",
175203
default=None,
176204
)
177-
parser.add_argument("chebi_version", type=int, help="Data Version related to chebi")
178205
args = parser.parse_args()
179206

180-
# main(args.old_directory, args.new_directory, args.split_file_path)
181-
182-
# python migration_script.py path/to/old_directory path/to/new_directory --split_file_path path/to/split_configuration.csv
183-
# python migration_script.py path/to/old_directory path/to/new_directory
207+
ChebiDataMigration(
208+
class_name=args.chebi_class,
209+
chebi_version=args.chebi_version,
210+
single_class=args.single_class,
211+
).migrate()

0 commit comments

Comments
 (0)