Skip to content

Commit a87dd35

Browse files
committed
migration script for chebi data for new data restructure
1 parent f747257 commit a87dd35

File tree

2 files changed

+183
-0
lines changed

2 files changed

+183
-0
lines changed

chebai/preprocessing/migration/__init__.py

Whitespace-only changes.
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import argparse
2+
import os
3+
import shutil
4+
from typing import Dict, List, Tuple, Type
5+
6+
import pandas as pd
7+
import torch
8+
9+
from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
10+
11+
12+
class ChebiDataMigration:
    """Migrate ChEBI data from the old on-disk layout to the new one.

    The old layout keeps one file per split (``train``, ``validation``,
    ``test``) below ``data/<dataset name>/chebi_v<version>/{raw,processed}``.
    The migration merges those per-split files into a single data file (plus a
    split-assignment table for the raw data) and writes the result into the
    directories reported by the dataset instance (``raw_dir``,
    ``processed_dir_main`` and ``processed_dir``).
    """

    __MODULE_PATH: str = "chebai.preprocessing.datasets.chebi"
    __DATA_ROOT_DIR: str = "data"

    def __init__(self, chebi_version, class_name: str):
        """Create the migration helper for one dataset class and ChEBI version.

        Args:
            chebi_version: ChEBI release number the data belongs to.
            class_name: Name of the dataset class to look up in
                ``chebai.preprocessing.datasets.chebi``.
        """
        self._chebi_version: int = chebi_version
        # Dataset *instance* (not the class) built for the new data structure;
        # it is the source of all new-layout directories used below.
        self._chebi_cls: "_ChEBIDataExtractor" = self._dynamic_import_chebi_cls(
            class_name, chebi_version
        )
        self._class_path: str = class_name

    def _get_old_dir_structure(self):
        """Return the base directory of the old data layout."""
        return self._old_base_dir

    @classmethod
    def _dynamic_import_chebi_cls(cls, class_name: str, chebi_version: int):
        """Import the dataset class by name and instantiate it.

        Args:
            class_name: Attribute name of the dataset class inside the ChEBI
                datasets module; surrounding whitespace is ignored.
            chebi_version: ChEBI release number handed to the constructor.

        Returns:
            An instance of the requested dataset class.

        Raises:
            AttributeError: If the module has no attribute ``class_name``.
        """
        import importlib

        class_name = class_name.strip()
        module = importlib.import_module(cls.__MODULE_PATH)
        _class = getattr(module, class_name)
        # The version is a named setting, so pass it as a keyword argument;
        # handing over a dict positionally would bind it to whatever the
        # constructor's first parameter happens to be.
        return _class(chebi_version=chebi_version)

    def migrate(self):
        """Run the complete migration: raw data first, then processed data."""
        os.makedirs(self._chebi_cls.base_dir, exist_ok=True)
        self._migrate_old_raw_data()
        self._migrate_old_processed_data()

    def _migrate_old_raw_data(self):
        """Copy the raw files and merge the old ``.pkl`` splits.

        Copies ``chebi.obo`` into the new raw directory and ``classes.txt``
        into the new main processed directory, then writes the merged splits
        as ``data.pkl`` and their split assignment as ``splits.csv`` into the
        main processed directory.

        Raises:
            FileNotFoundError: If a required old file is missing.
        """
        processed_dir_main = self._chebi_cls.processed_dir_main

        self._copy_file(self._old_raw_dir, self._chebi_cls.raw_dir, "chebi.obo")
        self._copy_file(self._old_raw_dir, processed_dir_main, "classes.txt")

        old_splits_file_names = {
            "train": "train.pkl",
            "validation": "validation.pkl",
            "test": "test.pkl",
        }
        data_df, split_ass_df = self._combine_pkl_splits(
            self._old_raw_dir, old_splits_file_names
        )

        os.makedirs(processed_dir_main, exist_ok=True)
        data_df.to_pickle(os.path.join(processed_dir_main, "data.pkl"))
        # The frame was built with ignore_index=True, so its index carries no
        # information and is left out of the CSV.
        split_ass_df.to_csv(
            os.path.join(processed_dir_main, "splits.csv"), index=False
        )

    def _migrate_old_processed_data(self):
        """Merge the old ``.pt`` splits into one file in the new processed dir.

        Raises:
            FileNotFoundError: If one of the old ``.pt`` split files is missing.
        """
        old_splits_file_names = {
            "train": "train.pt",
            "validation": "validation.pt",
            "test": "test.pt",
        }

        data_df = self._combine_pt_splits(
            self._old_processed_dir, old_splits_file_names
        )

        processed_dir = self._chebi_cls.processed_dir
        os.makedirs(processed_dir, exist_ok=True)
        # torch.save needs a file path, not the directory itself.
        # NOTE(review): "data.pt" mirrors the "data.pkl" name used for the raw
        # data -- confirm it matches the file name the dataset class reads.
        torch.save(data_df, os.path.join(processed_dir, "data.pt"))

    def _combine_pt_splits(
        self, old_dir: str, old_splits_file_names: Dict[str, str]
    ) -> pd.DataFrame:
        """Load every old ``.pt`` split and concatenate them into one frame.

        Args:
            old_dir: Directory that contains the old split files.
            old_splits_file_names: Mapping of split name to file name.

        Returns:
            All rows of all splits, in mapping order, with a fresh index.

        Raises:
            FileNotFoundError: If any of the split files is missing.
        """
        self._check_if_old_splits_exists(old_dir, old_splits_file_names)

        df_list: List[pd.DataFrame] = [
            pd.DataFrame(torch.load(os.path.join(old_dir, file_name)))
            for file_name in old_splits_file_names.values()
        ]
        return pd.concat(df_list, ignore_index=True)

    def _combine_pkl_splits(
        self, old_dir: str, old_splits_file_names: Dict[str, str]
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load every old ``.pkl`` split and build the split assignment.

        Args:
            old_dir: Directory that contains the old split files.
            old_splits_file_names: Mapping of split name to file name.

        Returns:
            A tuple ``(data, split_assignment)``: ``data`` holds the rows of
            all splits with an added ``split`` column, ``split_assignment``
            holds the columns ``id`` and ``split``.

        Raises:
            FileNotFoundError: If any of the split files is missing.
        """
        self._check_if_old_splits_exists(old_dir, old_splits_file_names)

        df_list: List[pd.DataFrame] = []
        split_assignment_list: List[pd.DataFrame] = []

        for split, file_name in old_splits_file_names.items():
            file_path = os.path.join(old_dir, file_name)
            file_df = pd.DataFrame(self._chebi_cls._load_data_from_file(file_path))
            file_df["split"] = split  # Assign the split label to the DataFrame
            df_list.append(file_df)

            # Create split assignment for the current DataFrame
            split_assignment_list.append(
                pd.DataFrame({"id": file_df["id"], "split": split})
            )

        # Concatenate all dataframes and split assignments
        combined_df = pd.concat(df_list, ignore_index=True)
        combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True)

        return combined_df, combined_split_assignment

    @staticmethod
    def _check_if_old_splits_exists(old_dir, old_splits_file_names):
        """Ensure every split file exists in ``old_dir``.

        Args:
            old_dir: Directory expected to contain the split files.
            old_splits_file_names: Mapping of split name to file name.

        Raises:
            FileNotFoundError: Naming every split file that is missing.
        """
        missing = [
            file
            for file in old_splits_file_names.values()
            if not os.path.isfile(os.path.join(old_dir, file))
        ]
        if missing:
            raise FileNotFoundError(
                f"Split file(s) {missing} don't exist "
                f"in old data-folder structure: {old_dir}"
            )

    @staticmethod
    def _copy_file(old_file_dir, new_file_dir, file_name):
        """Copy ``file_name`` from the old directory into the new directory.

        The target directory is created if needed. An already existing target
        file is left untouched.

        Args:
            old_file_dir: Directory that holds the file in the old layout.
            new_file_dir: Directory the file is copied into.
            file_name: Name of the file to copy.

        Raises:
            FileNotFoundError: If the file does not exist in ``old_file_dir``.
        """
        os.makedirs(new_file_dir, exist_ok=True)
        new_file_path = os.path.join(new_file_dir, file_name)
        if os.path.isfile(new_file_path):
            print(f"File {new_file_path} already exists in new data-folder structure")
            return

        old_file_path = os.path.join(old_file_dir, file_name)
        if not os.path.isfile(old_file_path):
            raise FileNotFoundError(
                f"File {old_file_path} doesn't exists in old data-folder structure"
            )

        shutil.copy2(os.path.abspath(old_file_path), os.path.abspath(new_file_path))
        print(f"Copied from {old_file_path} to {new_file_path}")

    @property
    def _old_base_dir(self):
        """Base directory of the old layout for this dataset and version."""
        return os.path.join(
            self.__DATA_ROOT_DIR,
            self._chebi_cls._name,
            f"chebi_v{self._chebi_cls.chebi_version}",
        )

    @property
    def _old_processed_dir(self):
        """Directory of the old processed data.

        The dataset's ``identifier`` entries become nested sub-directories;
        a ``single_<class>`` directory is appended when ``single_class`` is set.
        """
        res = os.path.join(
            self._old_base_dir,
            "processed",
            *self._chebi_cls.identifier,
        )
        if self._chebi_cls.single_class is None:
            return res
        return os.path.join(res, f"single_{self._chebi_cls.single_class}")

    @property
    def _old_raw_dir(self):
        """name of dir where the raw data is stored"""
        return os.path.join(self._old_base_dir, "raw")
159+
160+
161+
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and, when a dataset class
    # is named, run the migration for the given ChEBI version.
    parser = argparse.ArgumentParser(
        description="Migrate ChEBI dataset to new structure and handle splits."
    )
    # NOTE(review): old_directory, new_directory and --split_file_path are
    # still accepted so existing invocations keep parsing, but nothing below
    # reads them -- ChebiDataMigration derives all paths from the dataset
    # class. Confirm whether they should be wired in or removed.
    parser.add_argument(
        "old_directory", type=str, help="Path to the old directory structure"
    )
    parser.add_argument(
        "new_directory", type=str, help="Path to the new directory structure"
    )
    parser.add_argument("chebi_version", type=int, help="Data Version related to chebi")
    parser.add_argument(
        "--split_file_path",
        type=str,
        help="Path to the CSV file with split configuration",
        default=None,
    )
    parser.add_argument(
        "--class_name",
        type=str,
        help="Name of the dataset class in chebai.preprocessing.datasets.chebi "
        "whose data should be migrated",
        default=None,
    )
    args = parser.parse_args()

    if args.class_name is None:
        # Without a dataset class there is nothing to migrate; say so instead
        # of exiting silently.
        print("No --class_name given: nothing was migrated.")
    else:
        ChebiDataMigration(args.chebi_version, args.class_name).migrate()

    # Usage:
    # python migration_script.py path/to/old_directory path/to/new_directory <chebi_version> --class_name <ClassName>
    # python migration_script.py path/to/old_directory path/to/new_directory <chebi_version> --class_name <ClassName> --split_file_path path/to/split_configuration.csv

0 commit comments

Comments
 (0)