@@ -13,51 +13,70 @@ class ChebiDataMigration:
1313 __MODULE_PATH : str = "chebai.preprocessing.datasets.chebi"
1414 __DATA_ROOT_DIR : str = "data"
1515
def __init__(self, class_name: str, chebi_version: int, single_class: int = None):
    """Bind the migration helper to one ChEBI dataset configuration.

    Args:
        class_name: Name of the dataset class; handed to
            ``_dynamic_import_chebi_cls`` for lookup and instantiation.
        chebi_version: ChEBI data version; forwarded to the dataset class
            and stored on this object.
        single_class: Value forwarded to the dataset class as its
            ``single_class`` argument. Defaults to ``None``, which is
            passed through unchanged.
    """
    # Dataset object following the new data structure. The factory returns
    # an *instance* (it calls the class), so the attribute is annotated with
    # the instance type rather than ``Type[...]``.
    self._chebi_cls: _ChEBIDataExtractor = self._dynamic_import_chebi_cls(
        class_name, chebi_version, single_class
    )
    self._chebi_version: int = chebi_version
    # int or None: mirrors the constructor default, hence no bare ``int`` annotation.
    self._single_class = single_class
    self._class_name: str = class_name
3024
@classmethod
def _dynamic_import_chebi_cls(
    cls, class_name: str, chebi_version: int, single_class: int
) -> "_ChEBIDataExtractor":
    """Import a dataset class by name and return a configured instance of it.

    The class is looked up in the module named by ``cls.__MODULE_PATH``.

    Args:
        cls: The class providing ``__MODULE_PATH``.
        class_name: Name of the class to instantiate; surrounding
            whitespace is stripped before the lookup.
        chebi_version: Passed to the class as keyword ``chebi_version``.
        single_class: Passed to the class as keyword ``single_class``.

    Returns:
        The object produced by calling the looked-up class (an instance,
        not the class itself).

    Raises:
        ImportError: If the module at ``__MODULE_PATH`` cannot be imported.
        AttributeError: If the module has no attribute ``class_name``.
    """
    # Local import keeps this block self-contained; importlib is the
    # documented replacement for calling ``__import__`` directly.
    import importlib

    class_name = class_name.strip()
    module = importlib.import_module(cls.__MODULE_PATH)
    try:
        _class = getattr(module, class_name)
    except AttributeError as err:
        # Same exception type as before, but with a message that names both
        # the missing class and the module that was searched.
        raise AttributeError(
            f"Class `{class_name}` not found in module `{cls.__MODULE_PATH}`"
        ) from err
    return _class(chebi_version=chebi_version, single_class=single_class)
3733
def migrate(self):
    """Run the whole migration.

    Ensures the dataset's ``base_dir`` exists, then runs the raw-data step
    followed by the processed-data step, printing a banner before and after.
    """
    os.makedirs(self._chebi_cls.base_dir, exist_ok=True)

    print("Migration started..................")
    # Raw data is migrated before processed data.
    for step in (self._migrate_old_raw_data, self._migrate_old_processed_data):
        step()
    print("Migration completed..................")
4140
def _migrate_old_raw_data(self):
    """Move the old raw-data files into the new folder layout.

    Copies ``chebi.obo`` into the dataset's ``raw_dir`` and ``classes.txt``
    into its ``processed_dir_main``. If ``data.pkl`` is not yet present in
    ``processed_dir_main``, the old train/validation/test pickles are
    combined via ``_combine_pkl_splits`` and written there as ``data.pkl``
    together with the split assignment as ``splits.csv``. When ``data.pkl``
    already exists, the method reports that and returns without combining.
    """
    print("-" * 50)
    print("Migrating old raw Data.....................")

    # (attribute on the dataset object naming the target folder, file to copy)
    copy_plan = (
        ("raw_dir", "chebi.obo"),
        ("processed_dir_main", "classes.txt"),
    )
    for target_attr, file_name in copy_plan:
        self._copy_file(
            self._old_raw_dir, getattr(self._chebi_cls, target_attr), file_name
        )

    data_file_path = os.path.join(self._chebi_cls.processed_dir_main, "data.pkl")
    if os.path.isfile(data_file_path):
        print(f"File {data_file_path} already exists in new data-folder structure")
        return

    # Old layout: one pickle per split, named "<split>.pkl".
    old_splits_file_names = {
        split: f"{split}.pkl" for split in ("train", "validation", "test")
    }
    combined = self._combine_pkl_splits(self._old_raw_dir, old_splits_file_names)
    data_df, split_ass_df = combined

    data_df.to_pickle(data_file_path)
    print(f"File {data_file_path} saved to new data-folder structure")

    split_file = os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
    split_ass_df.to_csv(split_file)
    print(f"File {split_file} saved to new data-folder structure")
5970
6071 def _migrate_old_processed_data (self ):
72+ print ("-" * 50 )
73+ print ("Migrating old processed data....................." )
74+
75+ data_file_path = os .path .join (self ._chebi_cls .processed_dir , "data.pt" )
76+ if os .path .isfile (data_file_path ):
77+ print (f"File { data_file_path } already exists in new data-folder structure" )
78+ return
79+
6180 old_splits_file_names = {
6281 "train" : "train.pt" ,
6382 "validation" : "validation.pt" ,
@@ -67,13 +86,16 @@ def _migrate_old_processed_data(self):
6786 data_df = self ._combine_pt_splits (
6887 self ._old_processed_dir , old_splits_file_names
6988 )
70- torch .save (data_df , self ._chebi_cls .processed_dir )
89+
90+ torch .save (data_df , data_file_path )
91+ print (f"File { data_file_path } saved to new data-folder structure" )
7192
7293 def _combine_pt_splits (
7394 self , old_dir : str , old_splits_file_names : Dict [str , str ]
7495 ) -> pd .DataFrame :
7596 self ._check_if_old_splits_exists (old_dir , old_splits_file_names )
7697
98+ print ("Combinig `.pt` splits..." )
7799 df_list : List [pd .DataFrame ] = []
78100 for split , file_name in old_splits_file_names .items ():
79101 file_path = os .path .join (old_dir , file_name )
@@ -90,14 +112,15 @@ def _combine_pkl_splits(
90112 df_list : List [pd .DataFrame ] = []
91113 split_assignment_list : List [pd .DataFrame ] = []
92114
115+ print ("Combining `.pkl` splits..." )
93116 for split , file_name in old_splits_file_names .items ():
94117 file_path = os .path .join (old_dir , file_name )
95- file_df = pd .DataFrame (self ._chebi_cls ._load_data_from_file (file_path ))
118+ file_df = pd .DataFrame (self ._chebi_cls ._load_data_from_file (path = file_path ))
96119 file_df ["split" ] = split # Assign the split label to the DataFrame
97120 df_list .append (file_df )
98121
99122 # Create split assignment for the current DataFrame
100- split_assignment = pd .DataFrame ({"id" : file_df ["id " ], "split" : split })
123+ split_assignment = pd .DataFrame ({"id" : file_df ["ident " ], "split" : split })
101124 split_assignment_list .append (split_assignment )
102125
103126 # Concatenate all dataframes and split assignments
@@ -137,7 +160,9 @@ def _copy_file(old_file_dir, new_file_dir, file_name):
@property
def _old_base_dir(self):
    """Base folder of the old layout: ``<data root>/<dataset _name>/chebi_v<version>``."""
    path_parts = (
        self.__DATA_ROOT_DIR,
        self._chebi_cls._name,
        f"chebi_v{self._chebi_cls.chebi_version}",
    )
    return os.path.join(*path_parts)
142167
143168 @property
@@ -163,21 +188,24 @@ def _old_raw_dir(self):
163188 description = "Migrate ChEBI dataset to new structure and handle splits."
164189 )
165190 parser .add_argument (
166- "old_directory" , type = str , help = "Path to the old directory structure"
191+ "--chebi_class" ,
192+ type = str ,
193+ required = True ,
194+ help = "Chebi class name from the `chebai/preprocessing/datasets/chebi.py`" ,
167195 )
168196 parser .add_argument (
169- "new_directory " , type = str , help = "Path to the new directory structure "
197+ "--chebi_version " , type = int , required = True , help = "Chebi data version "
170198 )
171199 parser .add_argument (
172- "--split_file_path " ,
173- type = str ,
174- help = "Path to the CSV file with split configuration " ,
200+ "--single_class " ,
201+ type = int ,
202+ help = "The ID of the single class to predict " ,
175203 default = None ,
176204 )
177- parser .add_argument ("chebi_version" , type = int , help = "Data Version related to chebi" )
178205 args = parser .parse_args ()
179206
180- # main(args.old_directory, args.new_directory, args.split_file_path)
181-
182- # python migration_script.py path/to/old_directory path/to/new_directory --split_file_path path/to/split_configuration.csv
183- # python migration_script.py path/to/old_directory path/to/new_directory
207+ ChebiDataMigration (
208+ class_name = args .chebi_class ,
209+ chebi_version = args .chebi_version ,
210+ single_class = args .single_class ,
211+ ).migrate ()
0 commit comments