argparser + fixes

aditya0by0 · aditya0by0 · commit d8e68cc9cb8f · 2024-07-01T21:42:30.000+02:00
diff --git a/chebai/preprocessing/migration/chebi_data_migration.py b/chebai/preprocessing/migration/chebi_data_migration.py
@@ -13,51 +13,70 @@ class ChebiDataMigration:
     __MODULE_PATH: str = "chebai.preprocessing.datasets.chebi"
     __DATA_ROOT_DIR: str = "data"
 
-    def __init__(self, chebi_version, class_name: str):
-        self._chebi_version: int = chebi_version
+    def __init__(self, class_name: str, chebi_version: int, single_class: int = None):
         # Chebi class instance according to new data structure
         self._chebi_cls: Type[_ChEBIDataExtractor] = self._dynamic_import_chebi_cls(
-            class_name, chebi_version
-        )
-        self._class_path: str = class_name
-
-    def _get_old_dir_structure(self):
-        base_dir = os.path.join(
-            self.__DATA_ROOT_DIR,
-            self._chebi_cls._name,
-            f"chebi_v{self._chebi_cls.chebi_version}",
+            class_name, chebi_version, single_class
         )
+        self._chebi_version: int = chebi_version
+        self._single_class: int = single_class
+        self._class_name: str = class_name
 
     @classmethod
-    def _dynamic_import_chebi_cls(cls, class_name: str, chebi_version: int):
+    def _dynamic_import_chebi_cls(
+        cls, class_name: str, chebi_version: int, single_class: int
+    ) -> Type[_ChEBIDataExtractor]:
         class_name = class_name.strip()
         module = __import__(cls.__MODULE_PATH, fromlist=[class_name])
         _class = getattr(module, class_name)
-        return _class({"chebi_version": chebi_version})
+        return _class(**{"chebi_version": chebi_version, "single_class": single_class})
 
     def migrate(self):
         os.makedirs(self._chebi_cls.base_dir, exist_ok=True)
+        print("Migration started..................")
+        self._migrate_old_raw_data()
         self._migrate_old_processed_data()
+        print("Migration completed..................")
 
     def _migrate_old_raw_data(self):
+        print("-" * 50)
+        print("Migrating old raw Data.....................")
+
         self._copy_file(self._old_raw_dir, self._chebi_cls.raw_dir, "chebi.obo")
         self._copy_file(
             self._old_raw_dir, self._chebi_cls.processed_dir_main, "classes.txt"
         )
+
         old_splits_file_names = {
             "train": "train.pkl",
             "validation": "validation.pkl",
             "test": "test.pkl",
         }
-        data_df, split_ass_df = self._combine_splits(
+        data_file_path = os.path.join(self._chebi_cls.processed_dir_main, "data.pkl")
+        if os.path.isfile(data_file_path):
+            print(f"File {data_file_path} already exists in new data-folder structure")
+            return
+
+        data_df, split_ass_df = self._combine_pkl_splits(
             self._old_raw_dir, old_splits_file_names
         )
-        data_df.to_pickle(os.path.join(self._chebi_cls.processed_dir_main, "data.pkl"))
-        split_ass_df.to_csv(
-            os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
-        )
+
+        data_df.to_pickle(data_file_path)
+        print(f"File {data_file_path} saved to new data-folder structure")
+
+        split_file = os.path.join(self._chebi_cls.processed_dir_main, "splits.csv")
+        split_ass_df.to_csv(split_file)
+        print(f"File {split_file} saved to new data-folder structure")
 
     def _migrate_old_processed_data(self):
+        print("-" * 50)
+        print("Migrating old processed data.....................")
+
+        data_file_path = os.path.join(self._chebi_cls.processed_dir, "data.pt")
+        if os.path.isfile(data_file_path):
+            print(f"File {data_file_path} already exists in new data-folder structure")
+            return
+
         old_splits_file_names = {
             "train": "train.pt",
             "validation": "validation.pt",
@@ -67,13 +86,16 @@ def _migrate_old_processed_data(self):
         data_df = self._combine_pt_splits(
             self._old_processed_dir, old_splits_file_names
         )
-        torch.save(data_df, self._chebi_cls.processed_dir)
+
+        torch.save(data_df, data_file_path)
+        print(f"File {data_file_path} saved to new data-folder structure")
 
     def _combine_pt_splits(
         self, old_dir: str, old_splits_file_names: Dict[str, str]
     ) -> pd.DataFrame:
         self._check_if_old_splits_exists(old_dir, old_splits_file_names)
 
+        print("Combining `.pt` splits...")
         df_list: List[pd.DataFrame] = []
         for split, file_name in old_splits_file_names.items():
             file_path = os.path.join(old_dir, file_name)
@@ -90,14 +112,15 @@ def _combine_pkl_splits(
         df_list: List[pd.DataFrame] = []
         split_assignment_list: List[pd.DataFrame] = []
 
+        print("Combining `.pkl` splits...")
         for split, file_name in old_splits_file_names.items():
             file_path = os.path.join(old_dir, file_name)
-            file_df = pd.DataFrame(self._chebi_cls._load_data_from_file(file_path))
+            file_df = pd.DataFrame(self._chebi_cls._load_data_from_file(path=file_path))
             file_df["split"] = split  # Assign the split label to the DataFrame
             df_list.append(file_df)
 
             # Create split assignment for the current DataFrame
-            split_assignment = pd.DataFrame({"id": file_df["id"], "split": split})
+            split_assignment = pd.DataFrame({"id": file_df["ident"], "split": split})
             split_assignment_list.append(split_assignment)
 
         # Concatenate all dataframes and split assignments
@@ -137,7 +160,9 @@ def _copy_file(old_file_dir, new_file_dir, file_name):
     @property
     def _old_base_dir(self):
         return os.path.join(
-            "data", self._chebi_cls._name, f"chebi_v{self._chebi_cls.chebi_version}"
+            self.__DATA_ROOT_DIR,
+            self._chebi_cls._name,
+            f"chebi_v{self._chebi_cls.chebi_version}",
         )
 
     @property
@@ -163,21 +188,24 @@ def _old_raw_dir(self):
         description="Migrate ChEBI dataset to new structure and handle splits."
     )
     parser.add_argument(
-        "old_directory", type=str, help="Path to the old directory structure"
+        "--chebi_class",
+        type=str,
+        required=True,
+        help="Chebi class name from the `chebai/preprocessing/datasets/chebi.py`",
     )
     parser.add_argument(
-        "new_directory", type=str, help="Path to the new directory structure"
+        "--chebi_version", type=int, required=True, help="Chebi data version"
     )
     parser.add_argument(
-        "--split_file_path",
-        type=str,
-        help="Path to the CSV file with split configuration",
+        "--single_class",
+        type=int,
+        help="The ID of the single class to predict",
         default=None,
     )
-    parser.add_argument("chebi_version", type=int, help="Data Version related to chebi")
     args = parser.parse_args()
 
-    # main(args.old_directory, args.new_directory, args.split_file_path)
-
-# python migration_script.py path/to/old_directory path/to/new_directory --split_file_path path/to/split_configuration.csv
-# python migration_script.py path/to/old_directory path/to/new_directory
+    ChebiDataMigration(
+        class_name=args.chebi_class,
+        chebi_version=args.chebi_version,
+        single_class=args.single_class,
+    ).migrate()