Skip to content

Commit 392ea40

Browse files
author
sfluegel
committed
improve selfies preprocessing, add chebi50selfies class
1 parent 7e5f801 commit 392ea40

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

chebai/preprocessing/bin/selfies/tokens.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,3 +772,5 @@
772772
[Fm]
773773
[Md]
774774
[No]
775+
[HH1]
776+
[CH3-1]

chebai/preprocessing/datasets/chebi.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,10 @@ class ChEBIOver100SELFIES(ChEBIOverXSELFIES, ChEBIOver100):
594594
pass
595595

596596

597+
class ChEBIOver50SELFIES(ChEBIOverXSELFIES, ChEBIOver50):
598+
pass
599+
600+
597601
class ChEBIOverXPartial(ChEBIOverX):
598602
"""Dataset that doesn't use the full ChEBI, but extracts a part of ChEBI"""
599603

chebai/preprocessing/reader.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22

33
from pysmiles.read_smiles import _tokenize
4+
from rdkit import Chem
45
from transformers import RobertaTokenizerFast
56
import deepsmiles
67
import selfies as sf
@@ -186,14 +187,17 @@ def _read_data(self, raw_data):
186187
try:
187188
tokenized = sf.split_selfies(sf.encoder(raw_data.strip(), strict=True))
188189
tokenized = [self._get_token_index(v) for v in tokenized]
189-
except Exception as e:
190-
print(f"could not process {raw_data}")
191-
# print(f'\t{e}')
192-
self.error_count += 1
193-
print(f"\terror count: {self.error_count}")
194-
tokenized = None
195-
# if self.error_count > 20:
196-
# raise Exception('Too many errors')
190+
except Exception:
191+
try:
192+
# resolve potential problems via RDKit normalisation
193+
smiles = Chem.MolToSmiles(Chem.MolFromSmiles(raw_data.strip()))
194+
tokenized = sf.split_selfies(sf.encoder(smiles, strict=True))
195+
tokenized = [self._get_token_index(v) for v in tokenized]
196+
except Exception as e:
197+
print(f"SELFIES encoding failed: {e}")
198+
self.error_count += 1
199+
print(f"\terror count: {self.error_count}")
200+
tokenized = None
197201
return tokenized
198202

199203

0 commit comments

Comments
 (0)