Skip to content

Commit 392ea40

Browse files
author
sfluegel
committed
improve selfies preprocessing, add chebi50selfies class
1 parent 7e5f801 commit 392ea40

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

chebai/preprocessing/bin/selfies/tokens.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,3 +772,5 @@
772772
[Fm]
773773
[Md]
774774
[No]
775+
[HH1]
776+
[CH3-1]

chebai/preprocessing/datasets/chebi.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,10 @@ class ChEBIOver100SELFIES(ChEBIOverXSELFIES, ChEBIOver100):
594594
pass
595595

596596

597+
class ChEBIOver50SELFIES(ChEBIOverXSELFIES, ChEBIOver50):
598+
pass
599+
600+
597601
class ChEBIOverXPartial(ChEBIOverX):
598602
"""Dataset that doesn't use the full ChEBI, but extracts a part of ChEBI"""
599603

chebai/preprocessing/reader.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22

33
from pysmiles.read_smiles import _tokenize
4+
from rdkit import Chem
45
from transformers import RobertaTokenizerFast
56
import deepsmiles
67
import selfies as sf
@@ -186,14 +187,17 @@ def _read_data(self, raw_data):
186187
try:
187188
tokenized = sf.split_selfies(sf.encoder(raw_data.strip(), strict=True))
188189
tokenized = [self._get_token_index(v) for v in tokenized]
189-
except Exception as e:
190-
print(f"could not process {raw_data}")
191-
# print(f'\t{e}')
192-
self.error_count += 1
193-
print(f"\terror count: {self.error_count}")
194-
tokenized = None
195-
# if self.error_count > 20:
196-
# raise Exception('Too many errors')
190+
except Exception:
191+
try:
192+
# resolve potential problems via RDKit normalisation
193+
smiles = Chem.MolToSmiles(Chem.MolFromSmiles(raw_data.strip()))
194+
tokenized = sf.split_selfies(sf.encoder(smiles, strict=True))
195+
tokenized = [self._get_token_index(v) for v in tokenized]
196+
except Exception as e:
197+
print(f"SELFIES encoding failed: {e}")
198+
self.error_count += 1
199+
print(f"\terror count: {self.error_count}")
200+
tokenized = None
197201
return tokenized
198202

199203

0 commit comments

Comments
 (0)