File tree Expand file tree Collapse file tree 3 files changed +18
-8
lines changed
Expand file tree Collapse file tree 3 files changed +18
-8
lines changed Original file line number Diff line number Diff line change 772772[Fm]
773773[Md]
774774[No]
775+ [HH1]
776+ [CH3-1]
Original file line number Diff line number Diff line change @@ -594,6 +594,10 @@ class ChEBIOver100SELFIES(ChEBIOverXSELFIES, ChEBIOver100):
594594 pass
595595
596596
597+ class ChEBIOver50SELFIES (ChEBIOverXSELFIES , ChEBIOver50 ):
598+ pass
599+
600+
597601class ChEBIOverXPartial (ChEBIOverX ):
598602 """Dataset that doesn't use the full ChEBI, but extracts are part of ChEBI"""
599603
Original file line number Diff line number Diff line change 11import os
22
33from pysmiles .read_smiles import _tokenize
4+ from rdkit import Chem
45from transformers import RobertaTokenizerFast
56import deepsmiles
67import selfies as sf
@@ -186,14 +187,17 @@ def _read_data(self, raw_data):
186187 try :
187188 tokenized = sf .split_selfies (sf .encoder (raw_data .strip (), strict = True ))
188189 tokenized = [self ._get_token_index (v ) for v in tokenized ]
189- except Exception as e :
190- print (f"could not process { raw_data } " )
191- # print(f'\t{e}')
192- self .error_count += 1
193- print (f"\t error count: { self .error_count } " )
194- tokenized = None
195- # if self.error_count > 20:
196- # raise Exception('Too many errors')
190+ except Exception :
191+ try :
192+ # resolve potential problems by rdkit normalisation
193+ smiles = Chem .MolToSmiles (Chem .MolFromSmiles (raw_data .strip ()))
194+ tokenized = sf .split_selfies (sf .encoder (smiles , strict = True ))
195+ tokenized = [self ._get_token_index (v ) for v in tokenized ]
196+ except Exception as e :
197+ print (f"SELFIES encoding failed: { e } " )
198+ self .error_count += 1
199+ print (f"\t error count: { self .error_count } " )
200+ tokenized = None
197201 return tokenized
198202
199203
You can’t perform that action at this time.
0 commit comments