greglandrum · knalice · Jan 2, 2023
diff --git a/basic/tutorial_extension/new_rdkit_nodes/utils.py b/basic/tutorial_extension/new_rdkit_nodes/utils.py
@@ -1,6 +1,8 @@
+import logging
 import knime_extension as knext
 from rdkit import Chem
-import knime.types.chemistry as cet  # To work with and compare against chemical data types like SMILES,...
+import knime.types.chemistry as cet
+LOGGER = logging.getLogger(__name__)
 
 category = knext.category(
     '/community/rdkit',
@@ -23,14 +25,16 @@
 def column_is_convertible_to_mol(column: knext.Column):
     c_type = column.ktype
     allowedTypes = smilesTypes + ctabTypes + rdkitTypes
-
     return c_type in allowedTypes
 
+def column_is_integer(column: knext.Column):
+    return (column.ktype == knext.int32() or column.ktype == knext.int64())
 
 def convert_column_to_rdkit_mol(df,
                                 molecule_column_type,
                                 molecule_column_param,
                                 sanitizeOnParse=True):
+    # return
     if molecule_column_type in rdkitTypes:
         LOGGER.warning("rdkit mols detected")
         mols = df[molecule_column_param]
@@ -46,4 +50,4 @@ def convert_column_to_rdkit_mol(df,
         ]
     else:
         raise ValueError('unrecognized molecule column type')
-    return mols
+    return mols
diff --git a/basic/tutorial_extension/new_rdkit_nodes/visualize_rdkitfp_bits.py b/basic/tutorial_extension/new_rdkit_nodes/visualize_rdkitfp_bits.py
@@ -44,21 +44,20 @@
 
 """
 Part of the RDKit Python extension. Node 'Visualize RDKit fingerprint bits'.
-
 @author Alice Krebs, KNIME GmbH, Konstanz, Germany
-@author Steffen Fissler, KNIME GmbH, Konstanz, Germany
+@author Greg Landrum, ETH Zurich, Switzerland
 """
 
 import logging
 from turtle import clear
 import knime_extension as knext
-from rdkit import Chem
+import knime.types.chemistry as cet
+from new_rdkit_nodes import utils
 from new_rdkit_nodes.utils import category
-# from rdkit.Chem import rdMolDescriptors
+from rdkit import Chem
 from rdkit.Chem.Draw import IPythonConsole
 from rdkit.Chem import Draw
 
-# from rdkit.Chem import PandasTools
 LOGGER = logging.getLogger(__name__)
 IPythonConsole.UninstallIPythonRenderer()
 
@@ -67,87 +66,125 @@
     name="Visualize bits of RDKit fingerprints",
     node_type=knext.NodeType.MANIPULATOR,
     icon_path="icon.png",
-    category=category)
-@knext.input_table(name="Input table 1", description="Input table 1 with molecules")
-@knext.input_table(name="Input table 2", description="Input table 2 with RDKit fingerprint bits")
+    category=category
+    )
+@knext.input_table(
+    name="Input table 1", 
+    description="Input table 1 containing molecules"
+    )
+@knext.input_table(
+    name="Input table 2", 
+    description="Input table 2 containing a list of RDKit fingerprint bits"
+    )
 @knext.output_table(
     name="Highlighted bits",
-    description="Output tables including images of the highlighted bits",
+    description="Output table including images of the highlighted bits"
 )
+
 class visualizerdkitfpbits:
     """
-    This node has a description, and I will change it once I figured out the code...
-    """
+    This node takes molecules as input, converts them to RDKit molecules if they are not already, calculates the RDKit
+    fingerprint and collects the bit info. To calculate the fingerprint, the user specifies the minimum and maximum path
+    length. Once the fingerprint bits and the corresponding bit info have been calculated, each bit is checked against the
+    user-provided bit list. If it is contained, an SVG is returned; if not, a missing value is returned. One column is
+    appended to the input table with the molecules (port 1) for each item in the bit list (port 2).
 
+    """
+# define config window
+    # get user input on fingerprint size aka number of bits, minimum and maximum path length. 
     fp_size = knext.IntParameter(
         "size of fingerprint",
         "Define the fingerprint size aka number of bits",
         2048,
-        min_value=0,
-    )
-    # min_path = knext.IntParameter("minimum path length", "Define the path length", 2, min_value=0)
+        min_value=0
+        )
+
+    min_path = knext.IntParameter(
+        "minimum path length", 
+        "Define the path length", 
+        2, 
+        min_value=0
+        )
+
     max_path = knext.IntParameter(
-        "maximum path length", "Define the path length", 2, min_value=0
-    )
-
-    # def is_molecule(column):  # Filter columns visible in the column_param for chemistry type; column_filter=is_molecule
-    #     return (
-    #         column.ktype == knext.smiles()
-    #         or column.ktype == knext.smarts()
-    #         or column.ktype == knext.sdf()
-    #     )
-
-    molecule_column = knext.ColumnParameter(
+        "maximum path length", 
+        "Define the path length", 
+        7, 
+        min_value=0
+        )
+
+    # select molecule column from input port 0 and the bit column from port 1
+    molecule_column_param = knext.ColumnParameter(
         label="Molecule column",
-        description="Choose the column from the first input table containing the molecules",
+        description=
+        "Select the molecule column. The column has to be SMILES, SDF, or RDKit molecule.",
         port_index=0,
+        # column_filter=utils.column_is_convertible_to_mol,
+        include_row_key=False,
+        include_none_column=False,
     )
+
     bits_column = knext.ColumnParameter(
         label="Bits column",
         description="Choose the column from the second input table containing the bits as integer",
         port_index=1,
+        # column_filter=utils.column_is_integer,
+        include_row_key=False,
+        include_none_column=False
     )
 
-    a = 0  # the number of columns we want to add
+    a = 0  
 
+# define the schema of the node output table
     def configure(
-        self, configure_context, input_schema_1: knext.Schema, input_schema_2
-    ):  # STEFFEN NERVEN wie man a setzt!!!
+        self, configure_context, input_schema_1: knext.Schema, input_schema_2: knext.Schema
+    ): 
         for i in range(self.a):
-            input_schema_1 = input_schema_1.append(knext.Column(knext.string()), "test")
-            # input_schema_1 = input_schema_1.append(knext.Column(knext.string(), f"column{self.a}"))
+            input_schema_1 = input_schema_1.append(knext.Column(knext.string()), "test")            
         return input_schema_1
 
+
+# node execution: build the output table with one added column per fingerprint bit
+    # def execute(self, exec_context, input_1, input_2):
+    #     return input_1
+
     def execute(self, exec_context, input_1, input_2):
+        if self.molecule_column_param is None:
+            raise AttributeError(
+                "Molecule column was not selected in configuration dialog.")
+
+        if self.bits_column is None:
+            raise AttributeError(
+                "Bits column was not selected in configuration dialog.")
+
+
         a = len(self.bits_column)
 
+        # make Pandas dataframes
         input_1_pandas = input_1.to_pandas()
         input_2_pandas = input_2.to_pandas()
+        output_table_1 = input_1_pandas.copy()
 
-        # define the molecules from input table 1
-        # mols = [Chem.MolFromSmiles(smi) for smi in input_1_pandas[self.molecule_column]]
-
-        mols = []
-        for smi in input_1_pandas[self.molecule_column]:
-            mols.append(Chem.MolFromSmiles(smi))
-
-        # LOGGER.warning(mols)
-        # LOGGER.warning(type(mols))
+        # Prepare mols: if the input column already holds RDKit molecules, use them;
+        # if it holds SMILES or SDF, convert them to RDKit molecules
+        molecule_column_type = input_1.schema[self.molecule_column_param].ktype
+        df = input_1.to_pandas()
+        mols = utils.convert_column_to_rdkit_mol(df,
+                                                 molecule_column_type,
+                                                 self.molecule_column_param,
+                                                 sanitizeOnParse=False)
 
         # create list of FP bits from input table 2
         fp_ids = input_2_pandas[self.bits_column]
 
         cols = [None] * len(fp_ids)
         for i in range(len(cols)):
             cols[i] = []
-
-        # defining the output table
-        output_table_1 = input_1_pandas.copy()
-
+
         for mol in mols:
-            rdkbi = {}  # defining a dictionary
+            rdkbi = {}  
             fp = Chem.RDKFingerprint(
-                mol, maxPath=self.max_path, fpSize=self.fp_size, bitInfo=rdkbi
+                mol, minPath=self.min_path, maxPath=self.max_path, fpSize=self.fp_size, bitInfo=rdkbi
             )  # calculate fingerprint with user-defined path length and nr of bits aka fp size
             Chem.Kekulize(mol)  # kekulize molecules
             for i, idx in enumerate(
@@ -160,8 +197,9 @@ def execute(self, exec_context, input_1, input_2):
                         cols[i].append(None)
                 else:
                     cols[i].append(None)
-
+        
         for i, idx in enumerate(fp_ids):
             output_table_1[f"bit{idx}"] = cols[i]
 
+        # return knext.Table.from_pandas(input_1_pandas)
         return knext.Table.from_pandas(output_table_1)