11from typing import Dict , List , Optional
22
3+ import pubchempy as pcp
4+ import requests
35from pydantic import root_validator , validator
46from pydantic_yaml import YamlModel , YamlStrEnum
57
@@ -19,6 +21,8 @@ class Identifier(YamlModel):
1921 """Identifier information."""
2022
2123 id : str
24+
25+ """A description of the field"""
2226 description : Optional [str ]
2327 type : IdentifierEnum
2428 names : Optional [List [str ]]
@@ -43,17 +47,80 @@ class ColumnTypes(YamlStrEnum):
4347 continuous = "continuous"
4448 categorical = "categorical"
4549 ordinal = "ordinal"
50+ boolean = "boolean"
4651
4752
class Target(YamlModel):
    """Target information.

    Describes one target column of a dataset: its identifier, a
    human-readable description, units, value type, the names used when
    building prompts, and optional external references (URIs and PubChem
    assay IDs). The external references are checked over the network when
    the model is validated.
    """

    id: str

    """An English description of the field."""
    description: str

    """The units of the field. None if unitless."""
    units: Optional[str]

    """The type of the field. Can be one of `continuous`, `categorical`, `ordinal`, `boolean`."""
    type: ColumnTypes

    """A list of names describing the field.

    Note that this will be used in building the prompts. Some examples of prompts:

    - Boolean variables

        - `Is <name> <identifier>?`
        - ```
        What molecules in the list are <name>?
        - <identifier_1>
        - <identifier_2>
        - <identifier_3>
        ```


    - Continuous variables

        - `What is <name> of <identifier>?`
        - ```
        What is the molecule with largest <name> in the following list?
        - <identifier_1>
        - <identifier_2>
        - <identifier_3>
        ```
    """
    names: List[str]

    """A URI or multiple (consistent) URIs for the field.

    Ideally this would be a link to an entry in an ontology or controlled
    vocabulary that can also provide a canonical description for the field.
    """
    uris: Optional[List[str]]

    """One PubChem assay ID or multiple (consistent) PubChem assay IDs.

    Make sure that the first assay ID is the primary assay ID.
    """
    pubchem_aids: Optional[List[int]]

    @validator("uris")
    def uris_resolves(cls, values):
        """Check that every URI in ``uris`` answers with HTTP 200.

        Args:
            values: The list of URIs given for the field, or None.

        Returns:
            The unchanged list, so that pydantic keeps the field value.

        Raises:
            ValueError: If a URI cannot be reached or does not return 200.
        """
        if values is not None:
            for uri in values:
                try:
                    # A timeout keeps validation from hanging on a dead host.
                    response = requests.get(uri, timeout=10)
                except requests.RequestException as err:
                    # Report transport failures as ValueError so they are
                    # surfaced as validation errors.
                    raise ValueError(f"URI {uri} does not resolve") from err
                if response.status_code != 200:
                    raise ValueError(f"URI {uri} does not resolve")
        # A validator must return the value; returning nothing would set
        # the field to None.
        return values

    @validator("pubchem_aids")
    def pubchem_assay_ids_resolve(cls, values):
        """Check that every PubChem assay ID can be looked up via PubChemPy.

        Args:
            values: The list of PubChem assay IDs, or None.

        Returns:
            The unchanged list, so that pydantic keeps the field value.

        Raises:
            ValueError: If the lookup for an assay ID returns no assays.
        """
        if values is not None:
            for aid in values:
                assays = pcp.get_assays(aid)
                if len(assays) == 0:
                    raise ValueError(f"PubChem assay ID {aid} does not resolve")
        return values

57124
58125class Template (YamlModel ):
59126 prompt : str
@@ -95,3 +162,12 @@ class Dataset(YamlModel):
def num_points_must_be_positive(cls, v):
    """Reject a negative ``num_points`` value.

    Presumably registered as a pydantic validator on ``Dataset`` (the
    decorator lies outside the visible hunk -- confirm).

    Args:
        v: The candidate number of points.

    Returns:
        ``v`` unchanged; a validator that returns nothing would replace
        the field value with None.

    Raises:
        ValueError: If ``v`` is negative.
    """
    # NOTE(review): zero passes this check although the message says
    # "positive" -- confirm whether 0 should be rejected as well.
    if v < 0:
        raise ValueError("num_points must be positive")
    return v
165+
@validator("links")
def links_must_resolve(cls, v):
    """Check that the URL of every entry in ``links`` can be fetched.

    Args:
        v: The list of link objects (each read through its ``url``
            attribute), or None.

    Returns:
        ``v`` unchanged, so that pydantic keeps the field value.

    Raises:
        ValueError: If a link cannot be reached, or answers with a
            non-200 status and its body mentions neither "acs" nor "sage".
    """
    if v is not None:
        for link in v:
            try:
                # A timeout keeps validation from hanging on a dead host.
                response = requests.get(link.url, timeout=10)
            except requests.RequestException as err:
                # Report transport failures as ValueError so they are
                # surfaced as validation errors.
                raise ValueError(f"Link {link.url} does not resolve") from err
            if response.status_code == 200:
                continue
            # Non-200 answers are tolerated when the body mentions "acs" or
            # "sage" -- presumably publishers that reject automated
            # requests; confirm.
            if ("acs" in response.text) or ("sage" in response.text):
                continue
            raise ValueError(f"Link {link.url} does not resolve")
    return v
0 commit comments