Skip to content

Commit e0670a2

Browse files
kjappelbaumml-evspre-commit-ci[bot]MicPie
committed
feat: Add uris field for identifiers (#103)
* Add `uris` field for identifiers * Linting * update valdation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat: fix typo --------- Co-authored-by: Matthew Evans <git@ml-evs.science> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Michael Pieler <Michael.Pieler@Gmail.com>
1 parent af51931 commit e0670a2

File tree

2 files changed

+78
-1
lines changed

2 files changed

+78
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dev = [
3030
"pre-commit",
3131
"pydantic_yaml",
3232
"pytest",
33+
"pubchempy"
3334
]
3435

3536
# [project.optional-dependencies]

src/chemnlp/data_val/model.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Dict, List, Optional
22

3+
import pubchempy as pcp
4+
import requests
35
from pydantic import root_validator, validator
46
from pydantic_yaml import YamlModel, YamlStrEnum
57

@@ -19,6 +21,8 @@ class Identifier(YamlModel):
1921
"""Identifier information."""
2022

2123
id: str
24+
25+
"""A description of the field"""
2226
description: Optional[str]
2327
type: IdentifierEnum
2428
names: Optional[List[str]]
@@ -43,17 +47,80 @@ class ColumnTypes(YamlStrEnum):
4347
continuous = "continuous"
4448
categorical = "categorical"
4549
ordinal = "ordinal"
50+
boolean = "boolean"
4651

4752

4853
class Target(YamlModel):
    """Target information.

    Describes one target column of a dataset: its identifier, a
    human-readable description, units, column type, the names used when
    building prompts, and optional external references (URIs and PubChem
    assay IDs).

    Note that validating ``uris`` and ``pubchem_aids`` performs network
    requests.
    """

    id: str

    """An English description of the field"""
    description: str

    """The units of the field. None if unitless."""
    units: Optional[str]

    """The type of the field. Can be one of `continuous`, `categorical`, `ordinal`, `boolean`."""
    type: ColumnTypes

    """A list of names describing the field.

    Note that this will be used in building the prompts. Some examples for prompts:

    - Boolean variables

        - `Is <name> <identifier>?`
        - ```
        What molecules in the list are <name>?
        - <identifier_1>
        - <identifier_2>
        - <identifier_3>
        ```


    - Continuous variables

        - `What is <name> of <identifier>?`
        - ```
        What is the molecule with largest <name> in the following list?
        - <identifier_1>
        - <identifier_2>
        - <identifier_3>
        ```
    """
    names: List[str]

    """A URI or multiple (consistent) URIs for the field.

    Ideally this would be a link to an entry in an ontology or controlled
    vocabulary that can also provide a canonical description for the field.
    """
    uris: Optional[List[str]]

    """A PubChem assay ID or multiple (consistent) PubChem assay IDs.

    Make sure that the first assay ID is the primary assay ID.
    """
    pubchem_aids: Optional[List[int]]

    @validator("uris")
    def uris_resolves(cls, values):
        """Check that every URI in ``uris`` answers with HTTP 200.

        Args:
            values: The value of the ``uris`` field (a list of URI strings)
                or ``None``.

        Returns:
            The unchanged ``values``. The value must be returned because
            pydantic stores the validator's return value as the field value.

        Raises:
            ValueError: If a URI cannot be requested or does not return
                status code 200.
        """
        if values is not None:
            for uri in values:
                # Perform a request to the URI and check if it resolves.
                # A timeout keeps validation from hanging on a dead host.
                try:
                    response = requests.get(uri, timeout=30)
                except requests.exceptions.RequestException as err:
                    raise ValueError(f"URI {uri} does not resolve") from err
                if response.status_code != 200:
                    raise ValueError(f"URI {uri} does not resolve")
        return values

    @validator("pubchem_aids")
    def pubchem_assay_ids_resolve(cls, values):
        """Check that every PubChem assay ID can be looked up.

        Args:
            values: The value of the ``pubchem_aids`` field (a list of
                integer assay IDs) or ``None``.

        Returns:
            The unchanged ``values``. The value must be returned because
            pydantic stores the validator's return value as the field value.

        Raises:
            ValueError: If ``pcp.get_assays`` returns no assay for an ID.
        """
        if values is not None:
            for aid in values:
                assays = pcp.get_assays(aid)
                if len(assays) == 0:
                    raise ValueError(f"PubChem assay ID {aid} does not resolve")
        return values
123+
57124

58125
class Template(YamlModel):
59126
prompt: str
@@ -95,3 +162,12 @@ class Dataset(YamlModel):
95162
def num_points_must_be_positive(cls, v):
96163
if v < 0:
97164
raise ValueError("num_points must be positive")
165+
166+
@validator("links")
def links_must_resolve(cls, v):
    """Check that the URL of every link can be fetched.

    A non-200 response is tolerated when the response body contains
    ``"acs"`` or ``"sage"``.
    NOTE(review): presumably this is because those publisher sites reject
    automated requests - confirm.

    Args:
        v: The value of the ``links`` field (an iterable of objects with a
            ``url`` attribute) or ``None``.

    Returns:
        The unchanged ``v``. The value must be returned because pydantic
        stores the validator's return value as the field value.

    Raises:
        ValueError: If a link cannot be requested, or answers with a
            non-200 status and is not covered by the exception above.
    """
    if v is not None:
        for link in v:
            # A timeout keeps validation from hanging on a dead host.
            try:
                response = requests.get(link.url, timeout=30)
            except requests.exceptions.RequestException as err:
                raise ValueError(f"Link {link.url} does not resolve") from err
            if response.status_code != 200:
                if not (("acs" in response.text) or ("sage" in response.text)):
                    raise ValueError(f"Link {link.url} does not resolve")
    return v

0 commit comments

Comments
 (0)