Skip to content

Commit bb08626

Browse files
committed
run a medcat model
1 parent a71adec commit bb08626

File tree

2 files changed

+309
-0
lines changed

2 files changed

+309
-0
lines changed

medcat/3_run_model/ReadMe.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Running a model to annotate text

medcat/3_run_model/run_model.ipynb

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
"import os\n",
"# Cap each numerical back-end at one thread. These are set before the\n",
"# libraries below are imported so the limits are in place when they load.\n",
"os.environ['MKL_NUM_THREADS'] = '1'  # was 'MKL_NUM_THREAD', which MKL does not read\n",
"os.environ['NUMEXPR_NUM_THREADS'] = '1'\n",
"os.environ['OMP_NUM_THREADS'] = '1'\n",
13+
"\n",
14+
"from medcat.cat import CAT\n",
15+
"from medcat.vocab import Vocab\n",
16+
"from medcat.cdb import CDB\n",
17+
"from tokenizers import ByteLevelBPETokenizer\n",
18+
"\n",
19+
"import pandas as pd\n",
20+
"import numpy as np\n",
21+
"import json\n",
22+
"from tqdm.notebook import tqdm"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": null,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"import warnings\n",
32+
"warnings.filterwarnings(\"ignore\", category=FutureWarning)\n"
33+
]
34+
},
35+
{
36+
"cell_type": "markdown",
37+
"metadata": {},
38+
"source": [
39+
"# Paths and Config"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"data_dir = './data/'\n",
49+
"\n",
50+
"data_path = os.path.join(data_dir, \"<data_file>\") # Add your data file here\n",
51+
"doc_id_column = \"id\"\n",
52+
"doc_text_column = \"description\"\n",
53+
"\n",
54+
"model_dir = '../../models/'\n",
55+
"\n",
56+
"modelpack = '' # enter your model here. Should the the output of trained 'output_modelpack'.\n",
57+
"model_pack_path = os.path.join(model_dir, modelpack)\n",
58+
"\n",
59+
"filter_path = None\n",
60+
"\n",
61+
"ann_folder_path = os.path.join(data_dir, f'annotated_docs')\n",
62+
"if not os.path.exists(ann_folder_path):\n",
63+
" os.makedirs(ann_folder_path)\n",
64+
" \n",
65+
"save_path_annotations_per_doc = os.path.join(ann_folder_path, \"<output_filename>.json\")\n"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"metadata": {},
71+
"source": [
72+
"# Load MedCAT model"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": null,
78+
"metadata": {},
79+
"outputs": [],
80+
"source": [
81+
"# Create CAT - the main class from medcat used for concept annotation\n",
82+
"cat = CAT.load_model_pack(model_pack_path)"
83+
]
84+
},
85+
{
86+
"cell_type": "markdown",
87+
"metadata": {},
88+
"source": [
89+
"# Annotate"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": null,
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
"# Set snomed filter if needed. `filter_path` is defined in the config cell;\n",
"# leaving it as None skips filtering entirely.\n",
"snomed_filter = None\n",
"if filter_path is not None:\n",
"    with open(filter_path) as filter_file:\n",
"        snomed_filter = json.load(filter_file)\n",
"    # NOTE(review): assumes the JSON holds a collection of CUIs and that the\n",
"    # installed MedCAT restricts linking via config.linking['filters']['cuis'] - confirm.\n",
"    cat.config.linking['filters']['cuis'] = set(snomed_filter)\n"
100+
]
101+
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": null,
105+
"metadata": {},
106+
"outputs": [],
107+
"source": [
108+
"cat.cdb.print_stats()"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": null,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"df = pd.read_csv(data_path)[[doc_id_column, doc_text_column]]\n"
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": null,
123+
"metadata": {
124+
"scrolled": true
125+
},
126+
"outputs": [],
127+
"source": [
"%%time\n",
"# Annotate every document with MedCAT multiprocessing, `batch_size` documents at a time.\n",
"batch_size = 1000\n",
"n_procs = 2  # Update the number of processors depending on your machine.\n",
"min_text_len = 10  # Texts with this many characters or fewer are submitted as empty text.\n",
"\n",
"doc_ids = df[doc_id_column].tolist()\n",
"doc_texts = [str(text) for text in df[doc_text_column]]\n",
"\n",
"# Maps document id -> annotations. Duplicate ids would overwrite each other.\n",
"# NOTE(review): relies on cat.multiprocessing returning a mapping {doc_id: annotations} - confirm\n",
"# against the installed MedCAT version.\n",
"results = {}\n",
"for start in range(0, len(doc_ids), batch_size):\n",
"    stop = min(start + batch_size, len(doc_ids))\n",
"    batch = [\n",
"        (doc_id, text if len(text) > min_text_len else \"\")\n",
"        for doc_id, text in zip(doc_ids[start:stop], doc_texts[start:stop])\n",
"    ]\n",
"    # `addl_info` takes names of CDB info maps, not a concept filter, so the\n",
"    # SNOMED filter is not passed here; filters belong in cat.config.linking.\n",
"    results.update(cat.multiprocessing(batch, nproc=n_procs))\n",
"    print(\"Done: {} - rows\".format(stop))"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": null,
155+
"metadata": {},
156+
"outputs": [],
157+
"source": [
"# Double check nothing is missed: short texts are still submitted (with an empty\n",
"# payload), so there must be exactly one result per row of the input frame.\n",
"assert len(results) == len(df), f\"expected {len(df)} results, got {len(results)}\""
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": null,
165+
"metadata": {},
166+
"outputs": [],
167+
"source": [
"# Save the multiprocessing results to file; the context manager closes\n",
"# (and therefore flushes) the file once the dump completes.\n",
"with open(save_path_annotations_per_doc, \"w\") as out_file:\n",
"    json.dump(results, out_file)"
170+
]
171+
},
172+
{
173+
"cell_type": "markdown",
174+
"metadata": {},
175+
"source": [
176+
"### Inspect the model"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": null,
182+
"metadata": {},
183+
"outputs": [],
184+
"source": [
185+
"text = \"He was diagnosed with heart failure\"\n",
186+
"doc = cat(text)\n",
187+
"print(doc.ents)"
188+
]
189+
},
190+
{
191+
"cell_type": "code",
192+
"execution_count": null,
193+
"metadata": {},
194+
"outputs": [],
195+
"source": [
"# Display Snomed codes\n",
"# The concept database is reached through the loaded model (`cat.cdb`);\n",
"# .get() prints None instead of raising when a CUI has no preferred name.\n",
"for ent in doc.ents:\n",
"    print(ent, \" - \", ent._.cui, \" - \", cat.cdb.cui2preferred_name.get(ent._.cui))"
199+
]
200+
},
201+
{
202+
"cell_type": "code",
203+
"execution_count": null,
204+
"metadata": {},
205+
"outputs": [],
206+
"source": [
"# To show semantic types for each entity\n",
"# The concept database is reached through the loaded model (`cat.cdb`).\n",
"for ent in doc.ents:\n",
"    print(ent, \" - \", cat.cdb.cui2type_ids.get(ent._.cui))"
210+
]
211+
},
212+
{
213+
"cell_type": "code",
214+
"execution_count": null,
215+
"metadata": {},
216+
"outputs": [],
217+
"source": [
218+
"# Display\n",
219+
"from spacy import displacy\n",
220+
"displacy.render(doc, style='ent', jupyter=True)"
221+
]
222+
},
223+
{
224+
"cell_type": "markdown",
225+
"metadata": {},
226+
"source": [
227+
"# Alternative approach"
228+
]
229+
},
230+
{
231+
"cell_type": "code",
232+
"execution_count": null,
233+
"metadata": {
234+
"scrolled": true
235+
},
236+
"outputs": [],
237+
"source": [
"# Single-process alternative: annotate the dataset one row at a time instead of using multiprocessing.\n",
"\n",
"print(f\"Len of df: {len(df)}\")\n",
"\n",
"docs = {}\n",
"for row_idx, record in tqdm(df.iterrows(), total=len(df)):\n",
"    doc_id = record[doc_id_column]\n",
"    doc_text = str(record[doc_text_column])\n",
"    # Texts of 10 characters or fewer are not annotated and get an empty list.\n",
"    docs[doc_id] = cat.get_entities(doc_text) if len(doc_text) > 10 else []"
251+
]
252+
},
253+
{
254+
"cell_type": "code",
255+
"execution_count": null,
256+
"metadata": {},
257+
"outputs": [],
258+
"source": [
259+
"cat.cdb.print_stats()"
260+
]
261+
},
262+
{
263+
"cell_type": "code",
264+
"execution_count": null,
265+
"metadata": {},
266+
"outputs": [],
267+
"source": [
"# Save to file (docs maps each document id to its annotations).\n",
"# NOTE: this is the same path the multiprocessing results are written to,\n",
"# so running both approaches overwrites the earlier output.\n",
"with open(save_path_annotations_per_doc, \"w\") as out_file:\n",
"    json.dump(docs, out_file)\n"
270+
]
271+
},
272+
{
273+
"cell_type": "code",
274+
"execution_count": null,
275+
"metadata": {},
276+
"outputs": [],
277+
"source": []
278+
},
279+
{
280+
"cell_type": "code",
281+
"execution_count": null,
282+
"metadata": {},
283+
"outputs": [],
284+
"source": []
285+
}
286+
],
287+
"metadata": {
288+
"kernelspec": {
289+
"display_name": "Python 3 (ipykernel)",
290+
"language": "python",
291+
"name": "python3"
292+
},
293+
"language_info": {
294+
"codemirror_mode": {
295+
"name": "ipython",
296+
"version": 3
297+
},
298+
"file_extension": ".py",
299+
"mimetype": "text/x-python",
300+
"name": "python",
301+
"nbconvert_exporter": "python",
302+
"pygments_lexer": "ipython3",
303+
"version": "3.7.3"
304+
}
305+
},
306+
"nbformat": 4,
307+
"nbformat_minor": 4
308+
}

0 commit comments

Comments
 (0)