run a medcat model

antsh3k · antsh3k · commit bb08626a6958 · 2022-06-23T19:21:56.000+01:00
diff --git a/medcat/3_run_model/ReadMe.md b/medcat/3_run_model/ReadMe.md
@@ -0,0 +1 @@
+# Running a model to annotate text
diff --git a/medcat/3_run_model/run_model.ipynb b/medcat/3_run_model/run_model.ipynb
@@ -0,0 +1,308 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "# Cap the numerical back-ends at one thread each; set before the library imports below\n",
+    "os.environ['MKL_NUM_THREADS'] = '1'\n",
+    "os.environ['NUMEXPR_NUM_THREADS'] = '1'\n",
+    "os.environ['OMP_NUM_THREADS'] = '1'\n",
+    "\n",
+    "from medcat.cat import CAT\n",
+    "from medcat.vocab import Vocab\n",
+    "from medcat.cdb import CDB\n",
+    "from tokenizers import ByteLevelBPETokenizer\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import json\n",
+    "from tqdm.notebook import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "# Silence FutureWarning messages for the rest of the session\n",
+    "warnings.simplefilter(\"ignore\", category=FutureWarning)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Paths and Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_dir = './data/'\n",
+    "\n",
+    "data_path = os.path.join(data_dir, \"<data_file>\")  # Add your data file here\n",
+    "doc_id_column = \"id\"  # CSV column holding the document identifier\n",
+    "doc_text_column = \"description\"  # CSV column holding the free text to annotate\n",
+    "\n",
+    "model_dir = '../../models/'\n",
+    "\n",
+    "modelpack = ''  # Enter your model pack here. It should be the output of the trained 'output_modelpack'.\n",
+    "model_pack_path = os.path.join(model_dir, modelpack)\n",
+    "\n",
+    "filter_path = None\n",
+    "\n",
+    "# Output folder for the annotations; created if missing, reused if it already exists\n",
+    "ann_folder_path = os.path.join(data_dir, 'annotated_docs')\n",
+    "os.makedirs(ann_folder_path, exist_ok=True)\n",
+    "\n",
+    "save_path_annotations_per_doc = os.path.join(ann_folder_path, \"<output_filename>.json\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load MedCAT model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create CAT - the main class from medcat used for concept annotation.\n",
+    "# The model is loaded from `model_pack_path`, which is built in the config cell.\n",
+    "cat = CAT.load_model_pack(model_pack_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Annotate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set snomed filter if needed: it is read from `filter_path` (see the config cell).\n",
+    "# Leave `filter_path = None` to run without a filter.\n",
+    "snomed_filter = None\n",
+    "if filter_path is not None:\n",
+    "    with open(filter_path) as filter_file:\n",
+    "        snomed_filter = json.load(filter_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print the statistics reported by the loaded model's concept database (cat.cdb)\n",
+    "cat.cdb.print_stats()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the input CSV, keeping just the document-id and document-text columns\n",
+    "wanted_columns = [doc_id_column, doc_text_column]\n",
+    "df = pd.read_csv(data_path).loc[:, wanted_columns]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "batch_size = 1000\n",
+    "batch = []\n",
+    "cnt = 0  # number of batches sent to the model\n",
+    "results = []\n",
+    "skipped_docs = []  # ids of documents whose text was too short to annotate\n",
+    "n_rows = len(df)\n",
+    "# Forward the filter only when one was loaded; otherwise rely on the library default.\n",
+    "# NOTE(review): confirm that `addl_info` is the right parameter for a concept filter.\n",
+    "mp_kwargs = {} if snomed_filter is None else {\"addl_info\": snomed_filter}\n",
+    "\n",
+    "# Count rows by position so the last-row check does not depend on the index labels of df\n",
+    "for pos, (row_label, row) in enumerate(df.iterrows(), start=1):\n",
+    "    text = row[doc_text_column]\n",
+    "    # Skip text of 10 characters or fewer (a NaN value becomes 'nan' under str() and is skipped too)\n",
+    "    if len(str(text)) > 10:\n",
+    "        batch.append((row[doc_id_column], text))\n",
+    "    else:\n",
+    "        skipped_docs.append(row[doc_id_column])\n",
+    "\n",
+    "    # Flush once the batch is full or the last row has been read; never send an empty batch\n",
+    "    if batch and (len(batch) >= batch_size or pos == n_rows):\n",
+    "        # Update the number of processors depending on your machine.\n",
+    "        result = cat.multiprocessing(batch, nproc=2, **mp_kwargs)\n",
+    "        # NOTE(review): extend() keeps only the keys if `result` is a dict - confirm the return type.\n",
+    "        results.extend(result)\n",
+    "        cnt += 1\n",
+    "        print(\"Done: {} - rows\".format(pos))\n",
+    "\n",
+    "        # Reset the batch\n",
+    "        batch = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Double check nothing is missed: annotated rows plus skipped rows must add up to len(df).\n",
+    "# NOTE(review): `skipped_docs` must be defined by an earlier cell for this check to run.\n",
+    "assert len(results)+len(skipped_docs) == len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the annotations collected in `results` to file.\n",
+    "# The `with` block closes the file, so the JSON is fully flushed to disk.\n",
+    "with open(save_path_annotations_per_doc, \"w\") as out_file:\n",
+    "    json.dump(results, out_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Inspect the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the model on one example sentence and print the entities found in it\n",
+    "text = \"He was diagnosed with heart failure\"\n",
+    "doc = cat(text)\n",
+    "print(doc.ents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display Snomed codes: entity, its CUI and the preferred name from the model's CDB.\n",
+    "# The CDB is reached through `cat.cdb`; .get() prints None when a CUI has no preferred name.\n",
+    "for ent in doc.ents:\n",
+    "    print(ent, \" - \", ent._.cui, \" - \", cat.cdb.cui2preferred_name.get(ent._.cui))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To show semantic types for each entity (looked up in the model's CDB via `cat.cdb`)\n",
+    "for ent in doc.ents:\n",
+    "    print(ent, \" - \", cat.cdb.cui2type_ids.get(ent._.cui))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display: render the entities of `doc` inline in the notebook using spaCy's displacy\n",
+    "from spacy import displacy\n",
+    "displacy.render(doc, style='ent', jupyter=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Alternative approach"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Single-process variant: walks the dataset row by row instead of using multiprocessing.\n",
+    "\n",
+    "docs = {}\n",
+    "print(f\"Len of df: {len(df)}\")\n",
+    "\n",
+    "for row_label, row in tqdm(df.iterrows(), total=df.shape[0]):\n",
+    "    doc_id = row[doc_id_column]\n",
+    "    text = str(row[doc_text_column])\n",
+    "\n",
+    "    # Only texts longer than 10 characters are annotated; shorter ones get an empty list\n",
+    "    docs[doc_id] = cat.get_entities(text) if len(text) > 10 else []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print the statistics reported by the model's concept database (cat.cdb) after the run\n",
+    "cat.cdb.print_stats()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save to file (docs maps each document id to its annotations).\n",
+    "# Note: this is the same output path as the multiprocessing run, so that file is overwritten.\n",
+    "with open(save_path_annotations_per_doc, \"w\") as out_file:\n",
+    "    json.dump(docs, out_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}