medcattrainer export workflow

antsh3k · antsh3k · commit a71adecf6bf7 · 2022-06-23T19:20:22.000+01:00
diff --git a/medcat/evaluate_mct_export/mct_export_summary.ipynb b/medcat/evaluate_mct_export/mct_export_summary.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Evaluate a MedCATtrainer project export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import pandas as pd\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly. subplots import make_subplots\n",
+    "from IPython.display import Image\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mct_export = '../../data/medcattrainer_export/'+''  # mct_export .json here\n",
+    "with open(mct_export, 'r') as jsonfile:\n",
+    "    mct = json.load(jsonfile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# projects\n",
+    "for p in mct['projects']:\n",
+    "    print(p['name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# documents\n",
+    "doc_lst = []\n",
+    "for p in mct['projects']:\n",
+    "    for doc in p['documents']:\n",
+    "        doc_lst.append(doc['name'])\n",
+    "doc_lst"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'mct' is not defined",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m/var/folders/31/x5x_6lb14zj9cz75df77dx9h0000gn/T/ipykernel_45904/483071205.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# annotations\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mann_lst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmct\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'projects'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'documents'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0manns\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'annotations'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'mct' is not defined"
+     ],
+     "output_type": "error"
+    }
+   ],
+   "source": [
+    "# annotations\n",
+    "ann_lst = []\n",
+    "for p in mct['projects']:\n",
+    "    for doc in p['documents']:\n",
+    "        for anns in doc['annotations']:\n",
+    "            ann_lst.append(anns)\n",
+    "ann_lst\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summary format for analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "ann_lst = []\n",
+    "for p in mct['projects']:\n",
+    "    for doc in p['documents']:\n",
+    "        for anns in doc['annotations']:\n",
+    "            output = dict()\n",
+    "            output['project'] = p['name']\n",
+    "            output['document_name'] = doc['name']\n",
+    "            meta_anns_dict = {}\n",
+    "            for meta_ann in anns['meta_anns'].items():\n",
+    "                meta_anns_dict.update({meta_ann[0]: meta_ann[1]['value']})\n",
+    "                \n",
+    "            anns.pop('meta_anns')\n",
+    "            output.update(anns)\n",
+    "            output.update(meta_anns_dict)\n",
+    "            ann_lst.append(output)\n",
+    "final_output = pd.DataFrame(ann_lst)\n",
+    "final_output['last_modified'] = pd.to_datetime(final_output['last_modified'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Counts of annotations\n",
+    "cui_counts = Counter(final_output['cui'])\n",
+    "cui_counts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Make User plots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = final_output[['user', 'last_modified']]\n",
+    "data = df.groupby([df['last_modified'].dt.year.rename('year'),\n",
+    "                   df['last_modified'].df.month.rename('month'),\n",
+    "                   df['last_modified'].dt.day.rename('day'),\n",
+    "                   df['user']]).agg({'count'})\n",
+    "\n",
+    "data = pd.DataFrame(data)\n",
+    "data.columns = data.columns.droplevel()\n",
+    "data = data.reset_index(drop=False)\n",
+    "data['date'] = pd.datetime(data[['year', 'month', 'day']])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# annotator work\n",
+    "\n",
+    "fig = go.Figure()\n",
+    "for user in data['user'].unique():\n",
+    "    fig.add_trace(\n",
+    "        go.Bar(x=data[data['user'] == user]['date'], y=data[data['user'] == user]['count'], name=user),\n",
+    "    )\n",
+    "\n",
+    "fig.update_layout(tutle = {'text': 'MedCATtrainer Annotator Progress'})\n",
+    "fig.update_layout(legend_title_text='MedCAT Annotator')\n",
+    "fig.update_xaxes(title_text='Date')\n",
+    "fig.update_yaxes(title_text='Annotation Count')\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}