BlueBrain · drsantos89 · Aug 31, 2022 · Aug 31, 2022 · Sep 6, 2022 · Sep 6, 2022
diff --git a/docs/source/api/bluesearch.k8s.embeddings.rst b/docs/source/api/bluesearch.k8s.embeddings.rst
@@ -0,0 +1,7 @@
+bluesearch.k8s.embeddings module
+================================
+
+.. automodule:: bluesearch.k8s.embeddings
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/bluesearch.k8s.rst b/docs/source/api/bluesearch.k8s.rst
@@ -9,6 +9,7 @@ Submodules
 
    bluesearch.k8s.connect
    bluesearch.k8s.create_indices
+   bluesearch.k8s.embeddings
 
 Module contents
 ---------------

diff --git a/notebooks/check_paragrapha_size.ipynb b/notebooks/check_paragrapha_size.ipynb
@@ -0,0 +1,231 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Connect to Elasticsearch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bluesearch.k8s.connect import connect"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = connect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tokenize all the paragraphs and record their token counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tqdm\n",
+    "from elasticsearch.helpers import scan"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(\"sentence-transformers/multi-qa-MiniLM-L6-cos-v1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lens = []\n",
+    "progress = tqdm.tqdm(position=0, unit=\" Docs\", desc=\"Scanning paragraphs\")\n",
+    "body = {\"query\":{\"match_all\":{}}}\n",
+    "for hit in scan(client, query=body, index=\"paragraphs\"):\n",
+    "    emb = tokenizer.tokenize(hit['_source']['text'])\n",
+    "    lens.append(len(emb))\n",
+    "    progress.update(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Plot the distribution of token counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "sns.set()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.boxplot(lens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.boxplot(lens)\n",
+    "plt.ylim([0, 512])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(lens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(lens, bins=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(lens, bins=100)\n",
+    "plt.xlim([0, 512])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lens=np.array(lens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(lens[np.array(lens)>512]) / len(lens) * 100"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Get the biggest paragraphs (more than 1000 tokens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "paragraphs = []\n",
+    "progress = tqdm.tqdm(position=0, unit=\" Docs\", desc=\"Scanning paragraphs\")\n",
+    "body = {\"query\":{\"match_all\":{}}}\n",
+    "for hit in scan(client, query=body, index=\"paragraphs\"):\n",
+    "    emb = tokenizer.tokenize(hit['_source']['text'])\n",
+    "    hit['_source']['tokenizer'] = ', '.join(emb)\n",
+    "    progress.update(1)\n",
+    "    if len(emb) > 1000:\n",
+    "        paragraphs.append(hit['_source'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "paragraphs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.5 ('py10')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "e14b248c68ef27f7e40aef879e7b97aaa0976632ef81142793ba6d8efee923a4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/setup.py b/setup.py
@@ -54,7 +54,7 @@
     # Required to encrypt mysql password; >= 3.2 to fix RSA decryption vulnerability
     "cryptography>=3.2",
     "defusedxml",
-    "elasticsearch>=8",
+    "elasticsearch==8.3.3",
     "google-cloud-storage",
     "h5py",
     "ipython",