diff --git a/programs/stam-nu.ipynb b/programs/stam-nu.ipynb
index 19bcc48..ec22a29 100644
--- a/programs/stam-nu.ipynb
+++ b/programs/stam-nu.ipynb
@@ -2,11 +2,12 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
+ "import time\n",
"from itertools import chain\n",
"\n",
"from tf.app import use\n",
@@ -15,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -33,7 +34,7 @@
{
"data": {
"text/html": [
- "app: ~/github/ETCBC/bhsa/app"
+ "app: ~/github/ETCBC/bhsa/app"
],
"text/plain": [
""
@@ -45,7 +46,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/bhsa/tf/2021"
+ "data: ~/github/ETCBC/bhsa/tf/2021"
],
"text/plain": [
""
@@ -57,7 +58,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/phono/tf/2021"
+ "data: ~/github/ETCBC/phono/tf/2021"
],
"text/plain": [
""
@@ -69,7 +70,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/parallels/tf/2021"
+ "data: ~/github/ETCBC/parallels/tf/2021"
],
"text/plain": [
""
@@ -804,7 +805,7 @@
" \n",
"\n",
"\n",
- " Settings:
specified
- apiVersion:
3 - appName:
ETCBC/bhsa - appPath:
/Users/me/github/ETCBC/bhsa/app - commit: no value
- css:
'' dataDisplay:
exampleSectionHtml:
<code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)excludedFeatures:
g_uvf_utf8g_vbskq_hybridlanguageISOg_nmelex0is_rootg_vbs_utf8g_uvfdistrootsuffix_persong_vbedist_unitsuffix_numberdistributional_parentkq_hybrid_utf8crossrefSETinstructiong_prslexeme_countrank_occg_pfm_utf8freq_occcrossrefLCSfunctional_parentg_pfmg_nme_utf8g_vbe_utf8kindg_prs_utf8suffix_gendermother_object_type
noneValues:
absentn/anoneunknown- no value
NA
docs:
- docBase:
{docRoot}/{repo} - docExt:
'' - docPage:
'' - docRoot:
https://{org}.github.io - featurePage:
0_home
- interfaceDefaults:
{} - isCompatible:
True - local:
clone - localDir:
/Users/me/github/ETCBC/bhsa/_temp provenanceSpec:
- corpus:
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis - doi:
10.5281/zenodo.1007624 moduleSpecs:
:
- backend: no value
- corpus:
Phonetic Transcriptions docUrl:
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb- doi:
10.5281/zenodo.1007636 - org:
ETCBC - relative:
/tf - repo:
phono
:
- backend: no value
- corpus:
Parallel Passages docUrl:
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb- doi:
10.5281/zenodo.1007642 - org:
ETCBC - relative:
/tf - repo:
parallels
- org:
ETCBC - relative:
/tf - repo:
bhsa - version:
2021 - webBase:
https://shebanq.ancient-data.org/hebrew - webHint:
Show this on SHEBANQ - webLang:
la - webLexId:
True webUrl:
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt- webUrlLex:
{webBase}/word?version={version}&id=<lid>
- release: no value
typeDisplay:
clause:
- label:
{typ} {rela} - style:
''
clause_atom:
- hidden:
True - label:
{code} - level:
1 - style:
''
half_verse:
- hidden:
True - label:
{label} - style:
'' - verselike:
True
lex:
- featuresBare:
gloss - label:
{voc_lex_utf8} - lexOcc:
word - style:
orig - template:
{voc_lex_utf8}
phrase:
- label:
{typ} {function} - style:
''
phrase_atom:
- hidden:
True - label:
{typ} {rela} - level:
1 - style:
''
sentence:
sentence_atom:
- hidden:
True - label:
{number} - level:
1 - style:
''
subphrase:
- hidden:
True - label:
{number} - style:
''
word:
- features:
pdp vs vt - featuresBare:
lex:gloss
- writing:
hbo
\n"
+ " Settings:
specified
- apiVersion:
3 - appName:
ETCBC/bhsa - appPath:
/home/proycon/github/ETCBC/bhsa/app - commit: no value
- css:
'' dataDisplay:
exampleSectionHtml:
<code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)excludedFeatures:
g_uvf_utf8g_vbskq_hybridlanguageISOg_nmelex0is_rootg_vbs_utf8g_uvfdistrootsuffix_persong_vbedist_unitsuffix_numberdistributional_parentkq_hybrid_utf8crossrefSETinstructiong_prslexeme_countrank_occg_pfm_utf8freq_occcrossrefLCSfunctional_parentg_pfmg_nme_utf8g_vbe_utf8kindg_prs_utf8suffix_gendermother_object_type
noneValues:
absentn/anoneunknown- no value
NA
docs:
- docBase:
{docRoot}/{repo} - docExt:
'' - docPage:
'' - docRoot:
https://{org}.github.io - featurePage:
0_home
- interfaceDefaults:
{} - isCompatible:
True - local:
clone - localDir:
/home/proycon/github/ETCBC/bhsa/_temp provenanceSpec:
- corpus:
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis - doi:
10.5281/zenodo.1007624 moduleSpecs:
:
- backend: no value
- corpus:
Phonetic Transcriptions docUrl:
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb- doi:
10.5281/zenodo.1007636 - org:
ETCBC - relative:
/tf - repo:
phono
:
- backend: no value
- corpus:
Parallel Passages docUrl:
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb- doi:
10.5281/zenodo.1007642 - org:
ETCBC - relative:
/tf - repo:
parallels
- org:
ETCBC - relative:
/tf - repo:
bhsa - version:
2021 - webBase:
https://shebanq.ancient-data.org/hebrew - webHint:
Show this on SHEBANQ - webLang:
la - webLexId:
True webUrl:
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt- webUrlLex:
{webBase}/word?version={version}&id=<lid>
- release: no value
typeDisplay:
clause:
- label:
{typ} {rela} - style:
''
clause_atom:
- hidden:
True - label:
{code} - level:
1 - style:
''
half_verse:
- hidden:
True - label:
{label} - style:
'' - verselike:
True
lex:
- featuresBare:
gloss - label:
{voc_lex_utf8} - lexOcc:
word - style:
orig - template:
{voc_lex_utf8}
phrase:
- label:
{typ} {function} - style:
''
phrase_atom:
- hidden:
True - label:
{typ} {rela} - level:
1 - style:
''
sentence:
sentence_atom:
- hidden:
True - label:
{number} - level:
1 - style:
''
subphrase:
- hidden:
True - label:
{number} - style:
''
word:
- features:
pdp vs vt - featuresBare:
lex:gloss
- writing:
hbo
\n"
],
"text/plain": [
""
@@ -1460,14 +1461,14 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " 0.90s 15522 results\n"
+ " 0.82s 15522 results\n"
]
}
],
@@ -1486,7 +1487,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -1576,16 +1577,24 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 5,
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1020241\n",
+ "0.2074108123779297 s\n"
+ ]
+ },
{
"data": {
"text/plain": [
"15522"
]
},
- "execution_count": 19,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -1593,6 +1602,8 @@
"source": [
"resultsH = []\n",
"\n",
+ "t = time.time()\n",
+ "print(len(F.otype.data))\n",
"for p in F.otype.s(\"phrase\"):\n",
" ws = L.d(p, otype=\"word\")\n",
" if len(ws) < 2:\n",
@@ -1602,6 +1613,7 @@
" if F.nu.v(fi) != F.nu.v(la):\n",
" continue\n",
" resultsH.append((p, fi, la))\n",
+ "print(time.time() - t,\"s\")\n",
"\n",
"len(resultsH)"
]
@@ -1615,7 +1627,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -1624,7 +1636,7 @@
"True"
]
},
- "execution_count": 20,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -1637,384 +1649,115 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## In STAM\n",
- "\n",
- "Challenges:\n",
- "\n",
- "### Find the first and last word in each phrase\n",
- "\n",
- "Given a phrase, we need to find its words.\n",
- "Well, a phrase is an annotation with key `otype` and value `phrase` and target some word annotations.\n",
- "These target annotations are easy to get, by means of the `annotations()` method on annotations.\n",
- "\n",
- "Then we have to find the first and last words among these targets.\n",
- "\n",
- "**That is currently difficult!**\n",
- "\n",
- "You need a concept of order between annotations.\n",
- "One possibility is to put sequence numbers in the data, as annotations.\n",
- "But that is very cumbersome, because you need to refer to yet another level of annotation.\n",
- "And it will inflate the data.\n",
- "\n",
- "The other possibility is \"canonical ordering\".\n",
- "Annotations that target the text can be ordered by their targets.\n",
- "A target is a subset of textual positions. Two such subsets can be ordered as follows:\n",
- "\n",
- "* if A is a subset of B then B <= A\n",
- "* if B is a subset of A then A <= B\n",
- "* if A and B are no subsets of each other, the set that has the smallest element that does not belong to the other set, is the smallest.\n",
- "\n",
- "As part of the index building, you could create the rank of each annotation in this ordering.\n",
- "\n",
- "Annotations that target annotations that are already canonically ordered, can themselves be canonically ordered wrt their targets.\n",
- "\n",
- "Without this, the user will need to implement sorting in ad-hoc ways.\n",
- "\n",
- "### Retrieve values for the first and last word\n",
- "\n",
- "Given the annotations for the first and last word in a phrase, we have to find annotations with key `nu`\n",
- "and target these words, and read off their value.\n",
- "\n",
- "**That is currently difficult!**\n",
- "\n",
- "A way out is this:\n",
- "\n",
- "As preparation, before looping through the phrases:\n",
- "Make a dict that associates word anno ids with nu-values:\n",
- "\n",
- "* retrieve all annotations that have key `nu`, for each annotation:\n",
- "* pick the target, it is a word annotation, pick its id and use that as key in the dict\n",
- "* pick the data and from that the value and use that as value in the dict\n",
- "\n",
- "Then, for each phrase with at least two words:\n",
- "\n",
- "* retrieve the first word and from there the nu-value for that word\n",
- "* retrieve the second word and from there the nu-value for that word\n",
- "* compare the two values. If they are equal, we have a hit.\n",
- "\n",
- "This can be improved if the API offers a n efficient function to look up values.\n",
- "That could be a pre computation of all those dicts.\n",
- "\n",
- "Even better: those dicts could be the primary data!"
+ "## In STAM\n"
]
},
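As a concrete illustration of the "canonical ordering" described in the notes above (which this diff removes), here is a minimal sketch in plain Python; reducing each annotation target to a set of integer text positions is an assumption made for illustration only, not part of the STAM API:

```python
from functools import cmp_to_key

def canonical_cmp(a: set, b: set) -> int:
    """Compare two annotation targets, each a set of text positions.

    Rules from the notes above:
    * if one target is a subset of the other, the enclosing
      (superset) target comes first;
    * otherwise the target owning the smallest position not shared
      by both comes first.
    """
    if a == b:
        return 0
    if a < b:   # a inside b: b precedes a
        return 1
    if b < a:   # b inside a: a precedes b
        return -1
    # neither contains the other: decide on the smallest position
    # in the symmetric difference
    return -1 if min(a ^ b) in a else 1

# Toy usage: a phrase-sized target, a word inside it, and a later word
targets = [{2, 3}, {0, 1, 2, 3}, {5, 6}]
print(sorted(targets, key=cmp_to_key(canonical_cmp)))
# -> [{0, 1, 2, 3}, {2, 3}, {5, 6}]
```

Ranks computed once over this ordering would give the `wordRank`-style lookup used in the (removed) cells below.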
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 3.01 GB\n",
- "Delta: 3.01 GB\n"
+ "0.3.0\n",
+ "Current: 2.83 GB\n",
+ "Delta: 2.83 GB\n"
]
}
],
"source": [
"import stam\n",
+ "print(stam.VERSION)\n",
"\n",
"from memutil import memUsage\n",
"memUsage()\n",
"\n",
"workDir = f\"{A.tempDir}/stam\"\n",
- "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [],
- "source": [
- "aDataSet = list(storeC.annotationsets())[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [],
- "source": [
- "def stamOtype(otype):\n",
- " otypeData = aDataSet.find_data(\"otype\", otype)\n",
- " otypeAnnos = otypeData[0].annotations()\n",
- " return otypeAnnos\n",
- "\n",
- "def idsOf(annos):\n",
- " return {a.id() for a in annos}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [],
- "source": [
- "# get the word annotations, sorted, and the phrase annotations\n",
- "\n",
- "def getPos(wordAnno):\n",
- " t = wordAnno.textselections()[0]\n",
- " return (t.begin(), t.end())\n",
- "\n",
- "wordAnnos = stamOtype(\"word\")\n",
- "wordIds = idsOf(wordAnnos)\n",
- "phraseAnnos = stamOtype(\"phrase\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "wordAnnos = sorted(wordAnnos, key=getPos)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "# make a rank of the word annos\n",
- "\n",
- "wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}"
+ "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.cbor\")"
]
},
{
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "78754"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# get the phrase annotations together with their first and last word\n",
- "\n",
- "phrases = []\n",
- "\n",
- "for pAnno in phraseAnnos:\n",
- " words = pAnno.annotations()\n",
- " if len(words) < 2:\n",
- " continue\n",
- " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n",
- " phrases.append((p, words[0], words[-1]))\n",
- "\n",
- "len(phrases)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 0.90s 78754 results\n"
- ]
- }
- ],
- "source": [
- "# intermediate check with TF\n",
- "\n",
- "query = \"\"\"\n",
- "phrase\n",
- " =: word\n",
- " # word\n",
- " :=\n",
- "\"\"\"\n",
- "results = A.search(query)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "# get the `nu` information ready\n",
- "# we collect a dict keyed by word id with values the grammatical number of the word\n",
- "\n",
- "nuKey = aDataSet.key(\"nu\")\n",
- "nuAnnos = nuKey.annotations()\n",
- "\n",
- "nuValue = {}\n",
- "\n",
- "for nuAnno in nuAnnos:\n",
- " value = nuAnno.data()[0].value()\n",
- " word = list(nuAnno.annotations())[0]\n",
- "\n",
- " nuValue[word.id()] = value\n"
+ "With the new higher-order API, all this can be done in one go similar to the manually coded TF fragment:"
]
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "בְּ NA\n",
- "רֵאשִׁ֖ית sg\n",
- "בָּרָ֣א sg\n",
- "אֱלֹהִ֑ים pl\n",
- "אֵ֥ת NA\n",
- "הַ NA\n",
- "שָּׁמַ֖יִם pl\n",
- "וְ NA\n",
- "אֵ֥ת NA\n",
- "הָ NA\n",
- "אָֽרֶץ00 sg\n"
+ "1.9822394847869873 s\n"
]
- }
- ],
- "source": [
- "# check some values\n",
- "\n",
- "for wordAnno in wordAnnos[0:11]:\n",
- " print(f\"{wordAnno} {nuValue[wordAnno.id()]}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "So far so good!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
+ },
{
"data": {
"text/plain": [
"15522"
]
},
- "execution_count": 53,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# now compute the final result\n",
+ "import time\n",
"\n",
- "resultsSTAM = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n",
- "len(resultsSTAM)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Perfect!\n",
- "\n",
- "## Now in one go\n",
+ "t = time.time()\n",
+ "dataset = storeC.dataset(\"features\")\n",
+ "key_otype = dataset.key(\"otype\")\n",
+ "key_nu = dataset.key(\"nu\")\n",
+ "results = []\n",
+ "for phrase in key_otype.data(value=\"phrase\").annotations():\n",
+ " words = phrase.annotations_in_targets(filter=key_otype, value=\"word\")\n",
+ " if len(words) < 2:\n",
+ " continue\n",
+ " firstword = words[0]\n",
+ " lastword = words[-1]\n",
+ " for annotation in firstword.annotations(filter=key_nu, limit=1): #forgetting the limit would lead to a big performance penalty!\n",
+ " data = annotation.data(filter=key_nu, limit=1)\n",
+ " if lastword.test_annotations(filter=data):\n",
+ " results.append((phrase,firstword,lastword))\n",
+ "print(time.time() - t,\"s\")\n",
"\n",
- "In order to see the performance, let's do this again in one go."
+ "len(results)"
]
},
{
- "cell_type": "code",
- "execution_count": 54,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "# The complete task in one go\n",
- "\n",
- "def getNicePhrases():\n",
- " aDataSet = list(storeC.annotationsets())[0]\n",
- " wordAnnos = sorted(stamOtype(\"word\"), key=getPos)\n",
- " wordIds = idsOf(wordAnnos)\n",
- " wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}\n",
- "\n",
- " phraseAnnos = stamOtype(\"phrase\")\n",
- " phrases = []\n",
- "\n",
- " for pAnno in phraseAnnos:\n",
- " words = pAnno.annotations()\n",
- " if len(words) < 2:\n",
- " continue\n",
- " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n",
- " phrases.append((p, words[0], words[-1]))\n",
+ "The execution times for this task on my machine were (the STAM rust experiments are not in this notebook but external in https://github.com/knaw-huc/stam-experiments/blob/main/exp5/src/main.rs):\n",
"\n",
- " nuKey = aDataSet.key(\"nu\")\n",
- " nuAnnos = nuKey.annotations()\n",
- "\n",
- " nuValue = {}\n",
- "\n",
- " for nuAnno in nuAnnos:\n",
- " value = nuAnno.data()[0].value()\n",
- " word = list(nuAnno.annotations())[0]\n",
- "\n",
- " nuValue[word.id()] = value\n",
- "\n",
- " results = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n",
- " print(len(results))\n",
- " return results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "15522\n"
- ]
- }
- ],
- "source": [
- "resultsSTAM = getNicePhrases()"
+ "TF query | TF hand coding | STAM Python | STAM Rust | STAM Rust parallel (32 cores)\n",
+ "--- | --- | --- | --- | ----\n",
+ "0.9 | 0.2 | 1.98 | 0.6 | 0.06\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The execution times for this task were\n",
- "\n",
- "TF query | TF hand coding | STAM\n",
- "--- | --- | ---\n",
- "0.9 | 0.2 | 2.0\n",
+ "We see that performance in STAM Python is significantly worse than the TF version. This is due to several things:\n",
"\n",
- "But in STAM we can move quite a bit out of the task:\n",
- "\n",
- "1. sorting the words should be taken care of during the index building/loading when loading the STAM dataset (saves 0.3 sec)\n",
- "2. retrieving the `nu` value should be optimized (could save 0.9 sec)"
+ "1. The STAM Python API adds significant overhead, in part inevitable, but may improve a bit with further optimisation in the future.\n",
+ "2. The data is modelled in in a way that fits TF perfectly, TF is able to take some shortcuts here that aren't available in a STAM model. (STAM has an explicit notion of annotation, TF maps data more directly to targets)\n"
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "bhsa-env",
"language": "python",
- "name": "python3"
+ "name": "bhsa-env"
},
"language_info": {
"codemirror_mode": {
@@ -2026,10 +1769,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.1"
- },
- "orig_nbformat": 4
+ "version": "3.11.5"
+ }
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/programs/stam.ipynb b/programs/stam.ipynb
index bb27463..0de863d 100644
--- a/programs/stam.ipynb
+++ b/programs/stam.ipynb
@@ -2,15 +2,17 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
- "metadata": {},
+ "execution_count": 22,
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 0.13 GB\n",
- "Delta: 0.13 GB\n"
+ "Current: 5.85 GB\n",
+ "Delta: 0.00 GB\n"
]
}
],
@@ -21,6 +23,7 @@
"from tf.core.files import dirMake\n",
"\n",
"from memutil import memUsage\n",
+ "import time\n",
"memUsage()"
]
},
@@ -47,7 +50,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -65,7 +68,7 @@
{
"data": {
"text/html": [
- "app: ~/github/ETCBC/bhsa/app"
+ "app: ~/github/ETCBC/bhsa/app"
],
"text/plain": [
""
@@ -77,7 +80,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/bhsa/tf/2021"
+ "data: ~/github/ETCBC/bhsa/tf/2021"
],
"text/plain": [
""
@@ -89,7 +92,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/phono/tf/2021"
+ "data: ~/github/ETCBC/phono/tf/2021"
],
"text/plain": [
""
@@ -101,7 +104,7 @@
{
"data": {
"text/html": [
- "data: ~/github/ETCBC/parallels/tf/2021"
+ "data: ~/github/ETCBC/parallels/tf/2021"
],
"text/plain": [
""
@@ -836,7 +839,7 @@
" \n",
"\n",
"\n",
- " Settings:
specified
- apiVersion:
3 - appName:
ETCBC/bhsa - appPath:
/Users/me/github/ETCBC/bhsa/app - commit: no value
- css:
'' dataDisplay:
exampleSectionHtml:
<code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)excludedFeatures:
g_uvf_utf8g_vbskq_hybridlanguageISOg_nmelex0is_rootg_vbs_utf8g_uvfdistrootsuffix_persong_vbedist_unitsuffix_numberdistributional_parentkq_hybrid_utf8crossrefSETinstructiong_prslexeme_countrank_occg_pfm_utf8freq_occcrossrefLCSfunctional_parentg_pfmg_nme_utf8g_vbe_utf8kindg_prs_utf8suffix_gendermother_object_type
noneValues:
absentn/anoneunknown- no value
NA
docs:
- docBase:
{docRoot}/{repo} - docExt:
'' - docPage:
'' - docRoot:
https://{org}.github.io - featurePage:
0_home
- interfaceDefaults:
{} - isCompatible:
True - local:
clone - localDir:
/Users/me/github/ETCBC/bhsa/_temp provenanceSpec:
- corpus:
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis - doi:
10.5281/zenodo.1007624 moduleSpecs:
:
- backend: no value
- corpus:
Phonetic Transcriptions docUrl:
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb- doi:
10.5281/zenodo.1007636 - org:
ETCBC - relative:
/tf - repo:
phono
:
- backend: no value
- corpus:
Parallel Passages docUrl:
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb- doi:
10.5281/zenodo.1007642 - org:
ETCBC - relative:
/tf - repo:
parallels
- org:
ETCBC - relative:
/tf - repo:
bhsa - version:
2021 - webBase:
https://shebanq.ancient-data.org/hebrew - webHint:
Show this on SHEBANQ - webLang:
la - webLexId:
True webUrl:
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt- webUrlLex:
{webBase}/word?version={version}&id=<lid>
- release: no value
typeDisplay:
clause:
- label:
{typ} {rela} - style:
''
clause_atom:
- hidden:
True - label:
{code} - level:
1 - style:
''
half_verse:
- hidden:
True - label:
{label} - style:
'' - verselike:
True
lex:
- featuresBare:
gloss - label:
{voc_lex_utf8} - lexOcc:
word - style:
orig - template:
{voc_lex_utf8}
phrase:
- label:
{typ} {function} - style:
''
phrase_atom:
- hidden:
True - label:
{typ} {rela} - level:
1 - style:
''
sentence:
sentence_atom:
- hidden:
True - label:
{number} - level:
1 - style:
''
subphrase:
- hidden:
True - label:
{number} - style:
''
word:
- features:
pdp vs vt - featuresBare:
lex:gloss
- writing:
hbo
\n"
+ " Settings:
specified
- apiVersion:
3 - appName:
ETCBC/bhsa - appPath:
/home/proycon/github/ETCBC/bhsa/app - commit: no value
- css:
'' dataDisplay:
exampleSectionHtml:
<code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)excludedFeatures:
g_uvf_utf8g_vbskq_hybridlanguageISOg_nmelex0is_rootg_vbs_utf8g_uvfdistrootsuffix_persong_vbedist_unitsuffix_numberdistributional_parentkq_hybrid_utf8crossrefSETinstructiong_prslexeme_countrank_occg_pfm_utf8freq_occcrossrefLCSfunctional_parentg_pfmg_nme_utf8g_vbe_utf8kindg_prs_utf8suffix_gendermother_object_type
noneValues:
absentn/anoneunknown- no value
NA
docs:
- docBase:
{docRoot}/{repo} - docExt:
'' - docPage:
'' - docRoot:
https://{org}.github.io - featurePage:
0_home
- interfaceDefaults:
{} - isCompatible:
True - local:
clone - localDir:
/home/proycon/github/ETCBC/bhsa/_temp provenanceSpec:
- corpus:
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis - doi:
10.5281/zenodo.1007624 moduleSpecs:
:
- backend: no value
- corpus:
Phonetic Transcriptions docUrl:
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb- doi:
10.5281/zenodo.1007636 - org:
ETCBC - relative:
/tf - repo:
phono
:
- backend: no value
- corpus:
Parallel Passages docUrl:
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb- doi:
10.5281/zenodo.1007642 - org:
ETCBC - relative:
/tf - repo:
parallels
- org:
ETCBC - relative:
/tf - repo:
bhsa - version:
2021 - webBase:
https://shebanq.ancient-data.org/hebrew - webHint:
Show this on SHEBANQ - webLang:
la - webLexId:
True webUrl:
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt- webUrlLex:
{webBase}/word?version={version}&id=<lid>
- release: no value
typeDisplay:
clause:
- label:
{typ} {rela} - style:
''
clause_atom:
- hidden:
True - label:
{code} - level:
1 - style:
''
half_verse:
- hidden:
True - label:
{label} - style:
'' - verselike:
True
lex:
- featuresBare:
gloss - label:
{voc_lex_utf8} - lexOcc:
word - style:
orig - template:
{voc_lex_utf8}
phrase:
- label:
{typ} {function} - style:
''
phrase_atom:
- hidden:
True - label:
{typ} {rela} - level:
1 - style:
''
sentence:
sentence_atom:
- hidden:
True - label:
{number} - level:
1 - style:
''
subphrase:
- hidden:
True - label:
{number} - style:
''
word:
- features:
pdp vs vt - featuresBare:
lex:gloss
- writing:
hbo
\n"
],
"text/plain": [
""
@@ -1480,13 +1483,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 2.81 GB\n",
- "Delta: 2.68 GB\n"
+ "2.7321038246154785 s\n",
+ "Current: 8.46 GB\n",
+ "Delta: 2.61 GB\n"
]
}
],
"source": [
+ "t = time.time()\n",
"A = use(\"ETCBC/bhsa:clone\", checkout=\"clone\", hoist=globals())\n",
+ "print(time.time() - t, \"s\")\n",
"memUsage()"
]
},
@@ -1499,15 +1505,15 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 2.93 GB\n",
- "Delta: 0.11 GB\n"
+ "Current: 2.89 GB\n",
+ "Delta: 0.12 GB\n"
]
}
],
@@ -1536,16 +1542,25 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.3.0\n"
+ ]
+ }
+ ],
"source": [
- "import stam"
+ "from stam import VERSION, AnnotationStore, Selector, Offset\n",
+ "print(VERSION)"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -1568,8 +1583,8 @@
"nodes (type phrase) ...\n",
"nodes (type phrase_atom) ...\n",
"nodes (type subphrase) ...\n",
- "Current: 3.83 GB\n",
- "Delta: 0.90 GB\n"
+ "Current: 3.53 GB\n",
+ "Delta: 0.65 GB\n"
]
}
],
@@ -1577,9 +1592,9 @@
"storeId = \"ETCBC/bhsa\"\n",
"print(f\"store (corpus {storeId}) ...\")\n",
"\n",
- "store = stam.AnnotationStore(id=storeId)\n",
+ "store = AnnotationStore(id=storeId)\n",
"setId = \"features\"\n",
- "dataset = store.add_annotationset(setId)\n",
+ "dataset = store.add_dataset(setId)\n",
"\n",
"textId = \"hebrew_unicode\"\n",
"print(f\"text (format {textId}) ...\")\n",
@@ -1587,16 +1602,9 @@
"\n",
"print(\"nodes\")\n",
"\n",
- "stamTextSel = stam.Selector.textselector\n",
- "stamOffset = stam.Offset.simple\n",
- "stamCompSel = stam.Selector.compositeselector\n",
- "stamAnnoSel = stam.Selector.annotationselector\n",
- "storeAnnotate = store.annotate\n",
- "storeAnno = store.annotation\n",
- "\n",
"otypeKey = dataset.add_key(\"otype\")\n",
"\n",
- "annoIdFromNode = {}\n",
+ "annoFromNode = {}\n",
"\n",
"slotType = F.otype.slotType\n",
"\n",
@@ -1606,10 +1614,9 @@
"\n",
"for w in F.otype.s(otype):\n",
" typeData = dict(key=otypeKey, value=otype, set=dataset)\n",
- " anno = storeAnnotate(\n",
- " target=stamTextSel(textResource, stamOffset(*pos[w])), data=typeData\n",
+ " annoFromNode[w] = store.annotate(\n",
+ " target=Selector.textselector(textResource, Offset.simple(*pos[w])), data=typeData\n",
" )\n",
- " annoIdFromNode[w] = anno.id()\n",
"\n",
"for otype in F.otype.all:\n",
" if otype == F.otype.slotType:\n",
@@ -1618,18 +1625,17 @@
" typeData = dict(key=otypeKey, value=otype, set=dataset)\n",
" for n in F.otype.s(otype):\n",
" slots = E.oslots.s(n)\n",
- " slotsSel = stamCompSel(\n",
- " *[stamAnnoSel(storeAnno(annoIdFromNode[slot])) for slot in slots]\n",
+ " slotsSel = Selector.compositeselector(\n",
+ " *[Selector.annotationselector(annoFromNode[slot]) for slot in slots]\n",
" )\n",
- " anno = store.annotate(target=slotsSel, data=typeData)\n",
- " annoIdFromNode[n] = anno.id()\n",
+ " annoFromNode[n] = store.annotate(target=slotsSel, data=typeData)\n",
"\n",
"memUsage()"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -1718,8 +1724,8 @@
"\tvoc_lex_utf8\n",
"\tvs\n",
"\tvt\n",
- "Current: 7.55 GB\n",
- "Delta: 3.73 GB\n"
+ "Current: 6.04 GB\n",
+ "Delta: 2.51 GB\n"
]
}
],
@@ -1733,14 +1739,14 @@
" featKey = dataset.add_key(feat)\n",
" for (n, v) in Fs(feat).items():\n",
" featData = dict(key=featKey, value=v, set=dataset)\n",
- " storeAnnotate(target=stamAnnoSel(storeAnno(annoIdFromNode[n])), data=featData)\n",
+ " store.annotate(target=Selector.annotationselector(annoFromNode[n]), data=featData)\n",
"\n",
"memUsage()"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -1750,8 +1756,8 @@
"edge features\n",
"\tcrossref\n",
"\tmother\n",
- "Current: 7.77 GB\n",
- "Delta: 0.21 GB\n"
+ "Current: 6.08 GB\n",
+ "Delta: 0.04 GB\n"
]
}
],
@@ -1763,29 +1769,61 @@
" continue\n",
" print(f\"\\t{feat}\")\n",
" featKey = dataset.add_key(feat)\n",
- " nId = stamAnnoSel(storeAnno(annoIdFromNode[n]))\n",
+ " nId = Selector.annotationselector(annoFromNode[n])\n",
" for n, ms in Es(feat).items():\n",
" for m, v in ms.items() if type(ms) is dict else ((x, None) for x in ms):\n",
" featData = dict(key=featKey, value=v, set=dataset)\n",
- " mId = stamAnnoSel(storeAnno(annoIdFromNode[m]))\n",
- " target = stamCompSel(nId, mId)\n",
- " storeAnnotate(target=target, data=featData)\n",
+ " mId = Selector.annotationselector(annoFromNode[m])\n",
+ " target = Selector.compositeselector(nId, mId)\n",
+ " store.annotate(target=target, data=featData)\n",
"\n",
"memUsage()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cleanup - deallocating python overhead from conversion\n",
+ "Current: 5.85 GB\n",
+ "Delta: -0.23 GB\n",
+ "cleanup - optimizing memory consumption of internal STAM data structures\n",
+ "Current: 5.85 GB\n",
+ "Delta: -0.00 GB\n",
+ "(note: textfabric model is still loaded as well!)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"cleanup - deallocating python overhead from conversion\")\n",
+ "\n",
+ "del annoFromNode, text, pos\n",
+ "memUsage()\n",
+ "\n",
+ "print(\"cleanup - optimizing memory consumption of internal STAM data structures\")\n",
+ "store.shrink_to_fit()\n",
+ "memUsage()\n",
+ "\n",
+ "print(\"(note: textfabric model is still loaded as well!)\")\n"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Serializing\n",
"\n",
- "Lets serialize the STAM dataset to disk, in JSON and CSV."
+ "Let's serialize the STAM dataset to disk, in JSON and CSV."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -1802,21 +1840,24 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 9.17 GB\n",
- "Delta: 1.40 GB\n"
+ "16.307935730961617 s\n",
+ "Current: 5.85 GB\n",
+ "Delta: 0.00 GB\n"
]
}
],
"source": [
"store.set_filename(f\"{workDir}/bhsa.json\")\n",
- "store.save()\n",
+ "print(timeit.timeit(\n",
+ " store.save\n",
+ ", number=1),\"s\")\n",
"memUsage()"
]
},
@@ -1829,21 +1870,54 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 9.17 GB\n",
+ "5.578051328659058\n",
+ "Current: 5.85 GB\n",
"Delta: 0.00 GB\n"
]
}
],
"source": [
"store.set_filename(f\"{workDir}/bhsa.csv\")\n",
+ "t = time.time()\n",
+ "store.save()\n",
+ "print(time.time() - t, \"s\")\n",
+ "memUsage()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CBOR"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2.369333267211914 s\n",
+ "Current: 8.46 GB\n",
+ "Delta: 0.00 GB\n"
+ ]
+ }
+ ],
+ "source": [
+ "store.set_filename(f\"{workDir}/bhsa.store.stam.cbor\")\n",
+ "t = time.time()\n",
"store.save()\n",
+ "print(time.time() - t, \"s\")\n",
"memUsage()"
]
},
@@ -1876,8 +1950,9 @@
"text": [
"Current: 0.07 GB\n",
"Delta: 0.07 GB\n",
- "Current: 16.41 GB\n",
- "Delta: 16.34 GB\n"
+ "84.40434789657593 s\n",
+ "Current: 3.12 GB\n",
+ "Delta: 3.05 GB\n"
]
}
],
@@ -1885,11 +1960,14 @@
"import os\n",
"import stam\n",
"\n",
+ "import time\n",
"from memutil import memUsage\n",
"memUsage()\n",
"\n",
"workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n",
- "storeJ = stam.AnnotationStore(file=f\"{workDir}/bhsa.json\")\n",
+ "t = time.time()\n",
+ "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.json\")\n",
+ "print(time.time() - t, \"s\")\n",
"memUsage()"
]
},
@@ -1904,17 +1982,56 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Current: 0.07 GB\n",
- "Delta: 0.07 GB\n",
- "Current: 16.66 GB\n",
- "Delta: 16.60 GB\n"
+ "Current: 0.08 GB\n",
+ "Delta: 0.01 GB\n",
+ "45.721251249313354 s\n",
+ "Current: 3.17 GB\n",
+ "Delta: 3.09 GB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import stam\n",
+ "import time\n",
+ "from memutil import memUsage\n",
+ "memUsage()\n",
+ "\n",
+ "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n",
+ "t = time.time()\n",
+ "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n",
+ "print(time.time() - t, \"s\")\n",
+ "memUsage()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CBOR"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Current: 8.46 GB\n",
+ "Delta: 0.00 GB\n",
+ "3.655712604522705 s\n",
+ "Current: 11.80 GB\n",
+ "Delta: 3.34 GB\n"
]
}
],
@@ -1926,7 +2043,9 @@
"memUsage()\n",
"\n",
"workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n",
- "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n",
+ "t = time.time()\n",
+ "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.cbor\")\n",
+ "print(time.time() - t, \"s\")\n",
"memUsage()"
]
},
@@ -1938,12 +2057,13 @@
"\n",
"contender | load time (sec) | save time (sec) | mem usage (GB) | disk usage (GB)\n",
"--- | --- | --- | --- | ---\n",
- "STAM | | | 16.4 | \n",
- "STAM-JSON | 115 | 17 | | 8.15\n",
- "STAM-CSV | 53 | 7 | | 2.35\n",
+ "STAM | | | 3.05 | \n",
+ "STAM-JSON | 84.40 | 16.31 | | 6.8\n",
+ "STAM-CSV | 45.72 | 5.58 | | 1.2\n",
+ "STAM-CBOR | 3.66 | 2.37 | | 0.74\n",
"TF | | | 2.7 |\n",
- "TF text | 92 | | | 0.10\n",
- "TF opt | 3 | | | 0.14\n",
+ "TF text | 92? | | | 0.10\n",
+ "TF opt | 3? | | | 0.14\n",
"\n"
]
},
@@ -2066,9 +2186,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "bhsa-env",
"language": "python",
- "name": "python3"
+ "name": "bhsa-env"
},
"language_info": {
"codemirror_mode": {
@@ -2080,10 +2200,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.1"
- },
- "orig_nbformat": 4
+ "version": "3.11.3"
+ }
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}