From 42156f3fed75e29db2d512b8772b8f1ca68c792d Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Tue, 8 Aug 2023 11:34:31 +0200 Subject: [PATCH 1/3] adapted notebook to new STAM API and remeasured things, adapted code to work without IDs, removed aliases --- programs/stam.ipynb | 171 ++++++++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 69 deletions(-) diff --git a/programs/stam.ipynb b/programs/stam.ipynb index bb27463..0366faf 100644 --- a/programs/stam.ipynb +++ b/programs/stam.ipynb @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -65,7 +67,7 @@ { "data": { "text/html": [ - "app: ~/github/ETCBC/bhsa/app" + "app: ~/github/ETCBC/bhsa/app" ], "text/plain": [ "" @@ -77,7 +79,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/bhsa/tf/2021" + "data: ~/github/ETCBC/bhsa/tf/2021" ], "text/plain": [ "" @@ -89,7 +91,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/phono/tf/2021" + "data: ~/github/ETCBC/phono/tf/2021" ], "text/plain": [ "" @@ -101,7 +103,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/parallels/tf/2021" + "data: ~/github/ETCBC/parallels/tf/2021" ], "text/plain": [ "" @@ -836,7 +838,7 @@ " \n", "\n", "\n", - " Settings:
specified
  1. apiVersion: 3
  2. appName: ETCBC/bhsa
  3. appPath: /Users/me/github/ETCBC/bhsa/app
  4. commit: no value
  5. css: ''
  6. dataDisplay:
    • exampleSectionHtml: <code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)
    • excludedFeatures:
      • g_uvf_utf8
      • g_vbs
      • kq_hybrid
      • languageISO
      • g_nme
      • lex0
      • is_root
      • g_vbs_utf8
      • g_uvf
      • dist
      • root
      • suffix_person
      • g_vbe
      • dist_unit
      • suffix_number
      • distributional_parent
      • kq_hybrid_utf8
      • crossrefSET
      • instruction
      • g_prs
      • lexeme_count
      • rank_occ
      • g_pfm_utf8
      • freq_occ
      • crossrefLCS
      • functional_parent
      • g_pfm
      • g_nme_utf8
      • g_vbe_utf8
      • kind
      • g_prs_utf8
      • suffix_gender
      • mother_object_type
    • noneValues:
      • absent
      • n/a
      • none
      • unknown
      • no value
      • NA
  7. docs:
    • docBase: {docRoot}/{repo}
    • docExt: ''
    • docPage: ''
    • docRoot: https://{org}.github.io
    • featurePage: 0_home
  8. interfaceDefaults: {}
  9. isCompatible: True
  10. local: clone
  11. localDir: /Users/me/github/ETCBC/bhsa/_temp
  12. provenanceSpec:
    • corpus: BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
    • doi: 10.5281/zenodo.1007624
    • moduleSpecs:
      • :
        • backend: no value
        • corpus: Phonetic Transcriptions
        • docUrl: https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
        • doi: 10.5281/zenodo.1007636
        • org: ETCBC
        • relative: /tf
        • repo: phono
      • :
        • backend: no value
        • corpus: Parallel Passages
        • docUrl: https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
        • doi: 10.5281/zenodo.1007642
        • org: ETCBC
        • relative: /tf
        • repo: parallels
    • org: ETCBC
    • relative: /tf
    • repo: bhsa
    • version: 2021
    • webBase: https://shebanq.ancient-data.org/hebrew
    • webHint: Show this on SHEBANQ
    • webLang: la
    • webLexId: True
    • webUrl: {webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: no value
  14. typeDisplay:
    • clause:
      • label: {typ} {rela}
      • style: ''
    • clause_atom:
      • hidden: True
      • label: {code}
      • level: 1
      • style: ''
    • half_verse:
      • hidden: True
      • label: {label}
      • style: ''
      • verselike: True
    • lex:
      • featuresBare: gloss
      • label: {voc_lex_utf8}
      • lexOcc: word
      • style: orig
      • template: {voc_lex_utf8}
    • phrase:
      • label: {typ} {function}
      • style: ''
    • phrase_atom:
      • hidden: True
      • label: {typ} {rela}
      • level: 1
      • style: ''
    • sentence:
      • label: {number}
      • style: ''
    • sentence_atom:
      • hidden: True
      • label: {number}
      • level: 1
      • style: ''
    • subphrase:
      • hidden: True
      • label: {number}
      • style: ''
    • word:
      • features: pdp vs vt
      • featuresBare: lex:gloss
  15. writing: hbo
\n" + " Settings:
specified
  1. apiVersion: 3
  2. appName: ETCBC/bhsa
  3. appPath: /home/proycon/github/ETCBC/bhsa/app
  4. commit: no value
  5. css: ''
  6. dataDisplay:
    • exampleSectionHtml: <code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)
    • excludedFeatures:
      • g_uvf_utf8
      • g_vbs
      • kq_hybrid
      • languageISO
      • g_nme
      • lex0
      • is_root
      • g_vbs_utf8
      • g_uvf
      • dist
      • root
      • suffix_person
      • g_vbe
      • dist_unit
      • suffix_number
      • distributional_parent
      • kq_hybrid_utf8
      • crossrefSET
      • instruction
      • g_prs
      • lexeme_count
      • rank_occ
      • g_pfm_utf8
      • freq_occ
      • crossrefLCS
      • functional_parent
      • g_pfm
      • g_nme_utf8
      • g_vbe_utf8
      • kind
      • g_prs_utf8
      • suffix_gender
      • mother_object_type
    • noneValues:
      • absent
      • n/a
      • none
      • unknown
      • no value
      • NA
  7. docs:
    • docBase: {docRoot}/{repo}
    • docExt: ''
    • docPage: ''
    • docRoot: https://{org}.github.io
    • featurePage: 0_home
  8. interfaceDefaults: {}
  9. isCompatible: True
  10. local: clone
  11. localDir: /home/proycon/github/ETCBC/bhsa/_temp
  12. provenanceSpec:
    • corpus: BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
    • doi: 10.5281/zenodo.1007624
    • moduleSpecs:
      • :
        • backend: no value
        • corpus: Phonetic Transcriptions
        • docUrl: https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
        • doi: 10.5281/zenodo.1007636
        • org: ETCBC
        • relative: /tf
        • repo: phono
      • :
        • backend: no value
        • corpus: Parallel Passages
        • docUrl: https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
        • doi: 10.5281/zenodo.1007642
        • org: ETCBC
        • relative: /tf
        • repo: parallels
    • org: ETCBC
    • relative: /tf
    • repo: bhsa
    • version: 2021
    • webBase: https://shebanq.ancient-data.org/hebrew
    • webHint: Show this on SHEBANQ
    • webLang: la
    • webLexId: True
    • webUrl: {webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: no value
  14. typeDisplay:
    • clause:
      • label: {typ} {rela}
      • style: ''
    • clause_atom:
      • hidden: True
      • label: {code}
      • level: 1
      • style: ''
    • half_verse:
      • hidden: True
      • label: {label}
      • style: ''
      • verselike: True
    • lex:
      • featuresBare: gloss
      • label: {voc_lex_utf8}
      • lexOcc: word
      • style: orig
      • template: {voc_lex_utf8}
    • phrase:
      • label: {typ} {function}
      • style: ''
    • phrase_atom:
      • hidden: True
      • label: {typ} {rela}
      • level: 1
      • style: ''
    • sentence:
      • label: {number}
      • style: ''
    • sentence_atom:
      • hidden: True
      • label: {number}
      • level: 1
      • style: ''
    • subphrase:
      • hidden: True
      • label: {number}
      • style: ''
    • word:
      • features: pdp vs vt
      • featuresBare: lex:gloss
  15. writing: hbo
\n" ], "text/plain": [ "" @@ -1480,8 +1482,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Current: 2.81 GB\n", - "Delta: 2.68 GB\n" + "Current: 2.77 GB\n", + "Delta: 2.65 GB\n" ] } ], @@ -1499,15 +1501,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current: 2.93 GB\n", - "Delta: 0.11 GB\n" + "Current: 2.89 GB\n", + "Delta: 0.12 GB\n" ] } ], @@ -1536,16 +1538,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.3.0\n" + ] + } + ], "source": [ - "import stam" + "from stam import VERSION, AnnotationStore, Selector, Offset\n", + "print(VERSION)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -1568,8 +1579,8 @@ "nodes (type phrase) ...\n", "nodes (type phrase_atom) ...\n", "nodes (type subphrase) ...\n", - "Current: 3.83 GB\n", - "Delta: 0.90 GB\n" + "Current: 3.54 GB\n", + "Delta: 0.64 GB\n" ] } ], @@ -1577,9 +1588,9 @@ "storeId = \"ETCBC/bhsa\"\n", "print(f\"store (corpus {storeId}) ...\")\n", "\n", - "store = stam.AnnotationStore(id=storeId)\n", + "store = AnnotationStore(id=storeId)\n", "setId = \"features\"\n", - "dataset = store.add_annotationset(setId)\n", + "dataset = store.add_dataset(setId)\n", "\n", "textId = \"hebrew_unicode\"\n", "print(f\"text (format {textId}) ...\")\n", @@ -1587,16 +1598,9 @@ "\n", "print(\"nodes\")\n", "\n", - "stamTextSel = stam.Selector.textselector\n", - "stamOffset = stam.Offset.simple\n", - "stamCompSel = stam.Selector.compositeselector\n", - "stamAnnoSel = stam.Selector.annotationselector\n", - "storeAnnotate = store.annotate\n", - "storeAnno = store.annotation\n", - "\n", "otypeKey = dataset.add_key(\"otype\")\n", "\n", - "annoIdFromNode = {}\n", + "annoFromNode = {}\n", "\n", "slotType = F.otype.slotType\n", "\n", @@ -1606,10 +1610,9 @@ "\n", "for w in F.otype.s(otype):\n", " typeData = dict(key=otypeKey, value=otype, set=dataset)\n", - " anno = storeAnnotate(\n", - " target=stamTextSel(textResource, stamOffset(*pos[w])), data=typeData\n", + " annoFromNode[w] = store.annotate(\n", + " target=Selector.textselector(textResource, Offset.simple(*pos[w])), data=typeData\n", " )\n", - " annoIdFromNode[w] = anno.id()\n", "\n", "for otype in F.otype.all:\n", " if otype == F.otype.slotType:\n", @@ -1618,18 +1621,17 @@ " typeData = dict(key=otypeKey, value=otype, set=dataset)\n", " for n in F.otype.s(otype):\n", " slots = E.oslots.s(n)\n", - " slotsSel = stamCompSel(\n", - " *[stamAnnoSel(storeAnno(annoIdFromNode[slot])) for slot in slots]\n", + " slotsSel = Selector.compositeselector(\n", + " *[Selector.annotationselector(annoFromNode[slot]) for slot in slots]\n", " )\n", - " anno = store.annotate(target=slotsSel, data=typeData)\n", - " annoIdFromNode[n] = anno.id()\n", + " annoFromNode[n] = store.annotate(target=slotsSel, data=typeData)\n", "\n", "memUsage()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -1718,8 +1720,8 @@ "\tvoc_lex_utf8\n", "\tvs\n", "\tvt\n", - "Current: 7.55 GB\n", - "Delta: 3.73 GB\n" + "Current: 6.05 GB\n", + "Delta: 2.51 GB\n" ] } ], @@ -1733,14 +1735,14 @@ " featKey = dataset.add_key(feat)\n", " for (n, v) in Fs(feat).items():\n", " featData = dict(key=featKey, value=v, set=dataset)\n", - " 
storeAnnotate(target=stamAnnoSel(storeAnno(annoIdFromNode[n])), data=featData)\n", + " store.annotate(target=Selector.annotationselector(annoFromNode[n]), data=featData)\n", "\n", "memUsage()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -1750,8 +1752,8 @@ "edge features\n", "\tcrossref\n", "\tmother\n", - "Current: 7.77 GB\n", - "Delta: 0.21 GB\n" + "Current: 6.09 GB\n", + "Delta: 0.04 GB\n" ] } ], @@ -1763,24 +1765,56 @@ " continue\n", " print(f\"\\t{feat}\")\n", " featKey = dataset.add_key(feat)\n", - " nId = stamAnnoSel(storeAnno(annoIdFromNode[n]))\n", + " nId = Selector.annotationselector(annoFromNode[n])\n", " for n, ms in Es(feat).items():\n", " for m, v in ms.items() if type(ms) is dict else ((x, None) for x in ms):\n", " featData = dict(key=featKey, value=v, set=dataset)\n", - " mId = stamAnnoSel(storeAnno(annoIdFromNode[m]))\n", - " target = stamCompSel(nId, mId)\n", - " storeAnnotate(target=target, data=featData)\n", + " mId = Selector.annotationselector(annoFromNode[m])\n", + " target = Selector.compositeselector(nId, mId)\n", + " store.annotate(target=target, data=featData)\n", "\n", "memUsage()" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cleanup - deallocating python overhead from conversion\n", + "Current: 5.85 GB\n", + "Delta: -0.24 GB\n", + "cleanup - optimizing memory consumption of internal STAM data structures\n", + "Current: 5.85 GB\n", + "Delta: -0.00 GB\n", + "(note: textfabric model is still loaded as well!)\n" + ] + } + ], + "source": [ + "print(\"cleanup - deallocating python overhead from conversion\")\n", + "\n", + "del annoFromNode, text, pos\n", + "memUsage()\n", + "\n", + "print(\"cleanup - optimizing memory consumption of internal STAM data structures\")\n", + "store.shrink_to_fit()\n", + "memUsage()\n", + "\n", + "print(\"(note: textfabric model is still loaded as well!)\")\n" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Serializing\n", "\n", - "Lets serialize the STAM dataset to disk, in JSON and CSV." + "Let's serialize the STAM dataset to disk, in JSON and CSV." 
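In short, a save/load round trip only requires setting a filename and calling `save()`; the serialization format appears to be chosen from the filename extension (.json, .csv, or .cbor in this notebook). A minimal sketch, with a hypothetical path and an otherwise empty store:

```python
import stam

# build a (here: empty) store and write it out; the extension of the
# filename determines the serialization format
store = stam.AnnotationStore(id="example")
store.set_filename("/tmp/example.store.stam.json")  # hypothetical path
store.save()

# constructing a store from a file parses it and rebuilds the in-memory indices
store2 = stam.AnnotationStore(file="/tmp/example.store.stam.json")
```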
] }, { @@ -1809,8 +1843,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Current: 9.17 GB\n", - "Delta: 1.40 GB\n" + "Current: 5.85 GB\n", + "Delta: 0.00 GB\n" ] } ], @@ -1829,15 +1863,15 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current: 9.17 GB\n", - "Delta: 0.00 GB\n" + "Current: 5.90 GB\n", + "Delta: 0.05 GB\n" ] } ], @@ -1876,8 +1910,8 @@ "text": [ "Current: 0.07 GB\n", "Delta: 0.07 GB\n", - "Current: 16.41 GB\n", - "Delta: 16.34 GB\n" + "Current: 3.12 GB\n", + "Delta: 3.05 GB\n" ] } ], @@ -1889,7 +1923,7 @@ "memUsage()\n", "\n", "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n", - "storeJ = stam.AnnotationStore(file=f\"{workDir}/bhsa.json\")\n", + "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.json\")\n", "memUsage()" ] }, @@ -1904,17 +1938,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current: 0.07 GB\n", - "Delta: 0.07 GB\n", - "Current: 16.66 GB\n", - "Delta: 16.60 GB\n" + "Current: 3.12 GB\n", + "Delta: 0.00 GB\n", + "Current: 4.42 GB\n", + "Delta: 1.29 GB\n" ] } ], @@ -1926,7 +1960,7 @@ "memUsage()\n", "\n", "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n", - "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n", + "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n", "memUsage()" ] }, @@ -1938,9 +1972,9 @@ "\n", "contender | load time (sec) | save time (sec) | mem usage (GB) | disk usage (GB)\n", "--- | --- | --- | --- | ---\n", - "STAM | | | 16.4 | \n", - "STAM-JSON | 115 | 17 | | 8.15\n", - "STAM-CSV | 53 | 7 | | 2.35\n", + "STAM | | | 3.05 | \n", + "STAM-JSON | ? | ? | | 6.8\n", + "STAM-CSV | ? | ? 
| | 1.2\n", "TF | | | 2.7 |\n", "TF text | 92 | | | 0.10\n", "TF opt | 3 | | | 0.14\n", @@ -2066,9 +2100,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "bhsa-env", "language": "python", - "name": "python3" + "name": "bhsa-env" }, "language_info": { "codemirror_mode": { @@ -2080,10 +2114,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" - }, - "orig_nbformat": 4 + "version": "3.11.3" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 111081490201962b1c4f1418a53ccb1429431d40 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Tue, 8 Aug 2023 14:14:26 +0200 Subject: [PATCH 2/3] added timing measurements, not sure about how to time TF though, the numbers in the table are much higher than what I get --- programs/stam.ipynb | 136 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 25 deletions(-) diff --git a/programs/stam.ipynb b/programs/stam.ipynb index 0366faf..0de863d 100644 --- a/programs/stam.ipynb +++ b/programs/stam.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 22, "metadata": { "scrolled": true }, @@ -11,8 +11,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Current: 0.13 GB\n", - "Delta: 0.13 GB\n" + "Current: 5.85 GB\n", + "Delta: 0.00 GB\n" ] } ], @@ -23,6 +23,7 @@ "from tf.core.files import dirMake\n", "\n", "from memutil import memUsage\n", + "import time\n", "memUsage()" ] }, @@ -49,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1482,13 +1483,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Current: 2.77 GB\n", - "Delta: 2.65 GB\n" + "2.7321038246154785 s\n", + "Current: 8.46 GB\n", + "Delta: 2.61 GB\n" ] } ], "source": [ + "t = time.time()\n", "A = use(\"ETCBC/bhsa:clone\", checkout=\"clone\", hoist=globals())\n", + "print(time.time() - t, \"s\")\n", "memUsage()" ] }, @@ -1579,8 +1583,8 @@ "nodes (type phrase) ...\n", "nodes (type phrase_atom) ...\n", "nodes (type subphrase) ...\n", - "Current: 3.54 GB\n", - "Delta: 0.64 GB\n" + "Current: 3.53 GB\n", + "Delta: 0.65 GB\n" ] } ], @@ -1720,7 +1724,7 @@ "\tvoc_lex_utf8\n", "\tvs\n", "\tvt\n", - "Current: 6.05 GB\n", + "Current: 6.04 GB\n", "Delta: 2.51 GB\n" ] } @@ -1752,7 +1756,7 @@ "edge features\n", "\tcrossref\n", "\tmother\n", - "Current: 6.09 GB\n", + "Current: 6.08 GB\n", "Delta: 0.04 GB\n" ] } @@ -1787,7 +1791,7 @@ "text": [ "cleanup - deallocating python overhead from conversion\n", "Current: 5.85 GB\n", - "Delta: -0.24 GB\n", + "Delta: -0.23 GB\n", "cleanup - optimizing memory consumption of internal STAM data structures\n", "Current: 5.85 GB\n", "Delta: -0.00 GB\n", @@ -1819,7 +1823,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -1836,13 +1840,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "16.307935730961617 s\n", "Current: 5.85 GB\n", "Delta: 0.00 GB\n" ] @@ -1850,7 +1855,9 @@ ], "source": [ "store.set_filename(f\"{workDir}/bhsa.json\")\n", - "store.save()\n", + "print(timeit.timeit(\n", + " store.save\n", + ", number=1),\"s\")\n", "memUsage()" ] }, @@ -1863,21 +1870,54 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ - "Current: 5.90 GB\n", - "Delta: 0.05 GB\n" + "5.578051328659058\n", + "Current: 5.85 GB\n", + "Delta: 0.00 GB\n" ] } ], "source": [ "store.set_filename(f\"{workDir}/bhsa.csv\")\n", + "t = time.time()\n", + "store.save()\n", + "print(time.time() - t, \"s\")\n", + "memUsage()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CBOR" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.369333267211914 s\n", + "Current: 8.46 GB\n", + "Delta: 0.00 GB\n" + ] + } + ], + "source": [ + "store.set_filename(f\"{workDir}/bhsa.store.stam.cbor\")\n", + "t = time.time()\n", "store.save()\n", + "print(time.time() - t, \"s\")\n", "memUsage()" ] }, @@ -1910,6 +1950,7 @@ "text": [ "Current: 0.07 GB\n", "Delta: 0.07 GB\n", + "84.40434789657593 s\n", "Current: 3.12 GB\n", "Delta: 3.05 GB\n" ] @@ -1919,11 +1960,14 @@ "import os\n", "import stam\n", "\n", + "import time\n", "from memutil import memUsage\n", "memUsage()\n", "\n", "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n", + "t = time.time()\n", "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.json\")\n", + "print(time.time() - t, \"s\")\n", "memUsage()" ] }, @@ -1945,10 +1989,49 @@ "name": "stdout", "output_type": "stream", "text": [ - "Current: 3.12 GB\n", + "Current: 0.08 GB\n", + "Delta: 0.01 GB\n", + "45.721251249313354 s\n", + "Current: 3.17 GB\n", + "Delta: 3.09 GB\n" + ] + } + ], + "source": [ + "import os\n", + "import stam\n", + "import time\n", + "from memutil import memUsage\n", + "memUsage()\n", + "\n", + "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n", + "t = time.time()\n", + "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n", + "print(time.time() - t, \"s\")\n", + "memUsage()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CBOR" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current: 8.46 GB\n", "Delta: 0.00 GB\n", - "Current: 4.42 GB\n", - "Delta: 1.29 GB\n" + "3.655712604522705 s\n", + "Current: 11.80 GB\n", + "Delta: 3.34 GB\n" ] } ], @@ -1960,7 +2043,9 @@ "memUsage()\n", "\n", "workDir = os.path.expanduser(\"~/github/ETCBC/bhsa/_temp/stam\")\n", - "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")\n", + "t = time.time()\n", + "store = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.cbor\")\n", + "print(time.time() - t, \"s\")\n", "memUsage()" ] }, @@ -1973,11 +2058,12 @@ "contender | load time (sec) | save time (sec) | mem usage (GB) | disk usage (GB)\n", "--- | --- | --- | --- | ---\n", "STAM | | | 3.05 | \n", - "STAM-JSON | ? | ? | | 6.8\n", - "STAM-CSV | ? | ? | | 1.2\n", + "STAM-JSON | 84.40 | 16.31 | | 6.8\n", + "STAM-CSV | 45.72 | 5.58 | | 1.2\n", + "STAM-CBOR | 3.66 | 2.37 | | 0.74\n", "TF | | | 2.7 |\n", - "TF text | 92 | | | 0.10\n", - "TF opt | 3 | | | 0.14\n", + "TF text | 92? | | | 0.10\n", + "TF opt | 3? 
| | | 0.14\n", "\n" ] }, From b2036eeb1d25bbb15c2b8e675deacc2cb61d7bb3 Mon Sep 17 00:00:00 2001 From: Maarten van Gompel Date: Wed, 9 Aug 2023 17:28:25 +0200 Subject: [PATCH 3/3] stam-nu: adapted to new API, removed old experiment (outdated now and no longer works with new API) --- programs/stam-nu.ipynb | 398 ++++++++--------------------------------- 1 file changed, 70 insertions(+), 328 deletions(-) diff --git a/programs/stam-nu.ipynb b/programs/stam-nu.ipynb index 19bcc48..ec22a29 100644 --- a/programs/stam-nu.ipynb +++ b/programs/stam-nu.ipynb @@ -2,11 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", + "import time\n", "from itertools import chain\n", "\n", "from tf.app import use\n", @@ -15,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -33,7 +34,7 @@ { "data": { "text/html": [ - "app: ~/github/ETCBC/bhsa/app" + "app: ~/github/ETCBC/bhsa/app" ], "text/plain": [ "" @@ -45,7 +46,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/bhsa/tf/2021" + "data: ~/github/ETCBC/bhsa/tf/2021" ], "text/plain": [ "" @@ -57,7 +58,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/phono/tf/2021" + "data: ~/github/ETCBC/phono/tf/2021" ], "text/plain": [ "" @@ -69,7 +70,7 @@ { "data": { "text/html": [ - "data: ~/github/ETCBC/parallels/tf/2021" + "data: ~/github/ETCBC/parallels/tf/2021" ], "text/plain": [ "" @@ -804,7 +805,7 @@ " \n", "\n", "\n", - " Settings:
specified
  1. apiVersion: 3
  2. appName: ETCBC/bhsa
  3. appPath: /Users/me/github/ETCBC/bhsa/app
  4. commit: no value
  5. css: ''
  6. dataDisplay:
    • exampleSectionHtml: <code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)
    • excludedFeatures:
      • g_uvf_utf8
      • g_vbs
      • kq_hybrid
      • languageISO
      • g_nme
      • lex0
      • is_root
      • g_vbs_utf8
      • g_uvf
      • dist
      • root
      • suffix_person
      • g_vbe
      • dist_unit
      • suffix_number
      • distributional_parent
      • kq_hybrid_utf8
      • crossrefSET
      • instruction
      • g_prs
      • lexeme_count
      • rank_occ
      • g_pfm_utf8
      • freq_occ
      • crossrefLCS
      • functional_parent
      • g_pfm
      • g_nme_utf8
      • g_vbe_utf8
      • kind
      • g_prs_utf8
      • suffix_gender
      • mother_object_type
    • noneValues:
      • absent
      • n/a
      • none
      • unknown
      • no value
      • NA
  7. docs:
    • docBase: {docRoot}/{repo}
    • docExt: ''
    • docPage: ''
    • docRoot: https://{org}.github.io
    • featurePage: 0_home
  8. interfaceDefaults: {}
  9. isCompatible: True
  10. local: clone
  11. localDir: /Users/me/github/ETCBC/bhsa/_temp
  12. provenanceSpec:
    • corpus: BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
    • doi: 10.5281/zenodo.1007624
    • moduleSpecs:
      • :
        • backend: no value
        • corpus: Phonetic Transcriptions
        • docUrl: https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
        • doi: 10.5281/zenodo.1007636
        • org: ETCBC
        • relative: /tf
        • repo: phono
      • :
        • backend: no value
        • corpus: Parallel Passages
        • docUrl: https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
        • doi: 10.5281/zenodo.1007642
        • org: ETCBC
        • relative: /tf
        • repo: parallels
    • org: ETCBC
    • relative: /tf
    • repo: bhsa
    • version: 2021
    • webBase: https://shebanq.ancient-data.org/hebrew
    • webHint: Show this on SHEBANQ
    • webLang: la
    • webLexId: True
    • webUrl: {webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: no value
  14. typeDisplay:
    • clause:
      • label: {typ} {rela}
      • style: ''
    • clause_atom:
      • hidden: True
      • label: {code}
      • level: 1
      • style: ''
    • half_verse:
      • hidden: True
      • label: {label}
      • style: ''
      • verselike: True
    • lex:
      • featuresBare: gloss
      • label: {voc_lex_utf8}
      • lexOcc: word
      • style: orig
      • template: {voc_lex_utf8}
    • phrase:
      • label: {typ} {function}
      • style: ''
    • phrase_atom:
      • hidden: True
      • label: {typ} {rela}
      • level: 1
      • style: ''
    • sentence:
      • label: {number}
      • style: ''
    • sentence_atom:
      • hidden: True
      • label: {number}
      • level: 1
      • style: ''
    • subphrase:
      • hidden: True
      • label: {number}
      • style: ''
    • word:
      • features: pdp vs vt
      • featuresBare: lex:gloss
  15. writing: hbo
\n" + " Settings:
specified
  1. apiVersion: 3
  2. appName: ETCBC/bhsa
  3. appPath: /home/proycon/github/ETCBC/bhsa/app
  4. commit: no value
  5. css: ''
  6. dataDisplay:
    • exampleSectionHtml: <code>Genesis 1:1</code> (use <a href=\"https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf\" target=\"_blank\">English book names</a>)
    • excludedFeatures:
      • g_uvf_utf8
      • g_vbs
      • kq_hybrid
      • languageISO
      • g_nme
      • lex0
      • is_root
      • g_vbs_utf8
      • g_uvf
      • dist
      • root
      • suffix_person
      • g_vbe
      • dist_unit
      • suffix_number
      • distributional_parent
      • kq_hybrid_utf8
      • crossrefSET
      • instruction
      • g_prs
      • lexeme_count
      • rank_occ
      • g_pfm_utf8
      • freq_occ
      • crossrefLCS
      • functional_parent
      • g_pfm
      • g_nme_utf8
      • g_vbe_utf8
      • kind
      • g_prs_utf8
      • suffix_gender
      • mother_object_type
    • noneValues:
      • absent
      • n/a
      • none
      • unknown
      • no value
      • NA
  7. docs:
    • docBase: {docRoot}/{repo}
    • docExt: ''
    • docPage: ''
    • docRoot: https://{org}.github.io
    • featurePage: 0_home
  8. interfaceDefaults: {}
  9. isCompatible: True
  10. local: clone
  11. localDir: /home/proycon/github/ETCBC/bhsa/_temp
  12. provenanceSpec:
    • corpus: BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
    • doi: 10.5281/zenodo.1007624
    • moduleSpecs:
      • :
        • backend: no value
        • corpus: Phonetic Transcriptions
        • docUrl: https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
        • doi: 10.5281/zenodo.1007636
        • org: ETCBC
        • relative: /tf
        • repo: phono
      • :
        • backend: no value
        • corpus: Parallel Passages
        • docUrl: https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
        • doi: 10.5281/zenodo.1007642
        • org: ETCBC
        • relative: /tf
        • repo: parallels
    • org: ETCBC
    • relative: /tf
    • repo: bhsa
    • version: 2021
    • webBase: https://shebanq.ancient-data.org/hebrew
    • webHint: Show this on SHEBANQ
    • webLang: la
    • webLexId: True
    • webUrl: {webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
    • webUrlLex: {webBase}/word?version={version}&id=<lid>
  13. release: no value
  14. typeDisplay:
    • clause:
      • label: {typ} {rela}
      • style: ''
    • clause_atom:
      • hidden: True
      • label: {code}
      • level: 1
      • style: ''
    • half_verse:
      • hidden: True
      • label: {label}
      • style: ''
      • verselike: True
    • lex:
      • featuresBare: gloss
      • label: {voc_lex_utf8}
      • lexOcc: word
      • style: orig
      • template: {voc_lex_utf8}
    • phrase:
      • label: {typ} {function}
      • style: ''
    • phrase_atom:
      • hidden: True
      • label: {typ} {rela}
      • level: 1
      • style: ''
    • sentence:
      • label: {number}
      • style: ''
    • sentence_atom:
      • hidden: True
      • label: {number}
      • level: 1
      • style: ''
    • subphrase:
      • hidden: True
      • label: {number}
      • style: ''
    • word:
      • features: pdp vs vt
      • featuresBare: lex:gloss
  15. writing: hbo
\n" ], "text/plain": [ "" @@ -1460,14 +1461,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 0.90s 15522 results\n" + " 0.82s 15522 results\n" ] } ], @@ -1486,7 +1487,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -1576,16 +1577,24 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1020241\n", + "0.2074108123779297 s\n" + ] + }, { "data": { "text/plain": [ "15522" ] }, - "execution_count": 19, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1593,6 +1602,8 @@ "source": [ "resultsH = []\n", "\n", + "t = time.time()\n", + "print(len(F.otype.data))\n", "for p in F.otype.s(\"phrase\"):\n", " ws = L.d(p, otype=\"word\")\n", " if len(ws) < 2:\n", @@ -1602,6 +1613,7 @@ " if F.nu.v(fi) != F.nu.v(la):\n", " continue\n", " resultsH.append((p, fi, la))\n", + "print(time.time() - t,\"s\")\n", "\n", "len(resultsH)" ] @@ -1615,7 +1627,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -1624,7 +1636,7 @@ "True" ] }, - "execution_count": 20, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1637,384 +1649,115 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## In STAM\n", - "\n", - "Challenges:\n", - "\n", - "### Find the first and last word in each phrase\n", - "\n", - "Given a phrase, we need to find its words.\n", - "Well, a phrase is an annotation with key `otype` and value `phrase` and target some word annotations.\n", - "These target annotations are easy to get, by means of the `annotations()` method on annotations.\n", - "\n", - "Then we have to find the first and last words among these targets.\n", - "\n", - "**That is currently difficult!**\n", - "\n", - "You need a concept of order between annotations.\n", - "One possibility is to put sequence numbers in the data, as annotations.\n", - "But that is very cumbersome, because you need to refer to yet another level of annotation.\n", - "And it will inflate the data.\n", - "\n", - "The other possibility is \"canonical ordering\".\n", - "Annotations that target the text can be ordered by their targets.\n", - "A target is a subset of textual positions. 
Two such subsets can be ordered as follows:\n", "\n", "* if A is a subset of B then B <= A\n", "* if B is a subset of A then A <= B\n", "* if A and B are not subsets of each other, the set that has the smallest element that does not belong to the other set is the smallest.\n", "\n", "As part of the index building, you could create the rank of each annotation in this ordering.\n", "\n", "Annotations that target annotations that are already canonically ordered can themselves be canonically ordered with respect to their targets.\n", "\n", "Without this, the user will need to implement sorting in ad-hoc ways.\n", "\n", "### Retrieve values for the first and last word\n", "\n", "Given the annotations for the first and last word in a phrase, we have to find annotations with key `nu`\n", "that target these words, and read off their value.\n", "\n", "**That is currently difficult!**\n", "\n", "A way out is this:\n", "\n", "As preparation, before looping through the phrases,\n", "make a dict that associates word annotation ids with nu-values:\n", "\n", "* retrieve all annotations that have key `nu`; for each annotation:\n", "* pick the target: it is a word annotation; use its id as the key in the dict\n", "* pick the data, and from that the value, and use that as the value in the dict\n", "\n", "Then, for each phrase with at least two words:\n", "\n", "* retrieve the first word and from there the nu-value for that word\n", "* retrieve the last word and from there the nu-value for that word\n", "* compare the two values. If they are equal, we have a hit.\n", "\n", "This can be improved if the API offers an efficient function to look up values.\n", "That could be a precomputation of all those dicts.\n", "\n", "Even better: those dicts could be the primary data!"
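In short, the workaround amounts to something like this (a sketch using only calls that occur elsewhere in this notebook; `wordAnnos` and `nuKey` are assumed to hold the word annotations and the `nu` key, as collected in the cells below):

```python
# rank the word annotations by their character offsets in the text
def getPos(anno):
    t = anno.textselections()[0]  # a word targets one contiguous text selection
    return (t.begin(), t.end())

wordRank = {a.id(): i for (i, a) in enumerate(sorted(wordAnnos, key=getPos))}

# index the `nu` values by the id of the word annotation they target
nuValue = {}
for nuAnno in nuKey.annotations():
    word = list(nuAnno.annotations())[0]  # the word annotation it targets
    nuValue[word.id()] = nuAnno.data()[0].value()
```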
+ "## In STAM\n" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current: 3.01 GB\n", - "Delta: 3.01 GB\n" + "0.3.0\n", + "Current: 2.83 GB\n", + "Delta: 2.83 GB\n" ] } ], "source": [ "import stam\n", + "print(stam.VERSION)\n", "\n", "from memutil import memUsage\n", "memUsage()\n", "\n", "workDir = f\"{A.tempDir}/stam\"\n", - "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "aDataSet = list(storeC.annotationsets())[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "def stamOtype(otype):\n", - " otypeData = aDataSet.find_data(\"otype\", otype)\n", - " otypeAnnos = otypeData[0].annotations()\n", - " return otypeAnnos\n", - "\n", - "def idsOf(annos):\n", - " return {a.id() for a in annos}" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "# get the word annotations, sorted, and the phrase annotations\n", - "\n", - "def getPos(wordAnno):\n", - " t = wordAnno.textselections()[0]\n", - " return (t.begin(), t.end())\n", - "\n", - "wordAnnos = stamOtype(\"word\")\n", - "wordIds = idsOf(wordAnnos)\n", - "phraseAnnos = stamOtype(\"phrase\")" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "wordAnnos = sorted(wordAnnos, key=getPos)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "# make a rank of the word annos\n", - "\n", - "wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}" + "storeC = stam.AnnotationStore(file=f\"{workDir}/bhsa.store.stam.cbor\")" ] }, { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "78754" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get the phrase annotations together with their first and last word\n", - "\n", - "phrases = []\n", - "\n", - "for pAnno in phraseAnnos:\n", - " words = pAnno.annotations()\n", - " if len(words) < 2:\n", - " continue\n", - " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n", - " phrases.append((p, words[0], words[-1]))\n", - "\n", - "len(phrases)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0.90s 78754 results\n" - ] - } - ], - "source": [ - "# intermediate check with TF\n", - "\n", - "query = \"\"\"\n", - "phrase\n", - " =: word\n", - " # word\n", - " :=\n", - "\"\"\"\n", - "results = A.search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# get the `nu` information ready\n", - "# we collect a dict keyed by word id with values the grammatical number of the word\n", - "\n", - "nuKey = aDataSet.key(\"nu\")\n", - "nuAnnos = nuKey.annotations()\n", - "\n", - "nuValue = {}\n", - "\n", - "for nuAnno in nuAnnos:\n", - " value = nuAnno.data()[0].value()\n", - " word = list(nuAnno.annotations())[0]\n", - "\n", - " nuValue[word.id()] = value\n" + "With the new higher-order API, all this can be done in one go similar to the manually coded TF fragment:" ] }, { "cell_type": "code", - 
"execution_count": 52, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "בְּ NA\n", - "רֵאשִׁ֖ית sg\n", - "בָּרָ֣א sg\n", - "אֱלֹהִ֑ים pl\n", - "אֵ֥ת NA\n", - "הַ NA\n", - "שָּׁמַ֖יִם pl\n", - "וְ NA\n", - "אֵ֥ת NA\n", - "הָ NA\n", - "אָֽרֶץ00 sg\n" + "1.9822394847869873 s\n" ] - } - ], - "source": [ - "# check some values\n", - "\n", - "for wordAnno in wordAnnos[0:11]:\n", - " print(f\"{wordAnno} {nuValue[wordAnno.id()]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So far so good!" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ + }, { "data": { "text/plain": [ "15522" ] }, - "execution_count": 53, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# now compute the final result\n", + "import time\n", "\n", - "resultsSTAM = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n", - "len(resultsSTAM)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Perfect!\n", - "\n", - "## Now in one go\n", + "t = time.time()\n", + "dataset = storeC.dataset(\"features\")\n", + "key_otype = dataset.key(\"otype\")\n", + "key_nu = dataset.key(\"nu\")\n", + "results = []\n", + "for phrase in key_otype.data(value=\"phrase\").annotations():\n", + " words = phrase.annotations_in_targets(filter=key_otype, value=\"word\")\n", + " if len(words) < 2:\n", + " continue\n", + " firstword = words[0]\n", + " lastword = words[-1]\n", + " for annotation in firstword.annotations(filter=key_nu, limit=1): #forgetting the limit would lead to a big performance penalty!\n", + " data = annotation.data(filter=key_nu, limit=1)\n", + " if lastword.test_annotations(filter=data):\n", + " results.append((phrase,firstword,lastword))\n", + "print(time.time() - t,\"s\")\n", "\n", - "In order to see the performance, let's do this again in one go." 
+ "len(results)" ] }, { - "cell_type": "code", - "execution_count": 54, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# The complete task in one go\n", - "\n", - "def getNicePhrases():\n", - " aDataSet = list(storeC.annotationsets())[0]\n", - " wordAnnos = sorted(stamOtype(\"word\"), key=getPos)\n", - " wordIds = idsOf(wordAnnos)\n", - " wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}\n", - "\n", - " phraseAnnos = stamOtype(\"phrase\")\n", - " phrases = []\n", - "\n", - " for pAnno in phraseAnnos:\n", - " words = pAnno.annotations()\n", - " if len(words) < 2:\n", - " continue\n", - " sortedWords = sorted(words, key=lambda x: wordRank[x.id()])\n", - " phrases.append((p, words[0], words[-1]))\n", + "The execution times for this task on my machine were (the STAM rust experiments are not in this notebook but external in https://github.com/knaw-huc/stam-experiments/blob/main/exp5/src/main.rs):\n", "\n", - " nuKey = aDataSet.key(\"nu\")\n", - " nuAnnos = nuKey.annotations()\n", - "\n", - " nuValue = {}\n", - "\n", - " for nuAnno in nuAnnos:\n", - " value = nuAnno.data()[0].value()\n", - " word = list(nuAnno.annotations())[0]\n", - "\n", - " nuValue[word.id()] = value\n", - "\n", - " results = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]\n", - " print(len(results))\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15522\n" - ] - } - ], - "source": [ - "resultsSTAM = getNicePhrases()" + "TF query | TF hand coding | STAM Python | STAM Rust | STAM Rust parallel (32 cores)\n", + "--- | --- | --- | --- | ----\n", + "0.9 | 0.2 | 1.98 | 0.6 | 0.06\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The execution times for this task were\n", - "\n", - "TF query | TF hand coding | STAM\n", - "--- | --- | ---\n", - "0.9 | 0.2 | 2.0\n", + "We see that performance in STAM Python is significantly worse than the TF version. This is due to several things:\n", "\n", - "But in STAM we can move quite a bit out of the task:\n", - "\n", - "1. sorting the words should be taken care of during the index building/loading when loading the STAM dataset (saves 0.3 sec)\n", - "2. retrieving the `nu` value should be optimized (could save 0.9 sec)" + "1. The STAM Python API adds significant overhead, in part inevitable, but may improve a bit with further optimisation in the future.\n", + "2. The data is modelled in in a way that fits TF perfectly, TF is able to take some shortcuts here that aren't available in a STAM model. (STAM has an explicit notion of annotation, TF maps data more directly to targets)\n" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "bhsa-env", "language": "python", - "name": "python3" + "name": "bhsa-env" }, "language_info": { "codemirror_mode": { @@ -2026,10 +1769,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" - }, - "orig_nbformat": 4 + "version": "3.11.5" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }