From a1cdfa74caa5e4549ed8e03de521ff0b8b5a8e23 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 15:55:47 +0100 Subject: [PATCH 01/17] Example of querying the Parquet Metadata --- db/parquet-info.sql | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 db/parquet-info.sql diff --git a/db/parquet-info.sql b/db/parquet-info.sql new file mode 100644 index 0000000..987ced9 --- /dev/null +++ b/db/parquet-info.sql @@ -0,0 +1,3 @@ +-- What's in the Parquet Files? +SELECT * FROM parquet_metadata('doc-ok.parquet') WHERE row_group_id = 0; +SELECT * FROM parquet_metadata('doc-ok_field_mapping.parquet'); From 69d8035f13acda83796650e1849f1ab819453a0c Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 15:57:38 +0100 Subject: [PATCH 02/17] Just show the Field Mapping --- db/fields.sql | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 db/fields.sql diff --git a/db/fields.sql b/db/fields.sql new file mode 100644 index 0000000..3c11179 --- /dev/null +++ b/db/fields.sql @@ -0,0 +1,4 @@ +-- +-- Display the field mapping +-- +select * from 'msmarco_doc_00_field_mapping.parquet'; From 2be399364e000242e10ca73e3b1c05530cc0aeba Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 15:58:54 +0100 Subject: [PATCH 03/17] Example commands using DuckDB over the MD phase output --- db/README.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 db/README.md diff --git a/db/README.md b/db/README.md new file mode 100644 index 0000000..3e492b5 --- /dev/null +++ b/db/README.md @@ -0,0 +1,37 @@ +# DuckDB experiments + +## Preliminaries + +1. +DuckDB has been compiled from source and installed. + +2. +Mention detection has run and created parquet files. + +3. +The parquet files are stored in `/export/data2/tmp`, +or another location specified as working directory +in `md.init`. 
+ +## CLI + +### Sanity check + + duckdb md.init < t0.sql + duckdb md.init < + +### Another sanity check + +Copy output over: + + scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp + + duckdb md.init < t2.sql + +### Create database + +We want a representation for querying that is less elaborate than all the string values. + + duckdb md.init < prepare-md.sql + + From 133b67cdc852cf7bcd61cbcc706837602c610ff9 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 18:46:06 +0100 Subject: [PATCH 04/17] Choose useful information from everything. --- db/parquet-info.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/parquet-info.sql b/db/parquet-info.sql index 987ced9..6ac5149 100644 --- a/db/parquet-info.sql +++ b/db/parquet-info.sql @@ -1,3 +1,3 @@ -- What's in the Parquet Files? -SELECT * FROM parquet_metadata('doc-ok.parquet') WHERE row_group_id = 0; -SELECT * FROM parquet_metadata('doc-ok_field_mapping.parquet'); +SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00.parquet') WHERE row_group_id = 0; +SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet') WHERE row_group_id = 0; From 7fc53a1c4455de209352c0aa474d189a50f2bdc3 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 18:47:01 +0100 Subject: [PATCH 05/17] A few initial tests --- db/t.init | 3 +++ db/t.py | 6 ++++++ db/t.sql | 5 +++++ db/t0.sql | 5 +++++ db/t1.sql | 5 +++++ db/t2.sql | 7 +++++++ 6 files changed, 31 insertions(+) create mode 100644 db/t.init create mode 100644 db/t.py create mode 100644 db/t.sql create mode 100644 db/t0.sql create mode 100644 db/t1.sql create mode 100644 db/t2.sql diff --git a/db/t.init b/db/t.init new file mode 100644 index 0000000..7dd971b --- /dev/null +++ b/db/t.init @@ -0,0 +1,3 @@ +.print Change to data dir. +.cd /export/data2/tmp +.print Ready to roll! 
diff --git a/db/t.py b/db/t.py new file mode 100644 index 0000000..ebc2330 --- /dev/null +++ b/db/t.py @@ -0,0 +1,6 @@ +import duckdb + +print(duckdb.query(''' +SELECT * +FROM 'doc-error.parquet' +''').fetchall()) diff --git a/db/t.sql b/db/t.sql new file mode 100644 index 0000000..f501d7c --- /dev/null +++ b/db/t.sql @@ -0,0 +1,5 @@ +select text, tag, count(*) as ef +from 'doc-ok.parquet' +group by text, tag +order by ef desc +limit 10; diff --git a/db/t0.sql b/db/t0.sql new file mode 100644 index 0000000..ef5f218 --- /dev/null +++ b/db/t0.sql @@ -0,0 +1,5 @@ +-- First ten rows +SELECT * -- identifier, text, tag +FROM 'doc-ok.parquet' +WHERE field=2 +LIMIT 10; diff --git a/db/t1.sql b/db/t1.sql new file mode 100644 index 0000000..f501d7c --- /dev/null +++ b/db/t1.sql @@ -0,0 +1,5 @@ +select text, tag, count(*) as ef +from 'doc-ok.parquet' +group by text, tag +order by ef desc +limit 10; diff --git a/db/t2.sql b/db/t2.sql new file mode 100644 index 0000000..0f4b78d --- /dev/null +++ b/db/t2.sql @@ -0,0 +1,7 @@ +-- Do we get the same results? +select text, tag, count(*) as ef +from 'msmarco_doc_00.parquet' +where identifier='msmarco_doc_00_21381293' +group by text, tag +order by ef desc +limit 10; From dc359c91dc8eefa172a4ffa084cdbc1e1bf0d1da Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 18:48:00 +0100 Subject: [PATCH 06/17] Recode MD output for more efficient further processing. 
--- db/md.init | 2 + db/prepare-md.sql | 99 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 db/md.init create mode 100644 db/prepare-md.sql diff --git a/db/md.init b/db/md.init new file mode 100644 index 0000000..b9f206f --- /dev/null +++ b/db/md.init @@ -0,0 +1,2 @@ +.cd /export/data2/tmp +.open msmarco-doc-00.duckdb diff --git a/db/prepare-md.sql b/db/prepare-md.sql new file mode 100644 index 0000000..16726f7 --- /dev/null +++ b/db/prepare-md.sql @@ -0,0 +1,99 @@ +-- +-- Transform MD tables +-- + +-- +-- Define types +-- + +BEGIN TRANSACTION; + +-- Fields ("field", "tag") should be ENUMs +CREATE TYPE tags AS ENUM ('PER','LOC','ORG', 'MISC'); +CREATE TYPE fields AS ENUM ('title','headings','body'); + +-- MD field conversion table +-- Read field values from the field_mappings.parquet file +CREATE TABLE fielddict ( + id TINYINT, + field fields +); +INSERT INTO fielddict +SELECT stats_min_value, path_in_schema +FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet'); + +COMMIT; + +-- +-- Create the Document Data Dictionary +-- + +BEGIN TRANSACTION; + +-- Data Dictionary +CREATE TABLE dict( + cpart UTINYINT, + docid UINTEGER, + identifier VARCHAR, + nent USMALLINT, + PRIMARY KEY(cpart, docid) +); + +-- Create the document identifiers (from the Parquet File) +INSERT INTO dict +SELECT + cpdocid[0]::UTINYINT AS cpart, + cpdocid[1]::UINTEGER AS docid, + identifier, + count(*) AS nent +FROM + (SELECT + string_split(replace(identifier,'msmarco_doc_',''),'_') as cpdocid, + identifier + FROM 'msmarco_doc_00.parquet' + ) +GROUP BY cpart, docid, identifier +ORDER BY docid; + +COMMIT; + +-- +-- The Document-Entity Table +-- + +BEGIN TRANSACTION; + +-- Document Entity +CREATE TABLE doc( + cpart UTINYINT, + docid UINTEGER, + field fields, + text VARCHAR(127), + start_pos UINTEGER, + end_pos UINTEGER, + score DOUBLE, + tag tags +); + +-- +-- Load and recode the data: +-- +-- + Document identifiers are looked up in the data 
dictionary +-- + Field and tag are mapped to their ENUM types +-- +INSERT INTO doc +SELECT + d.cpart, + d.docid, + fd.field, + docs.text, + docs.start_pos, + docs.end_pos, + docs.score, + docs.tag +FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd +WHERE + d.identifier = docs.identifier + AND fd.id = docs.field; + +COMMIT; From 1bb0967d130b80fad3a7df7b3c30e4e7604cb79d Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 18:49:35 +0100 Subject: [PATCH 07/17] Sanity check - should give the same output as t2.sql (for tuples with freq > 1) --- db/md-t2.sql | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 db/md-t2.sql diff --git a/db/md-t2.sql b/db/md-t2.sql new file mode 100644 index 0000000..f645957 --- /dev/null +++ b/db/md-t2.sql @@ -0,0 +1,7 @@ +-- Do we get the same results? +select text, tag, count(*) as ef +from doc +where docid=21381293 +group by text, tag +order by ef desc +limit 10; From 59f07b1e3d58013bdc6c43b597ff41b58afa5d8b Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 19:06:56 +0100 Subject: [PATCH 08/17] Explanation of directory --- db/README.md | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/db/README.md b/db/README.md index 3e492b5..4a0f7d1 100644 --- a/db/README.md +++ b/db/README.md @@ -17,21 +17,44 @@ in `md.init`. ### Sanity check +First, take the MD output on a single MS Marco V2 document +(`msmarco_doc_00_21381293`, more or less randomly selected, +in my tests called `doc-ok`). + +Read the field metadata: + + duckdb md.init < fields.sql + +Read 10 rows from the document: + duckdb md.init < t0.sql - duckdb md.init < + +Query `t1.sql` gives a few rows of entity frequency information. 
### Another sanity check -Copy output over: +Now copy the batch MD output from `tusi`: scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp +Test query `t2.sql` should give the same output as above, +but now reads from the Parquet file generated for the full batch (`00`). + duckdb md.init < t2.sql ### Create database -We want a representation for querying that is less elaborate than all the string values. +We want a representation for querying that is less elaborate than all those string values, +pretty much preferred by our JSON scripting friends, but not ideal for SQL processing. duckdb md.init < prepare-md.sql +The transformation took ~3½ seconds on my home machine for part `00`. + +### Test queries + +Query `md-t2.sql` should give the same output as `t2.sql` and `t1.sql`. + +TBC + From 7aa7425680cc553b6a11150befc5a7a12ba7265b Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 19:07:23 +0100 Subject: [PATCH 09/17] Moved to other files --- db/t.sql | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 db/t.sql diff --git a/db/t.sql b/db/t.sql deleted file mode 100644 index f501d7c..0000000 --- a/db/t.sql +++ /dev/null @@ -1,5 +0,0 @@ -select text, tag, count(*) as ef -from 'doc-ok.parquet' -group by text, tag -order by ef desc -limit 10; From 56fa0bf7e1a6bd76cb78bc65d5bb11947ebb28a6 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 19:08:09 +0100 Subject: [PATCH 10/17] Deleted because I switched to using the CLI --- db/t.py | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 db/t.py diff --git a/db/t.py b/db/t.py deleted file mode 100644 index ebc2330..0000000 --- a/db/t.py +++ /dev/null @@ -1,6 +0,0 @@ -import duckdb - -print(duckdb.query(''' -SELECT * -FROM 'doc-error.parquet' -''').fetchall()) From d3412687893a6b1533fe8cf334d25cff005ea3cf Mon Sep 17 00:00:00 2001 From: "Arjen P. 
de Vries" Date: Mon, 3 Jan 2022 19:08:48 +0100 Subject: [PATCH 11/17] A very minimal python script - not really used yet because the CLI suffices thus far. --- db/test.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 db/test.py diff --git a/db/test.py b/db/test.py new file mode 100644 index 0000000..007a496 --- /dev/null +++ b/db/test.py @@ -0,0 +1,6 @@ +import duckdb + +print(duckdb.query(''' +SELECT * +FROM 'doc-ok.parquet' +''').fetchall()) From 3b7b05dc341835a01dd40f555aefb9efd65fee01 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Mon, 3 Jan 2022 19:09:33 +0100 Subject: [PATCH 12/17] Slightly more robust version (but not sufficient on a GPU-less machine). --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2172238..6fe0962 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ pandas pyarrow flair syntok +-f https://download.pytorch.org/whl/torch_stable.html torch>=1.5.0,!=1.8.* From f997eb97236090d893cce17c350449c8e32614d0 Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Tue, 4 Jan 2022 01:33:42 +0100 Subject: [PATCH 13/17] Introduce entity dictionary based on entity mentions from MD --- db/md-t2.sql | 10 ++++++---- db/md-t2b.sql | 7 +++++++ db/prepare-md.sql | 30 ++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 db/md-t2b.sql diff --git a/db/md-t2.sql b/db/md-t2.sql index f645957..5327cab 100644 --- a/db/md-t2.sql +++ b/db/md-t2.sql @@ -1,7 +1,9 @@ -- Do we get the same results? 
-select text, tag, count(*) as ef -from doc -where docid=21381293 -group by text, tag +SELECT edict.e, doc.tag, count(doc.e) as ef +FROM doc, edict +WHERE + docid=21381293 + AND doc.e = edict.eid +group by doc.e, edict.e, doc.tag order by ef desc limit 10; diff --git a/db/md-t2b.sql b/db/md-t2b.sql new file mode 100644 index 0000000..966e86f --- /dev/null +++ b/db/md-t2b.sql @@ -0,0 +1,7 @@ +-- Without joining the dictionary +SELECT e, tag, count(e) as ef +FROM doc +WHERE docid=21381293 +group by e, tag +order by ef desc +limit 10; diff --git a/db/prepare-md.sql b/db/prepare-md.sql index 16726f7..cb9e23a 100644 --- a/db/prepare-md.sql +++ b/db/prepare-md.sql @@ -2,6 +2,9 @@ -- Transform MD tables -- +-- Does the progress bar work? +SET enable_progress_bar=true; + -- -- Define types -- @@ -57,6 +60,24 @@ ORDER BY docid; COMMIT; +-- +-- Entity dictionary +-- +BEGIN TRANSACTION; + +CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER); + +INSERT INTO edict +SELECT row_number() OVER (), text, ef +FROM + (SELECT text, count(*) as ef + FROM 'msmarco_doc_00.parquet' + GROUP by text + ORDER by ef DESC + ); + +COMMIT; + -- -- The Document-Entity Table -- @@ -68,7 +89,7 @@ CREATE TABLE doc( cpart UTINYINT, docid UINTEGER, field fields, - text VARCHAR(127), + e UINTEGER, start_pos UINTEGER, end_pos UINTEGER, score DOUBLE, @@ -86,14 +107,15 @@ SELECT d.cpart, d.docid, fd.field, - docs.text, + ed.eid, docs.start_pos, docs.end_pos, docs.score, docs.tag -FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd +FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd, edict ed WHERE d.identifier = docs.identifier - AND fd.id = docs.field; + AND fd.id = docs.field + AND ed.e = docs.text; COMMIT; From 1d48557db62bfe7bad0241dbc49d5586c7ede4ff Mon Sep 17 00:00:00 2001 From: "Arjen P. 
de Vries" Date: Tue, 4 Jan 2022 01:34:27 +0100 Subject: [PATCH 14/17] Original script to test creating edict; now integrated in prepare-md.sql --- db/edict.sql | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 db/edict.sql diff --git a/db/edict.sql b/db/edict.sql new file mode 100644 index 0000000..f0b24dd --- /dev/null +++ b/db/edict.sql @@ -0,0 +1,20 @@ +-- Does the progress bar work? +SET enable_progress_bar=true; + +-- +-- Entity dictionary +-- +BEGIN TRANSACTION; + +CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER); + +INSERT INTO edict +SELECT row_number() OVER (), text, ef +FROM + (SELECT text, count(*) as ef + FROM 'msmarco_doc_00.parquet' + GROUP by text + ORDER by ef DESC + ); + +COMMIT; From 95fb5ad422940c35bd97e0bad621310ac1f1e4ab Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Tue, 4 Jan 2022 01:56:09 +0100 Subject: [PATCH 15/17] Also create indices on entity text --- db/prepare-md.sql | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/db/prepare-md.sql b/db/prepare-md.sql index cb9e23a..86e48f3 100644 --- a/db/prepare-md.sql +++ b/db/prepare-md.sql @@ -2,8 +2,7 @@ -- Transform MD tables -- --- Does the progress bar work? -SET enable_progress_bar=true; +-- SET enable_progress_bar=true; -- -- Define types @@ -65,7 +64,10 @@ COMMIT; -- BEGIN TRANSACTION; -CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER); +CREATE TABLE edict( + eid UINTEGER PRIMARY KEY, + e VARCHAR, + ef UINTEGER); INSERT INTO edict SELECT row_number() OVER (), text, ef @@ -76,6 +78,8 @@ FROM ORDER by ef DESC ); +CREATE INDEX e_idx ON edict(e); + COMMIT; -- @@ -118,4 +122,6 @@ WHERE AND fd.id = docs.field AND ed.e = docs.text; +CREATE INDEX de_idx ON doc(e); + COMMIT; From a7bd90375ba9fa07f26a64939d5e62f04bc6aa6e Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Tue, 4 Jan 2022 02:05:51 +0100 Subject: [PATCH 16/17] Indexes do not yet persist, so pretty useless to create them in preparation stage. 
--- db/prepare-md.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/prepare-md.sql b/db/prepare-md.sql index 86e48f3..f3c535a 100644 --- a/db/prepare-md.sql +++ b/db/prepare-md.sql @@ -78,7 +78,8 @@ FROM ORDER by ef DESC ); -CREATE INDEX e_idx ON edict(e); +-- Would be useful if index persisted: +-- CREATE INDEX e_idx ON edict(e); COMMIT; @@ -122,6 +123,7 @@ WHERE AND fd.id = docs.field AND ed.e = docs.text; -CREATE INDEX de_idx ON doc(e); +-- Would be useful if the index persisted... +-- CREATE INDEX de_idx ON doc(e); COMMIT; From 5ff2d12939880a52816f803d2529b9055e4b937c Mon Sep 17 00:00:00 2001 From: "Arjen P. de Vries" Date: Tue, 4 Jan 2022 02:16:32 +0100 Subject: [PATCH 17/17] A few queries getting most frequent entity mentions in different ways. --- db/md-t3.sql | 11 +++++++++++ db/md-t3b.sql | 7 +++++++ db/md-t3c.sql | 5 +++++ db/t3.sql | 7 +++++++ 4 files changed, 30 insertions(+) create mode 100644 db/md-t3.sql create mode 100644 db/md-t3b.sql create mode 100644 db/md-t3c.sql create mode 100644 db/t3.sql diff --git a/db/md-t3.sql b/db/md-t3.sql new file mode 100644 index 0000000..2d51e87 --- /dev/null +++ b/db/md-t3.sql @@ -0,0 +1,11 @@ +-- Ten most frequent entities + +-- building the index does not pay off for a single query +-- create index ex on edict(eid); + +select edict.e, count(doc.e) as ef +from doc, edict +where field='body' AND doc.e = eid +group by doc.e, edict.e +order by ef desc +limit 10; diff --git a/db/md-t3b.sql b/db/md-t3b.sql new file mode 100644 index 0000000..d3d5a4b --- /dev/null +++ b/db/md-t3b.sql @@ -0,0 +1,7 @@ +-- Ten most frequent entities +select e, count(*) as ef +from doc +where field='body' +group by e +order by ef desc +limit 10; diff --git a/db/md-t3c.sql b/db/md-t3c.sql new file mode 100644 index 0000000..7bd17f3 --- /dev/null +++ b/db/md-t3c.sql @@ -0,0 +1,5 @@ +-- Ten most frequent entities +select edict.e, edict.ef +from edict +order by ef desc +limit 10; diff --git a/db/t3.sql b/db/t3.sql 
new file mode 100644 index 0000000..70426ae --- /dev/null +++ b/db/t3.sql @@ -0,0 +1,7 @@ +-- Ten most frequent entities +select text, count(*) as ef +from 'msmarco_doc_00.parquet' +where field=2 +group by text +order by ef desc +limit 10;