diff --git a/db/README.md b/db/README.md
new file mode 100644
index 0000000..4a0f7d1
--- /dev/null
+++ b/db/README.md
@@ -0,0 +1,60 @@
+# DuckDB experiments
+
+## Preliminaries
+
+1.
+DuckDB has been compiled from source and installed.
+
+2.
+Mention detection has run and created parquet files.
+
+3.
+The parquet files are stored in `/export/data2/tmp`,
+or another location specified as working directory
+in `md.init`.
+
+## CLI
+
+### Sanity check
+
+First, take the MD output on a single MS Marco V2 document
+(`msmarco_doc_00_21381293`, more or less randomly selected,
+in my tests called `doc-ok`).
+
+Read the field metadata:
+
+    duckdb md.init < fields.sql
+
+Read 10 rows from the document:
+
+    duckdb md.init < t0.sql
+
+Query `t1.sql` gives a few rows of entity frequency information.
+
+### Another sanity check
+
+Now copy the batch MD output from `tusi`:
+
+    scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp
+
+Test query `t2.sql` should give the same output as above,
+but now reads from the Parquet file generated for the full batch (`00`).
+
+    duckdb md.init < t2.sql
+
+### Create database
+
+We want a representation for querying that is less elaborate than all those string values,
+pretty much preferred by our JSON scripting friends, but not ideal for SQL processing.
+
+    duckdb md.init < prepare-md.sql
+
+The transformation took ~3½ seconds on my home machine for part `00`.
+
+### Test queries
+
+Query `md-t2.sql` should give the same output as `t2.sql` and `t1.sql`.
+
+TBC
+
+
diff --git a/db/edict.sql b/db/edict.sql
new file mode 100644
index 0000000..f0b24dd
--- /dev/null
+++ b/db/edict.sql
@@ -0,0 +1,20 @@
+-- Does the progress bar work?
+SET enable_progress_bar=true;
+
+--
+-- Entity dictionary
+--
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER);
+
+INSERT INTO edict
+SELECT row_number() OVER (), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+   ORDER by ef DESC
+  );
+
+COMMIT;
diff --git a/db/fields.sql b/db/fields.sql
new file mode 100644
index 0000000..3c11179
--- /dev/null
+++ b/db/fields.sql
@@ -0,0 +1,4 @@
+--
+-- Display the field mapping
+--
+select * from 'msmarco_doc_00_field_mapping.parquet';
diff --git a/db/md-t2.sql b/db/md-t2.sql
new file mode 100644
index 0000000..5327cab
--- /dev/null
+++ b/db/md-t2.sql
@@ -0,0 +1,9 @@
+-- Do we get the same results?
+SELECT edict.e, doc.tag, count(doc.e) as ef
+FROM doc, edict
+WHERE
+  docid=21381293
+  AND doc.e = edict.eid
+group by doc.e, edict.e, doc.tag
+order by ef desc
+limit 10;
diff --git a/db/md-t2b.sql b/db/md-t2b.sql
new file mode 100644
index 0000000..966e86f
--- /dev/null
+++ b/db/md-t2b.sql
@@ -0,0 +1,7 @@
+-- Without joining the dictionary
+SELECT e, tag, count(e) as ef
+FROM doc
+WHERE docid=21381293
+group by e, tag
+order by ef desc
+limit 10;
diff --git a/db/md-t3.sql b/db/md-t3.sql
new file mode 100644
index 0000000..2d51e87
--- /dev/null
+++ b/db/md-t3.sql
@@ -0,0 +1,11 @@
+-- Ten most frequent entities
+
+-- building the index does not pay off for a single query
+-- create index ex on edict(eid);
+
+select edict.e, count(doc.e) as ef
+from doc, edict
+where field='body' AND doc.e = eid
+group by doc.e, edict.e
+order by ef desc
+limit 10;
diff --git a/db/md-t3b.sql b/db/md-t3b.sql
new file mode 100644
index 0000000..d3d5a4b
--- /dev/null
+++ b/db/md-t3b.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent entities
+select e, count(*) as ef
+from doc
+where field='body'
+group by e
+order by ef desc
+limit 10;
diff --git a/db/md-t3c.sql b/db/md-t3c.sql
new file mode 100644
index 0000000..7bd17f3
--- /dev/null
+++ b/db/md-t3c.sql
@@ -0,0 +1,5 @@
+-- Ten most frequent entities
+select edict.e, edict.ef
+from edict
+order by ef desc
+limit 10;
diff --git a/db/md.init b/db/md.init
new file mode 100644
index 0000000..b9f206f
--- /dev/null
+++ b/db/md.init
@@ -0,0 +1,2 @@
+.cd /export/data2/tmp
+.open msmarco-doc-00.duckdb
diff --git a/db/parquet-info.sql b/db/parquet-info.sql
new file mode 100644
index 0000000..6ac5149
--- /dev/null
+++ b/db/parquet-info.sql
@@ -0,0 +1,3 @@
+-- What's in the Parquet Files?
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00.parquet') WHERE row_group_id = 0;
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet') WHERE row_group_id = 0;
diff --git a/db/prepare-md.sql b/db/prepare-md.sql
new file mode 100644
index 0000000..f3c535a
--- /dev/null
+++ b/db/prepare-md.sql
@@ -0,0 +1,129 @@
+--
+-- Transform MD tables
+--
+
+-- SET enable_progress_bar=true;
+
+--
+-- Define types
+--
+
+BEGIN TRANSACTION;
+
+-- Fields ("field", "tag") should be ENUMs
+CREATE TYPE tags AS ENUM ('PER','LOC','ORG', 'MISC');
+CREATE TYPE fields AS ENUM ('title','headings','body');
+
+-- MD field conversion table
+-- Read field values from the field_mappings.parquet file
+CREATE TABLE fielddict (
+  id TINYINT,
+  field fields
+);
+INSERT INTO fielddict
+SELECT stats_min_value, path_in_schema
+FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet');
+
+COMMIT;
+
+--
+-- Create the Document Data Dictionary
+--
+
+BEGIN TRANSACTION;
+
+-- Data Dictionary
+CREATE TABLE dict(
+  cpart UTINYINT,
+  docid UINTEGER,
+  identifier VARCHAR,
+  nent USMALLINT,
+  PRIMARY KEY(cpart, docid)
+);
+
+-- Create the document identifiers (from the Parquet File)
+INSERT INTO dict
+SELECT
+  cpdocid[0]::UTINYINT AS cpart,
+  cpdocid[1]::UINTEGER AS docid,
+  identifier,
+  count(*) AS nent
+FROM
+  (SELECT
+     string_split(replace(identifier,'msmarco_doc_',''),'_') as cpdocid,
+     identifier
+   FROM 'msmarco_doc_00.parquet'
+  )
+GROUP BY cpart, docid, identifier
+ORDER BY docid;
+
+COMMIT;
+
+--
+-- Entity dictionary
+--
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(
+  eid UINTEGER PRIMARY KEY,
+  e VARCHAR,
+  ef UINTEGER);
+
+INSERT INTO edict
+SELECT row_number() OVER (), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+   ORDER by ef DESC
+  );
+
+-- Would be useful if index persisted:
+-- CREATE INDEX e_idx ON edict(e);
+
+COMMIT;
+
+--
+-- The Document-Entity Table
+--
+
+BEGIN TRANSACTION;
+
+-- Document Entity
+CREATE TABLE doc(
+  cpart UTINYINT,
+  docid UINTEGER,
+  field fields,
+  e UINTEGER,
+  start_pos UINTEGER,
+  end_pos UINTEGER,
+  score DOUBLE,
+  tag tags
+);
+
+--
+-- Load and recode the data:
+--
+-- + Document identifiers are looked up in the data dictionary
+-- + Field and tag are mapped to their ENUM types
+--
+INSERT INTO doc
+SELECT
+  d.cpart,
+  d.docid,
+  fd.field,
+  ed.eid,
+  docs.start_pos,
+  docs.end_pos,
+  docs.score,
+  docs.tag
+FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd, edict ed
+WHERE
+  d.identifier = docs.identifier
+  AND fd.id = docs.field
+  AND ed.e = docs.text;
+
+-- Would be useful if the index persisted...
+-- CREATE INDEX de_idx ON doc(e);
+
+COMMIT;
diff --git a/db/t.init b/db/t.init
new file mode 100644
index 0000000..7dd971b
--- /dev/null
+++ b/db/t.init
@@ -0,0 +1,3 @@
+.print Change to data dir.
+.cd /export/data2/tmp
+.print Ready to roll!
diff --git a/db/t0.sql b/db/t0.sql
new file mode 100644
index 0000000..ef5f218
--- /dev/null
+++ b/db/t0.sql
@@ -0,0 +1,5 @@
+-- First ten rows
+SELECT * -- identifier, text, tag
+FROM 'doc-ok.parquet'
+WHERE field=2
+LIMIT 10;
diff --git a/db/t1.sql b/db/t1.sql
new file mode 100644
index 0000000..f501d7c
--- /dev/null
+++ b/db/t1.sql
@@ -0,0 +1,5 @@
+select text, tag, count(*) as ef
+from 'doc-ok.parquet'
+group by text, tag
+order by ef desc
+limit 10;
diff --git a/db/t2.sql b/db/t2.sql
new file mode 100644
index 0000000..0f4b78d
--- /dev/null
+++ b/db/t2.sql
@@ -0,0 +1,7 @@
+-- Do we get the same results?
+select text, tag, count(*) as ef
+from 'msmarco_doc_00.parquet'
+where identifier='msmarco_doc_00_21381293'
+group by text, tag
+order by ef desc
+limit 10;
diff --git a/db/t3.sql b/db/t3.sql
new file mode 100644
index 0000000..70426ae
--- /dev/null
+++ b/db/t3.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent entities
+select text, count(*) as ef
+from 'msmarco_doc_00.parquet'
+where field=2
+group by text
+order by ef desc
+limit 10;
diff --git a/db/test.py b/db/test.py
new file mode 100644
index 0000000..007a496
--- /dev/null
+++ b/db/test.py
@@ -0,0 +1,6 @@
+import duckdb
+
+print(duckdb.query('''
+SELECT *
+FROM 'doc-ok.parquet'
+''').fetchall())
diff --git a/requirements.txt b/requirements.txt
index 3a29441..a191ecb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 pandas
 pyarrow
 flair
+syntok
 git+git://github.com/informagi/syntok
-torch>=1.5.0,!=1.8.*
+-f https://download.pytorch.org/whl/torch_stable.html
+torch>=1.5.0,!=1.8.*
\ No newline at end of file