informagi · arjenpdevries · Jan 3, 2022 · Jan 3, 2022 · Jan 3, 2022 · Jan 3, 2022
diff --git a/db/README.md b/db/README.md
@@ -0,0 +1,60 @@
+# DuckDB experiments
+
+## Preliminaries
+
+1.
+DuckDB has been compiled from source and installed.
+
+2.
+Mention detection has run and created parquet files.
+
+3.
+The parquet files are stored in `/export/data2/tmp`,
+or another location specified as working directory 
+in `md.init`.
+
+## CLI
+
+### Sanity check
+
+First, take the MD output on a single MS Marco V2 document
+(`msmarco_doc_00_21381293`, more or less randomly selected,
+in my tests called `doc-ok`).
+
+Read the field metadata:
+
+    duckdb md.init < fields.sql
+
+Read 10 rows from the document:
+
+    duckdb md.init < t0.sql
+
+Query `t1.sql` gives a few rows of entity frequency information.
+
+### Another sanity check
+
+Now copy the batch MD output from `tusi`:
+
+    scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp
+
+Test query `t2.sql` should give the same output as above,
+but now reads from the Parquet file generated for the full batch (`00`).
+
+    duckdb md.init < t2.sql
+
+### Create database
+
+We want a representation for querying that is less elaborate than all those string values,
+pretty much preferred by our JSON scripting friends, but not ideal for SQL processing.
+
+    duckdb md.init < prepare-md.sql
+
+The transformation took ~3½ seconds on my home machine for part `00`.
+
+### Test queries
+
+Query `md-t2.sql` should give the same output as `t2.sql` and `t1.sql`.
+
+TBC
+
+
diff --git a/db/edict.sql b/db/edict.sql
@@ -0,0 +1,20 @@
+-- Does the progress bar work?
+SET enable_progress_bar=true;
+
+--
+-- Entity dictionary
+--
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER);
+
+INSERT INTO edict
+SELECT row_number() OVER (), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+   ORDER by ef DESC
+  );
+
+COMMIT;
diff --git a/db/fields.sql b/db/fields.sql
@@ -0,0 +1,4 @@
+--
+-- Display the field mapping
+--
+select * from 'msmarco_doc_00_field_mapping.parquet';
diff --git a/db/md-t2.sql b/db/md-t2.sql
@@ -0,0 +1,9 @@
+-- Do we get the same results?
+SELECT edict.e, doc.tag, count(doc.e) as ef
+FROM doc, edict
+WHERE 
+      docid=21381293
+  AND doc.e = edict.eid
+group by doc.e, edict.e, doc.tag 
+order by ef desc
+limit 10;
diff --git a/db/md-t2b.sql b/db/md-t2b.sql
@@ -0,0 +1,7 @@
+-- Without joining the dictionary
+SELECT e, tag, count(e) as ef
+FROM doc
+WHERE docid=21381293
+group by e, tag 
+order by ef desc
+limit 10;
diff --git a/db/md-t3.sql b/db/md-t3.sql
@@ -0,0 +1,11 @@
+-- Ten most frequent entities
+
+-- building the index does not pay off for a single query
+-- create index ex on edict(eid);
+
+select edict.e, count(doc.e) as ef
+from doc, edict
+where field='body' AND doc.e = eid
+group by doc.e, edict.e
+order by ef desc
+limit 10;
diff --git a/db/md-t3b.sql b/db/md-t3b.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent entities
+select e, count(*) as ef
+from doc
+where field='body'
+group by e
+order by ef desc
+limit 10;
diff --git a/db/md-t3c.sql b/db/md-t3c.sql
@@ -0,0 +1,5 @@
+-- Ten most frequent entities
+select edict.e, edict.ef
+from edict
+order by ef desc
+limit 10;
diff --git a/db/md.init b/db/md.init
@@ -0,0 +1,2 @@
+.cd /export/data2/tmp
+.open msmarco-doc-00.duckdb
diff --git a/db/parquet-info.sql b/db/parquet-info.sql
@@ -0,0 +1,3 @@
+-- What's in the Parquet Files?
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00.parquet') WHERE row_group_id = 0;
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet') WHERE row_group_id = 0;
diff --git a/db/prepare-md.sql b/db/prepare-md.sql
@@ -0,0 +1,129 @@
+--
+-- Transform MD tables
+--
+
+-- SET enable_progress_bar=true;
+
+--
+-- Define types
+--
+
+BEGIN TRANSACTION;
+
+-- Fields ("field", "tag") should be ENUMs
+CREATE TYPE tags AS ENUM ('PER','LOC','ORG', 'MISC');
+CREATE TYPE fields AS ENUM ('title','headings','body');
+
+-- MD field conversion table
+-- Read field values from the field_mappings.parquet file
+CREATE TABLE fielddict (
+	id    TINYINT,
+	field fields
+);
+INSERT INTO fielddict 
+SELECT stats_min_value, path_in_schema
+FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet');
+
+COMMIT;
+
+--
+-- Create the Document Data Dictionary
+--
+
+BEGIN TRANSACTION;
+
+-- Data Dictionary
+CREATE TABLE dict(
+	cpart UTINYINT, 
+	docid UINTEGER, 
+	identifier VARCHAR,
+	nent USMALLINT, 
+	PRIMARY KEY(cpart, docid)
+);
+
+-- Create the document identifiers (from the Parquet File)
+INSERT INTO dict
+SELECT
+  cpdocid[0]::UTINYINT AS cpart,
+  cpdocid[1]::UINTEGER AS docid,
+  identifier,
+  count(*) AS nent
+FROM
+  (SELECT 
+     string_split(replace(identifier,'msmarco_doc_',''),'_') as cpdocid,
+     identifier
+   FROM 'msmarco_doc_00.parquet' 
+  )
+GROUP BY cpart, docid, identifier
+ORDER BY docid;
+
+COMMIT;
+
+--
+-- Entity dictionary
+--
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(
+	eid UINTEGER PRIMARY KEY, 
+	e VARCHAR, 
+	ef UINTEGER);
+
+INSERT INTO edict
+SELECT row_number() OVER (), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+   ORDER by ef DESC
+  );
+
+-- Would be useful if index persisted:
+-- CREATE INDEX e_idx ON edict(e);
+
+COMMIT;
+
+--
+-- The Document-Entity Table
+--
+
+BEGIN TRANSACTION;
+
+-- Document Entity
+CREATE TABLE doc(
+	cpart UTINYINT,
+	docid UINTEGER,
+	field fields,
+	e UINTEGER,
+	start_pos UINTEGER,
+	end_pos UINTEGER,
+	score DOUBLE,
+	tag tags
+);
+
+--
+-- Load and recode the data:
+-- 
+--   + Document identifiers are looked up in the data dictionary
+--   + Field and tag are mapped to their ENUM types
+--
+INSERT INTO doc
+SELECT 
+  d.cpart, 
+  d.docid,
+  fd.field,
+  ed.eid,
+  docs.start_pos,
+  docs.end_pos,
+  docs.score,
+  docs.tag
+FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd, edict ed
+WHERE 
+      d.identifier = docs.identifier
+  AND fd.id = docs.field
+  AND ed.e  = docs.text;
+
+-- Would be useful if the index persisted...
+-- CREATE INDEX de_idx ON doc(e);
+
+COMMIT;
diff --git a/db/t.init b/db/t.init
@@ -0,0 +1,3 @@
+.print Change to data dir.
+.cd /export/data2/tmp
+.print Ready to roll!
diff --git a/db/t0.sql b/db/t0.sql
@@ -0,0 +1,5 @@
+-- First ten rows
+SELECT * -- identifier, text, tag 
+FROM 'doc-ok.parquet'
+WHERE field=2
+LIMIT 10;
diff --git a/db/t1.sql b/db/t1.sql
@@ -0,0 +1,5 @@
+select text, tag, count(*) as ef
+from 'doc-ok.parquet' 
+group by text, tag 
+order by ef desc
+limit 10;
diff --git a/db/t2.sql b/db/t2.sql
@@ -0,0 +1,7 @@
+-- Do we get the same results?
+select text, tag, count(*) as ef
+from 'msmarco_doc_00.parquet' 
+where identifier='msmarco_doc_00_21381293'
+group by text, tag 
+order by ef desc
+limit 10;
diff --git a/db/t3.sql b/db/t3.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent entities
+select text, count(*) as ef
+from 'msmarco_doc_00.parquet'
+where field=2
+group by text
+order by ef desc
+limit 10;
diff --git a/db/test.py b/db/test.py
@@ -0,0 +1,6 @@
+import duckdb
+
+print(duckdb.query('''
+SELECT *
+FROM 'doc-ok.parquet'
+''').fetchall())
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,7 @@
 pandas
 pyarrow
 flair
+syntok
 git+git://github.com/informagi/syntok
-torch>=1.5.0,!=1.8.*
+-f https://download.pytorch.org/whl/torch_stable.html
+torch>=1.5.0,!=1.8.*
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		.cd /export/data2/tmp
		.open msmarco-doc-00.duckdb