From a1cdfa74caa5e4549ed8e03de521ff0b8b5a8e23 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 15:55:47 +0100
Subject: [PATCH 01/17] Example of querying the Parquet Metadata

---
 db/parquet-info.sql | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 db/parquet-info.sql

diff --git a/db/parquet-info.sql b/db/parquet-info.sql
new file mode 100644
index 0000000..987ced9
--- /dev/null
+++ b/db/parquet-info.sql
@@ -0,0 +1,3 @@
+-- What's in the Parquet Files?
+SELECT * FROM parquet_metadata('doc-ok.parquet') WHERE row_group_id = 0;
+SELECT * FROM parquet_metadata('doc-ok_field_mapping.parquet');

From 69d8035f13acda83796650e1849f1ab819453a0c Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 15:57:38 +0100
Subject: [PATCH 02/17] Just show the Field Mapping

---
 db/fields.sql | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 db/fields.sql

diff --git a/db/fields.sql b/db/fields.sql
new file mode 100644
index 0000000..3c11179
--- /dev/null
+++ b/db/fields.sql
@@ -0,0 +1,4 @@
+-- Sanity check: display the field mapping stored in the batch-00 mapping file.
+-- The path is relative, so run this from the data directory
+-- (the .init scripts in this directory `.cd` there first).
+select * from 'msmarco_doc_00_field_mapping.parquet';

From 2be399364e000242e10ca73e3b1c05530cc0aeba Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 15:58:54 +0100
Subject: [PATCH 03/17] Example commands using DuckDB over the MD phase output

---
 db/README.md | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 db/README.md

diff --git a/db/README.md b/db/README.md
new file mode 100644
index 0000000..3e492b5
--- /dev/null
+++ b/db/README.md
@@ -0,0 +1,37 @@
+# DuckDB experiments
+
+## Preliminaries
+
+1.
+DuckDB has been compiled from source and installed.
+
+2.
+Mention detection has run and created parquet files.
+
+3.
+The parquet files are stored in `/export/data2/tmp`,
+or another location specified as working directory 
+in `md.init`.
+
+## CLI
+
+### Sanity check
+
+    duckdb md.init < t0.sql
+    duckdb md.init < 
+
+### Another sanity check
+
+Copy output over:
+
+    scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp
+
+    duckdb md.init < t2.sql
+
+### Create database
+
+We want a representation for querying that is less elaborate than all the string values.
+
+    duckdb md.init < prepare-md.sql
+
+

From 133b67cdc852cf7bcd61cbcc706837602c610ff9 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 18:46:06 +0100
Subject: [PATCH 04/17] Choose useful information from everything.

---
 db/parquet-info.sql | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/db/parquet-info.sql b/db/parquet-info.sql
index 987ced9..6ac5149 100644
--- a/db/parquet-info.sql
+++ b/db/parquet-info.sql
@@ -1,3 +1,3 @@
 -- What's in the Parquet Files?
-SELECT * FROM parquet_metadata('doc-ok.parquet') WHERE row_group_id = 0;
-SELECT * FROM parquet_metadata('doc-ok_field_mapping.parquet');
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00.parquet') WHERE row_group_id = 0;
+SELECT path_in_schema, type, stats_min_value, stats_max_value FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet') WHERE row_group_id = 0;

From 7fc53a1c4455de209352c0aa474d189a50f2bdc3 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 18:47:01 +0100
Subject: [PATCH 05/17] A few initial tests

---
 db/t.init | 3 +++
 db/t.py   | 6 ++++++
 db/t.sql  | 5 +++++
 db/t0.sql | 5 +++++
 db/t1.sql | 5 +++++
 db/t2.sql | 7 +++++++
 6 files changed, 31 insertions(+)
 create mode 100644 db/t.init
 create mode 100644 db/t.py
 create mode 100644 db/t.sql
 create mode 100644 db/t0.sql
 create mode 100644 db/t1.sql
 create mode 100644 db/t2.sql

diff --git a/db/t.init b/db/t.init
new file mode 100644
index 0000000..7dd971b
--- /dev/null
+++ b/db/t.init
@@ -0,0 +1,3 @@
+.print Change to data dir.
+.cd /export/data2/tmp
+.print Ready to roll!
diff --git a/db/t.py b/db/t.py
new file mode 100644
index 0000000..ebc2330
--- /dev/null
+++ b/db/t.py
@@ -0,0 +1,6 @@
+import duckdb
+
+print(duckdb.query('''
+SELECT *
+FROM 'doc-error.parquet'
+''').fetchall())
diff --git a/db/t.sql b/db/t.sql
new file mode 100644
index 0000000..f501d7c
--- /dev/null
+++ b/db/t.sql
@@ -0,0 +1,5 @@
+select text, tag, count(*) as ef
+from 'doc-ok.parquet' 
+group by text, tag 
+order by ef desc
+limit 10;
diff --git a/db/t0.sql b/db/t0.sql
new file mode 100644
index 0000000..ef5f218
--- /dev/null
+++ b/db/t0.sql
@@ -0,0 +1,5 @@
+-- Sanity check: up to ten rows with field = 2 (no ORDER BY, so which rows come back is arbitrary)
+SELECT * -- all columns; narrow to identifier, text, tag if that is all you need
+FROM 'doc-ok.parquet'
+WHERE field=2
+LIMIT 10;
diff --git a/db/t1.sql b/db/t1.sql
new file mode 100644
index 0000000..f501d7c
--- /dev/null
+++ b/db/t1.sql
@@ -0,0 +1,5 @@
+select text, tag, count(*) as ef
+from 'doc-ok.parquet' 
+group by text, tag 
+order by ef desc
+limit 10;
diff --git a/db/t2.sql b/db/t2.sql
new file mode 100644
index 0000000..0f4b78d
--- /dev/null
+++ b/db/t2.sql
@@ -0,0 +1,7 @@
+-- Do we get the same results?
+select text, tag, count(*) as ef
+from 'msmarco_doc_00.parquet' 
+where identifier='msmarco_doc_00_21381293'
+group by text, tag 
+order by ef desc
+limit 10;

From dc359c91dc8eefa172a4ffa084cdbc1e1bf0d1da Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 18:48:00 +0100
Subject: [PATCH 06/17] Recode MD output for more efficient further processing.

---
 db/md.init        |  2 +
 db/prepare-md.sql | 99 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 db/md.init
 create mode 100644 db/prepare-md.sql

diff --git a/db/md.init b/db/md.init
new file mode 100644
index 0000000..b9f206f
--- /dev/null
+++ b/db/md.init
@@ -0,0 +1,2 @@
+.cd /export/data2/tmp
+.open msmarco-doc-00.duckdb
diff --git a/db/prepare-md.sql b/db/prepare-md.sql
new file mode 100644
index 0000000..16726f7
--- /dev/null
+++ b/db/prepare-md.sql
@@ -0,0 +1,99 @@
+--
+-- Transform MD tables
+--
+
+--
+-- Define types
+--
+
+BEGIN TRANSACTION;
+
+-- Fields ("field", "tag") should be ENUMs
+CREATE TYPE tags AS ENUM ('PER','LOC','ORG', 'MISC');
+CREATE TYPE fields AS ENUM ('title','headings','body');
+
+-- MD field conversion table
+-- Read field values from the field_mappings.parquet file
+CREATE TABLE fielddict (
+	id    TINYINT,
+	field fields
+);
+INSERT INTO fielddict 
+SELECT stats_min_value, path_in_schema
+FROM parquet_metadata('msmarco_doc_00_field_mapping.parquet');
+
+COMMIT;
+
+--
+-- Create the Document Data Dictionary
+--
+
+BEGIN TRANSACTION;
+
+-- Data Dictionary
+CREATE TABLE dict(
+	cpart UTINYINT, 
+	docid UINTEGER, 
+	identifier VARCHAR,
+	nent USMALLINT, 
+	PRIMARY KEY(cpart, docid)
+);
+
+-- Create the document identifiers (from the Parquet File)
+INSERT INTO dict
+SELECT
+  cpdocid[0]::UTINYINT AS cpart,
+  cpdocid[1]::UINTEGER AS docid,
+  identifier,
+  count(*) AS nent
+FROM
+  (SELECT 
+     string_split(replace(identifier,'msmarco_doc_',''),'_') as cpdocid,
+     identifier
+   FROM 'msmarco_doc_00.parquet' 
+  )
+GROUP BY cpart, docid, identifier
+ORDER BY docid;
+
+COMMIT;
+
+--
+-- The Document-Entity Table
+--
+
+BEGIN TRANSACTION;
+
+-- Document Entity
+CREATE TABLE doc(
+	cpart UTINYINT,
+	docid UINTEGER,
+	field fields,
+	text VARCHAR(127),
+	start_pos UINTEGER,
+	end_pos UINTEGER,
+	score DOUBLE,
+	tag tags
+);
+
+--
+-- Load and recode the data:
+-- 
+--   + Document identifiers are looked up in the data dictionary
+--   + Field and tag are mapped to their ENUM types
+--
+INSERT INTO doc
+SELECT 
+  d.cpart, 
+  d.docid,
+  fd.field,
+  docs.text,
+  docs.start_pos,
+  docs.end_pos,
+  docs.score,
+  docs.tag
+FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd
+WHERE 
+      d.identifier = docs.identifier
+  AND fd.id = docs.field;
+
+COMMIT;

From 1bb0967d130b80fad3a7df7b3c30e4e7604cb79d Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 18:49:35 +0100
Subject: [PATCH 07/17] Sanity check - should give same output of t2.sql (for
 tuples with freq > 1)

---
 db/md-t2.sql | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 db/md-t2.sql

diff --git a/db/md-t2.sql b/db/md-t2.sql
new file mode 100644
index 0000000..f645957
--- /dev/null
+++ b/db/md-t2.sql
@@ -0,0 +1,7 @@
+-- Do we get the same results?
+select text, tag, count(*) as ef
+from doc
+where docid=21381293
+group by text, tag 
+order by ef desc
+limit 10;

From 59f07b1e3d58013bdc6c43b597ff41b58afa5d8b Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 19:06:56 +0100
Subject: [PATCH 08/17] Explanation of directory

---
 db/README.md | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/db/README.md b/db/README.md
index 3e492b5..4a0f7d1 100644
--- a/db/README.md
+++ b/db/README.md
@@ -17,21 +17,44 @@ in `md.init`.
 
 ### Sanity check
 
+First, take the MD output on a single MS Marco V2 document
+(`msmarco_doc_00_21381293`, more or less randomly selected,
+in my tests called `doc-ok`).
+
+Read the field metadata:
+
+    duckdb md.init < fields.sql
+
+Read 10 rows from the document:
+
     duckdb md.init < t0.sql
-    duckdb md.init < 
+
+Query `t1.sql` gives a few rows of entity frequency information.
 
 ### Another sanity check
 
-Copy output over:
+Now copy the batch MD output from `tusi`:
 
     scp tusi:/scratch/ckamphuis/el-msmarcov2/msmarco_v2_md/msmarco_doc_00* /export/data2/tmp
 
+Test query `t2.sql` should give the same output as above,
+but now reads from the Parquet file generated for the full batch (`00`).
+
     duckdb md.init < t2.sql
 
 ### Create database
 
-We want a representation for querying that is less elaborate than all the string values.
+We want a representation for querying that is less elaborate than all those string values,
+which our JSON scripting friends tend to prefer but which are not ideal for SQL processing.
 
     duckdb md.init < prepare-md.sql
 
+The transformation took ~3½ seconds on my home machine for part `00`.
+
+### Test queries
+
+Query `md-t2.sql` should give the same output as `t2.sql` and `t1.sql`.
+
+TBC
+
 

From 7aa7425680cc553b6a11150befc5a7a12ba7265b Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 19:07:23 +0100
Subject: [PATCH 09/17] Moved to other files

---
 db/t.sql | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 db/t.sql

diff --git a/db/t.sql b/db/t.sql
deleted file mode 100644
index f501d7c..0000000
--- a/db/t.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-select text, tag, count(*) as ef
-from 'doc-ok.parquet' 
-group by text, tag 
-order by ef desc
-limit 10;

From 56fa0bf7e1a6bd76cb78bc65d5bb11947ebb28a6 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 19:08:09 +0100
Subject: [PATCH 10/17] Deleted because I switched to using the CLI

---
 db/t.py | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 db/t.py

diff --git a/db/t.py b/db/t.py
deleted file mode 100644
index ebc2330..0000000
--- a/db/t.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import duckdb
-
-print(duckdb.query('''
-SELECT *
-FROM 'doc-error.parquet'
-''').fetchall())

From d3412687893a6b1533fe8cf334d25cff005ea3cf Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 19:08:48 +0100
Subject: [PATCH 11/17] A very minimal python script - not really used yet
 because the CLI suffices this far.

---
 db/test.py | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 db/test.py

diff --git a/db/test.py b/db/test.py
new file mode 100644
index 0000000..007a496
--- /dev/null
+++ b/db/test.py
@@ -0,0 +1,6 @@
+import duckdb
+
+print(duckdb.query('''
+SELECT *
+FROM 'doc-ok.parquet'
+''').fetchall())

From 3b7b05dc341835a01dd40f555aefb9efd65fee01 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Mon, 3 Jan 2022 19:09:33 +0100
Subject: [PATCH 12/17] Slightly more robust version (but not sufficient on a
 GPU-less machine).

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 2172238..6fe0962 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ pandas
 pyarrow
 flair
 syntok
+-f https://download.pytorch.org/whl/torch_stable.html
 torch>=1.5.0,!=1.8.*

From f997eb97236090d893cce17c350449c8e32614d0 Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Tue, 4 Jan 2022 01:33:42 +0100
Subject: [PATCH 13/17] Introduce entity dictionary based on entity mentions
 from MD

---
 db/md-t2.sql      | 10 ++++++----
 db/md-t2b.sql     |  7 +++++++
 db/prepare-md.sql | 30 ++++++++++++++++++++++++++----
 3 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 db/md-t2b.sql

diff --git a/db/md-t2.sql b/db/md-t2.sql
index f645957..5327cab 100644
--- a/db/md-t2.sql
+++ b/db/md-t2.sql
@@ -1,7 +1,9 @@
 -- Do we get the same results?
-select text, tag, count(*) as ef
-from doc
-where docid=21381293
-group by text, tag 
+SELECT edict.e, doc.tag, count(doc.e) as ef
+FROM doc
+INNER JOIN edict ON edict.eid = doc.e -- resolve the entity id to its mention text
+WHERE doc.docid = 21381293            -- same document that t2.sql selects by identifier
+  -- group on the id as well as the text: doc.e is the key being joined and counted
+group by doc.e, edict.e, doc.tag
 order by ef desc
 limit 10;
diff --git a/db/md-t2b.sql b/db/md-t2b.sql
new file mode 100644
index 0000000..966e86f
--- /dev/null
+++ b/db/md-t2b.sql
@@ -0,0 +1,7 @@
+-- Without joining the dictionary
+SELECT e, tag, count(e) as ef
+FROM doc
+WHERE docid=21381293
+group by e, tag 
+order by ef desc
+limit 10;
diff --git a/db/prepare-md.sql b/db/prepare-md.sql
index 16726f7..cb9e23a 100644
--- a/db/prepare-md.sql
+++ b/db/prepare-md.sql
@@ -2,6 +2,9 @@
 -- Transform MD tables
 --
 
+-- Does the progress bar work?
+SET enable_progress_bar=true;
+
 --
 -- Define types
 --
@@ -57,6 +60,24 @@ ORDER BY docid;
 
 COMMIT;
 
+--
+-- Entity dictionary
+--
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER);
+
+INSERT INTO edict
+SELECT row_number() OVER (), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+   ORDER by ef DESC
+  );
+
+COMMIT;
+
 --
 -- The Document-Entity Table
 --
@@ -68,7 +89,7 @@ CREATE TABLE doc(
 	cpart UTINYINT,
 	docid UINTEGER,
 	field fields,
-	text VARCHAR(127),
+	e UINTEGER,
 	start_pos UINTEGER,
 	end_pos UINTEGER,
 	score DOUBLE,
@@ -86,14 +107,15 @@ SELECT
   d.cpart, 
   d.docid,
   fd.field,
-  docs.text,
+  ed.eid,
   docs.start_pos,
   docs.end_pos,
   docs.score,
   docs.tag
-FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd
+FROM dict d, 'msmarco_doc_00.parquet' docs, fielddict fd, edict ed
 WHERE 
       d.identifier = docs.identifier
-  AND fd.id = docs.field;
+  AND fd.id = docs.field
+  AND ed.e  = docs.text;
 
 COMMIT;

From 1d48557db62bfe7bad0241dbc49d5586c7ede4ff Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Tue, 4 Jan 2022 01:34:27 +0100
Subject: [PATCH 14/17] Original script to test creating edict; now integrated
 in prepare-md.sql

---
 db/edict.sql | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 db/edict.sql

diff --git a/db/edict.sql b/db/edict.sql
new file mode 100644
index 0000000..f0b24dd
--- /dev/null
+++ b/db/edict.sql
@@ -0,0 +1,20 @@
+-- Does the progress bar work?
+SET enable_progress_bar=true;
+
+-- Entity dictionary: one row per distinct mention text (e), its count (ef),
+-- and an id (eid) assigned in order of descending frequency.
+BEGIN TRANSACTION;
+
+CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER);
+
+-- The ordering is stated in the window itself (ties broken on text): a
+-- subquery ORDER BY does not define how row_number() OVER () numbers rows.
+INSERT INTO edict (eid, e, ef)
+SELECT row_number() OVER (ORDER BY ef DESC, text), text, ef
+FROM
+  (SELECT text, count(*) as ef
+   FROM 'msmarco_doc_00.parquet'
+   GROUP by text
+  );
+
+COMMIT;

From 95fb5ad422940c35bd97e0bad621310ac1f1e4ab Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Tue, 4 Jan 2022 01:56:09 +0100
Subject: [PATCH 15/17] Also create indices on entity text

---
 db/prepare-md.sql | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/db/prepare-md.sql b/db/prepare-md.sql
index cb9e23a..86e48f3 100644
--- a/db/prepare-md.sql
+++ b/db/prepare-md.sql
@@ -2,8 +2,7 @@
 -- Transform MD tables
 --
 
--- Does the progress bar work?
-SET enable_progress_bar=true;
+-- SET enable_progress_bar=true;
 
 --
 -- Define types
@@ -65,7 +64,10 @@ COMMIT;
 --
 BEGIN TRANSACTION;
 
-CREATE TABLE edict(eid UINTEGER, e VARCHAR, ef UINTEGER);
+CREATE TABLE edict(
+	eid UINTEGER PRIMARY KEY, 
+	e VARCHAR, 
+	ef UINTEGER);
 
 INSERT INTO edict
 SELECT row_number() OVER (), text, ef
@@ -76,6 +78,8 @@ FROM
    ORDER by ef DESC
   );
 
+CREATE INDEX e_idx ON edict(e);
+
 COMMIT;
 
 --
@@ -118,4 +122,6 @@ WHERE
   AND fd.id = docs.field
   AND ed.e  = docs.text;
 
+CREATE INDEX de_idx ON doc(e);
+
 COMMIT;

From a7bd90375ba9fa07f26a64939d5e62f04bc6aa6e Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Tue, 4 Jan 2022 02:05:51 +0100
Subject: [PATCH 16/17] Indexes do not yet persist, so pretty useless to create
 them in preparation stage.

---
 db/prepare-md.sql | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/db/prepare-md.sql b/db/prepare-md.sql
index 86e48f3..f3c535a 100644
--- a/db/prepare-md.sql
+++ b/db/prepare-md.sql
@@ -78,7 +78,8 @@ FROM
    ORDER by ef DESC
   );
 
-CREATE INDEX e_idx ON edict(e);
+-- Would be useful if index persisted:
+-- CREATE INDEX e_idx ON edict(e);
 
 COMMIT;
 
@@ -122,6 +123,7 @@ WHERE
   AND fd.id = docs.field
   AND ed.e  = docs.text;
 
-CREATE INDEX de_idx ON doc(e);
+-- Would be useful if the index persisted...
+-- CREATE INDEX de_idx ON doc(e);
 
 COMMIT;

From 5ff2d12939880a52816f803d2529b9055e4b937c Mon Sep 17 00:00:00 2001
From: "Arjen P. de Vries" <arjen@acm.org>
Date: Tue, 4 Jan 2022 02:16:32 +0100
Subject: [PATCH 17/17] A few queries getting most frequent entity mentions in
 different ways.

---
 db/md-t3.sql  | 11 +++++++++++
 db/md-t3b.sql |  7 +++++++
 db/md-t3c.sql |  5 +++++
 db/t3.sql     |  7 +++++++
 4 files changed, 30 insertions(+)
 create mode 100644 db/md-t3.sql
 create mode 100644 db/md-t3b.sql
 create mode 100644 db/md-t3c.sql
 create mode 100644 db/t3.sql

diff --git a/db/md-t3.sql b/db/md-t3.sql
new file mode 100644
index 0000000..2d51e87
--- /dev/null
+++ b/db/md-t3.sql
@@ -0,0 +1,11 @@
+-- Ten most frequent entities in the body field (ids resolved to text via edict)
+
+-- building the index does not pay off for a single query
+-- create index ex on edict(eid);
+select edict.e, count(doc.e) as ef
+from doc
+inner join edict on edict.eid = doc.e
+where doc.field = 'body'
+group by doc.e, edict.e
+order by ef desc
+limit 10;
diff --git a/db/md-t3b.sql b/db/md-t3b.sql
new file mode 100644
index 0000000..d3d5a4b
--- /dev/null
+++ b/db/md-t3b.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent entities in the body field, reported by entity id (doc.e) without joining edict
+select e, count(*) as ef
+from doc
+where field='body'
+group by e
+order by ef desc
+limit 10;
diff --git a/db/md-t3c.sql b/db/md-t3c.sql
new file mode 100644
index 0000000..7bd17f3
--- /dev/null
+++ b/db/md-t3c.sql
@@ -0,0 +1,5 @@
+-- Ten most frequent entities, read from the precomputed edict.ef (counted over all fields, not body only)
+select edict.e, edict.ef
+from edict
+order by ef desc
+limit 10;
diff --git a/db/t3.sql b/db/t3.sql
new file mode 100644
index 0000000..70426ae
--- /dev/null
+++ b/db/t3.sql
@@ -0,0 +1,7 @@
+-- Ten most frequent mention texts straight from the Parquet file, field = 2 only (presumably the body field; confirm against the field mapping)
+select text, count(*) as ef
+from 'msmarco_doc_00.parquet'
+where field=2
+group by text
+order by ef desc
+limit 10;