Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
run/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


[Tutorial 1: Entity Linking for Locations](TUTORIAL1.md)
Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata.
Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata. [DDLOG version](TUTORIAL1_ddlog.md)

[Tutorial 2: Coreference Resolution within Documents](TUTORIAL2.md)
Learn how to cluster mentions of the same entity within a document without the need for an entity database.
Expand Down
461 changes: 461 additions & 0 deletions TUTORIAL1_ddlog.md

Large diffs are not rendered by default.

178 changes: 178 additions & 0 deletions app.ddlog
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# DDlog schema and inference rules for linking location mentions in news
# articles to Wikidata items (entity linking; see TUTORIAL1_ddlog.md).

# Raw news articles, loaded by input/init.sh from converted.csv.
articles(
    @key
    id int,
    @searchable
    body text,
    @searchable
    title text
).

# Per-sentence NLP output (tokenization, POS, NER, dependency parse).
# The parallel arrays are index-aligned per token.
@source
sentences(
    document_id int,     # which document it comes from
    sentence_offset int, # which sentence (0, 1, 2...) is it in document
    sentence text,       # sentence content
    words text[],        # array of words in this sentence
    lemma text[],        # array of lemmatized words
    pos_tags text[],     # array of part-of-speech tags
    ner_tags text[],     # array of named entity tags (PERSON, LOCATION, etc)
    char_offsets int[],  # array of character offsets (begin)
    dep_labels text[],   # array of dependency labels
    dep_parents int[],
    sentence_id text     # unique identifier for sentences
).

# Wikidata labels/aliases per item and language (from get-wikidata-names.py).
wikidata_names (
    item_id int,
    language text,
    label text,   # label kind, e.g. "label" or "alias"
    name text
).

# Transitive instance-of pairs (item -> class), from get-wikidata-transitive.py.
wikidata_instanceof (
    item_id int,
    clazz_id int
).

# Geographic coordinates per Wikidata item (P625).
wikidata_coordinate_locations (
    item_id int,
    latitude float,
    longitude float
).

# Features extracted from the words surrounding a mention.
# NOTE(review): `features` is an array column here, but the inference rule
# at the bottom binds a scalar `f` to it — presumably DDlog unnests arrays
# into one feature per row/weight; confirm against the DeepDive docs.
context_features (
    sentence_id text,
    mention_num int,
    features text[]
).

# Candidate (mention, Wikidata item) pairs produced by udf/extract_pairs.py.
# NOTE(review): document_id is `text` here but `int` in sentences — confirm
# this mismatch is intentional (UDF emits it as a string).
@extraction
locations (
    @key
    mention_id text,
    document_id text,
    @references(relation="sentences", column="sentence_id")
    sentence_id text,
    mention_num int,
    mention_str text,
    @textspan_start()
    w_from int,
    w_to int,
    loc_id int,       # candidate Wikidata item id
    is_correct boolean,
    features text[]
).

# Distinct mention spans; populated as a view by input/init.sh (see TODO there).
v_mentions(
    sentence_id text,
    mention_num int,
    w_from int,
    w_to int
).

# The inference target: is this candidate link correct?
link?(
    mention_id text).

# process the text
#function extract_preprocess over (id int, body text)
#    returns rows like sentences
#    implementation "../deepdive/examples/nlp_extractor/run.sh -k id -v body -l 100 -t 1 -a tokenize,ssplit,pos" handles json lines.

#sentences +=
#    extract_preprocess(id, body) :-
#    articles(id, body, _).

# extract pairs
# Arrays are flattened to "~^~"-joined strings because the UDF speaks TSV.
function extract_pairs over (document_id int, sentence_id text, words text, pos_tags text)
    returns rows like locations
    implementation "udf/extract_pairs.py" handles tsv lines.

locations +=
    extract_pairs(doc_id, id, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :-
    sentences(doc_id, _, _, words, _, pos_tags, _, _, _, _, id).

# extract context features
function extract_context_features over (sentence_id text, mention_num int, w_from int, w_to int, words text, pos_tags text)
    returns rows like context_features
    implementation "udf/extract_context_features.py" handles tsv lines.

context_features +=
    extract_context_features(sentence_id, mention_num, w_from, w_to, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :-
    sentences(_, _, _, words, _, pos_tags, _, _, _, _, sentence_id),
    v_mentions(sentence_id, mention_num, w_from, w_to).

# TODO supervise (not sure if this helps)
#function supervise over (document_id int, sentence text, words, )
#    returns rows like locations
#    implementation "supervise_locations.py.save" handles tsv lines.

#locations +=
#    supervise(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id) :-
#    locations(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id, _, _)

# label: seed link variables from the is_correct column of locations
@label(is_true)
link(mid) :- locations(mid, _, _, _, _, _, _, _, is_true, _).

# negative_bias: by default a candidate link is unlikely to be correct
@weight(-1)
link(mid) :-
    locations(mid, _, _, _, _, _, _, _, _, _).

# one_of_n_features: at most one Wikidata item per mention span.
# NOTE(review): no mid1 != mid2 constraint — this also instantiates the
# rule with a candidate paired against itself; confirm DDlog/DeepDive
# deduplicates or that the self-pair is harmless.
# TODO: what if the entity doesn't exist in the KB
@weight(-10)
link(mid1) ^ link(mid2) :-
    locations(mid1, _, sentence_id, mention_num, _, _, _, _, _, _),
    locations(mid2, _, sentence_id, mention_num, _, _, _, _, _, _).

# prefer if subsequently mentioned cities are within 1000km distance
# consecutive_in_proximity
# NOTE(review): earth_distance() returns meters, so this threshold is 1 km,
# not 1000 km as the comment says — confirm which was intended.
@weight(3)
link(mid1) ^ link(mid2) :-
    locations(mid1, doc_id, _, _, _, _, _, loc_id1, _, _),
    locations(mid2, doc_id, _, _, _, _, _, loc_id2, _, _),
    wikidata_coordinate_locations(loc_id1, lat1, lon1),
    wikidata_coordinate_locations(loc_id2, lat2, lon2),
    [earth_distance(ll_to_earth(lat1,lon1), ll_to_earth(lat2,lon2)) < 1000].

# penalize same word mapped to different location
# same_to_same
# NOTE(review): loc1 and loc2 are unconstrained — the rule also fires when
# loc1 == loc2 (same word mapped to the SAME location) and when mid1 == mid2;
# a [loc1 != loc2] condition seems to be missing. Confirm intent.
@weight(-3)
link(mid1) ^ link(mid2) :-
    locations(mid1, doc_id, _, mention_num1, mention_str, _, _, loc1, _, _),
    locations(mid2, doc_id, _, mention_num2, mention_str, _, _, loc2, _, _).

# prefer larger cities (Q515 = city)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 515).

# x00M population (Q1549591 = big city)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 1549591).

# xM population (Q1637706 = city with millions of inhabitants)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 1637706).

# boost_countries (Q6256 = country)
@weight(5)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 6256).

# context features: learn one weight per feature string f
@weight(f)
link(mid) :-
    locations(_, _, sentence_id, mention_num, _, _, _, loc_id, _, _),
    context_features(sentence_id, mention_num, f).




Binary file added data/sentences.tsv.gz
Binary file not shown.
1 change: 1 addition & 0 deletions db.url
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
postgresql://localhost/geo
1 change: 1 addition & 0 deletions env_local.sh.TEMPLATE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
export DEEPDIVE_HOME=`pwd`/../deepdive
export APP_HOME=`pwd`
export MEMORY="8g"
export INPUT_BATCH_SIZE=10000
export PARALLELISM=4
export DBNAME="geo"
export PGHOST="localhost"
Expand Down
39 changes: 39 additions & 0 deletions input/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Load the Reuters articles, the Wikidata extracts, and the pre-parsed
# sentence data into the PostgreSQL database used by the DeepDive app.
set -eux

#bzcat ./articles_dump.csv.bz2 | deepdive sql "COPY articles FROM STDIN CSV"
#bzcat ./sentences_dump.csv.bz2 |
#if [[ -z ${SUBSAMPLE_NUM_SENTENCES:-} ]]; then cat; else head -n ${SUBSAMPLE_NUM_SENTENCES}; fi |
#deepdive sql "COPY sentences FROM STDIN CSV"

# Resolve everything relative to this script's own directory. The previous
# double-cd re-evaluated a relative $0 after changing directory, which broke
# when the script was invoked as ./init.sh from inside input/.
cd "$(dirname "$0")"

# Project-local environment (DB name, host, etc.) lives one level up.
. ../env_local.sh

# article content
cat "$(pwd)/converted.csv" | deepdive sql "copy articles from STDIN csv"

# wiki data
# QUOTE E'\1' picks an unused byte as the quote char, effectively disabling
# CSV quoting so literal double quotes in names survive the load.
cat "$(pwd)/names.tsv" | deepdive sql "copy wikidata_names from STDIN CSV DELIMITER E'\t' QUOTE E'\1';"
cat "$(pwd)/coordinate-locations.tsv" | deepdive sql "copy wikidata_coordinate_locations from STDIN;"
cat "$(pwd)/transitive.tsv" | deepdive sql "copy wikidata_instanceof from STDIN;"

# import sentence parses (NLP output shipped with the repo)
gunzip -c "$(pwd)/../data/sentences.tsv.gz" | deepdive sql "COPY sentences from STDIN"

# intermediate view of distinct mention spans
# TODO move this to app.ddlog
# use ":-" to create a view/table. Don't know if DISTINCT is supported
# NOTE(review): assumes `locations` already has rows at init time — confirm
# the pipeline ordering.
deepdive sql 'INSERT INTO v_mentions SELECT DISTINCT sentence_id, mention_num, w_from, w_to FROM locations'

# install psql extensions needed by the earth_distance() inference rule.
# IF NOT EXISTS keeps a re-run from aborting under `set -e`.
deepdive sql "CREATE EXTENSION IF NOT EXISTS cube;"
deepdive sql "CREATE EXTENSION IF NOT EXISTS earthdistance;"
20 changes: 9 additions & 11 deletions run.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#!/bin/bash

DIRNAME=`dirname $0`

. "${DIRNAME}/env_local.sh"

cd $DEEPDIVE_HOME
export PYTHONPATH=$DEEPDIVE_HOME/ddlib:$PYTHONPATH

### Compile and run:
sbt/sbt "run -c $APP_HOME/${APP_CONF:-application.conf} -o ${TMP_DIR}"
read -p "WARNING: Script will erase sentences in database. Are you sure? " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit
fi

dropdb geo
deepdive initdb
deepdive run
15 changes: 14 additions & 1 deletion script/fetch-wikidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,26 @@
import json
import csv
import os.path
import urllib.request

BASE_DIR, throwaway = os.path.split(os.path.realpath(__file__))
BASE_DIR = os.path.realpath(BASE_DIR + "/..")
DATA_DIR = BASE_DIR + '/data'

def get_latest_dump_name():
    """Return the filename of the newest dated Wikidata JSON dump.

    Scrapes the directory listing at dumps.wikimedia.org and picks the
    lexicographically greatest ``YYYYMMDD.json.gz`` entry, which is the most
    recent because the names embed the date.

    The previous implementation returned the anchor text of the FIRST link
    on the page (often ``../`` or another non-dump entry, and definitely not
    the latest dump) and raised ValueError on lines containing ``href`` but
    no ``>``/``<`` pair.

    Returns:
        str: a name such as ``"20150330.json.gz"``, or None when the
        listing contains no dated dump.
    """
    import re  # local import: only needed by this helper

    dump_name_re = re.compile(r'\d{8}\.json\.gz')
    names = []
    with urllib.request.urlopen('http://dumps.wikimedia.org/other/wikidata/') as f:
        for line in f:
            html = str(line, encoding='utf-8')
            names.extend(dump_name_re.findall(html))
    return max(names) if names else None

def download_wikidata():
DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/20150330.json.gz')
dump_name = get_latest_dump_name()
DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/'+dump_name)
ARCHIVE_FILENAME = 'dump.json.gz'
data_path = os.path.join(DATA_DIR, "wikidata")
archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
Expand Down
3 changes: 2 additions & 1 deletion script/get-reuters-json-csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def json_to_csv():
tsvout = csv.writer(tsvout)
for line in jsonin:
obj = json.loads(line)
tsvout.writerow([obj['id'], obj['body'], obj['title']])
if obj['body']:
tsvout.writerow([obj['id'], obj['body'].replace('\x7F', ''), obj['title']])
print("saved output as %s" % out_path)


Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-coordinate-locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def parse_json(line, w):
except KeyError:
print('ignoring keyerror', file=sys.stderr)

with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/coordinate-locations.tsv', 'w') as w:
with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/coordinate-locations.tsv', 'w') as w:
for line in f:
line = line.rstrip()
if line == '[' or line == ']':
Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-names.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def parse_json(line,w):
print(id + '\t' + lang + '\t' + 'alias' + '\t' + alias.replace('\t', ' ').replace('\n', ' '), file=w)


with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/names.tsv', 'w') as w:
with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/names.tsv', 'w') as w:
for line in f:
line = line.rstrip()
if line == '[' or line == ']':
Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-transitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def get_items(sel, clazz, w):
print(str(id1) + '\t' + str(clazz), file=w)
last = id1

with open(DATA_DIR + '/wikidata/transitive.tsv', 'w') as w:
with open(BASE_DIR + '/input/transitive.tsv', 'w') as w:
build_map()
# compute transitive closure for each class
for clazz in clazzes:
Expand Down
12 changes: 6 additions & 6 deletions udf/extract_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Loc = collections.namedtuple('Loc', ['item_id', 'name'])

loc_ids_set = set()
with open(BASE_DIR + "/data/wikidata/transitive.tsv", 'rt') as transitive_file:
with open(BASE_DIR + "/input/transitive.tsv", 'rt') as transitive_file:
print('loading transitive.tsv', file=sys.stderr)
for line in transitive_file:
cols = line.split('\t')
Expand All @@ -24,7 +24,7 @@


cities_dict = dict()
with open(BASE_DIR + "/data/wikidata/names.tsv", 'rt') as cities_file:
with open(BASE_DIR + "/input/names.tsv", 'rt') as cities_file:
print('loading names.tsv', file=sys.stderr)
for line in cities_file:
cols = line.split('\t')
Expand Down Expand Up @@ -100,8 +100,8 @@ def generate_candidates(doc_id, sent_id, words, poses, phrases):
# if m.country_code == 'US' and m == loc:
# true_str = '1'

print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ]))
#print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ]))
print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ]))
#print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ]))

mention_num += 1

Expand All @@ -122,7 +122,7 @@ def generate_nnp_phrases(words, poses):
for line in input_files:
#print(line, file=sys.stderr)
doc_id, sent_id, words_str, poses_str = line.split('\t')
words = words_str.split(' ')
poses = poses_str.split(' ')
words = words_str.split('~^~')
poses = poses_str.split('~^~')
phrases = generate_nnp_phrases(words, poses)
generate_candidates(doc_id, sent_id, words, poses, phrases)