Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
run/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


[Tutorial 1: Entity Linking for Locations](TUTORIAL1.md)
Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata.
Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata. [DDLOG version](TUTORIAL1_ddlog.md)

[Tutorial 2: Coreference Resolution within Documents](TUTORIAL2.md)
Learn how to cluster mentions of the same entity within a document without the need for an entity database.
Expand Down
461 changes: 461 additions & 0 deletions TUTORIAL1_ddlog.md

Large diffs are not rendered by default.

178 changes: 178 additions & 0 deletions app.ddlog
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# DDlog schema and inference rules for linking location mentions in news
# articles to Wikidata items (entity linking; see TUTORIAL1_ddlog.md).

# Raw news articles, loaded by input/init.sh from converted.csv.
articles(
    @key
    id int,
    @searchable
    body text,
    @searchable
    title text
).

# Per-sentence NLP output (tokenization, POS, NER, dependency parse).
# The parallel arrays are index-aligned per token.
@source
sentences(
    document_id int,     # which document it comes from
    sentence_offset int, # which sentence (0, 1, 2...) is it in document
    sentence text,       # sentence content
    words text[],        # array of words in this sentence
    lemma text[],        # array of lemmatized words
    pos_tags text[],     # array of part-of-speech tags
    ner_tags text[],     # array of named entity tags (PERSON, LOCATION, etc)
    char_offsets int[],  # array of character offsets (begin)
    dep_labels text[],   # array of dependency labels
    dep_parents int[],
    sentence_id text     # unique identifier for sentences
).

# Wikidata labels/aliases per item and language (from get-wikidata-names.py).
wikidata_names (
    item_id int,
    language text,
    label text,   # label kind, e.g. "label" or "alias"
    name text
).

# Transitive instance-of pairs (item -> class), from get-wikidata-transitive.py.
wikidata_instanceof (
    item_id int,
    clazz_id int
).

# Geographic coordinates per Wikidata item (P625).
wikidata_coordinate_locations (
    item_id int,
    latitude float,
    longitude float
).

# Features extracted from the words surrounding a mention.
# NOTE(review): `features` is an array column here, but the inference rule
# at the bottom binds a scalar `f` to it — presumably DDlog unnests arrays
# into one feature per row/weight; confirm against the DeepDive docs.
context_features (
    sentence_id text,
    mention_num int,
    features text[]
).

# Candidate (mention, Wikidata item) pairs produced by udf/extract_pairs.py.
# NOTE(review): document_id is `text` here but `int` in sentences — confirm
# this mismatch is intentional (UDF emits it as a string).
@extraction
locations (
    @key
    mention_id text,
    document_id text,
    @references(relation="sentences", column="sentence_id")
    sentence_id text,
    mention_num int,
    mention_str text,
    @textspan_start()
    w_from int,
    w_to int,
    loc_id int,       # candidate Wikidata item id
    is_correct boolean,
    features text[]
).

# Distinct mention spans; populated as a view by input/init.sh (see TODO there).
v_mentions(
    sentence_id text,
    mention_num int,
    w_from int,
    w_to int
).

# The inference target: is this candidate link correct?
link?(
    mention_id text).

# process the text
#function extract_preprocess over (id int, body text)
#    returns rows like sentences
#    implementation "../deepdive/examples/nlp_extractor/run.sh -k id -v body -l 100 -t 1 -a tokenize,ssplit,pos" handles json lines.

#sentences +=
#    extract_preprocess(id, body) :-
#    articles(id, body, _).

# extract pairs
# Arrays are flattened to "~^~"-joined strings because the UDF speaks TSV.
function extract_pairs over (document_id int, sentence_id text, words text, pos_tags text)
    returns rows like locations
    implementation "udf/extract_pairs.py" handles tsv lines.

locations +=
    extract_pairs(doc_id, id, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :-
    sentences(doc_id, _, _, words, _, pos_tags, _, _, _, _, id).

# extract context features
function extract_context_features over (sentence_id text, mention_num int, w_from int, w_to int, words text, pos_tags text)
    returns rows like context_features
    implementation "udf/extract_context_features.py" handles tsv lines.

context_features +=
    extract_context_features(sentence_id, mention_num, w_from, w_to, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :-
    sentences(_, _, _, words, _, pos_tags, _, _, _, _, sentence_id),
    v_mentions(sentence_id, mention_num, w_from, w_to).

# TODO supervise (not sure if this helps)
#function supervise over (document_id int, sentence text, words, )
#    returns rows like locations
#    implementation "supervise_locations.py.save" handles tsv lines.

#locations +=
#    supervise(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id) :-
#    locations(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id, _, _)

# label: seed link variables from the is_correct column of locations
@label(is_true)
link(mid) :- locations(mid, _, _, _, _, _, _, _, is_true, _).

# negative_bias: by default a candidate link is unlikely to be correct
@weight(-1)
link(mid) :-
    locations(mid, _, _, _, _, _, _, _, _, _).

# one_of_n_features: at most one Wikidata item per mention span.
# NOTE(review): no mid1 != mid2 constraint — this also instantiates the
# rule with a candidate paired against itself; confirm DDlog/DeepDive
# deduplicates or that the self-pair is harmless.
# TODO: what if the entity doesn't exist in the KB
@weight(-10)
link(mid1) ^ link(mid2) :-
    locations(mid1, _, sentence_id, mention_num, _, _, _, _, _, _),
    locations(mid2, _, sentence_id, mention_num, _, _, _, _, _, _).

# prefer if subsequently mentioned cities are within 1000km distance
# consecutive_in_proximity
# NOTE(review): earth_distance() returns meters, so this threshold is 1 km,
# not 1000 km as the comment says — confirm which was intended.
@weight(3)
link(mid1) ^ link(mid2) :-
    locations(mid1, doc_id, _, _, _, _, _, loc_id1, _, _),
    locations(mid2, doc_id, _, _, _, _, _, loc_id2, _, _),
    wikidata_coordinate_locations(loc_id1, lat1, lon1),
    wikidata_coordinate_locations(loc_id2, lat2, lon2),
    [earth_distance(ll_to_earth(lat1,lon1), ll_to_earth(lat2,lon2)) < 1000].

# penalize same word mapped to different location
# same_to_same
# NOTE(review): loc1 and loc2 are unconstrained — the rule also fires when
# loc1 == loc2 (same word mapped to the SAME location) and when mid1 == mid2;
# a [loc1 != loc2] condition seems to be missing. Confirm intent.
@weight(-3)
link(mid1) ^ link(mid2) :-
    locations(mid1, doc_id, _, mention_num1, mention_str, _, _, loc1, _, _),
    locations(mid2, doc_id, _, mention_num2, mention_str, _, _, loc2, _, _).

# prefer larger cities (Q515 = city)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 515).

# x00M population (Q1549591 = big city)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 1549591).

# xM population (Q1637706 = city with millions of inhabitants)
@weight(2)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 1637706).

# boost_countries (Q6256 = country)
@weight(5)
link(mid) :-
    locations(mid, _, _, _, _, _, _, loc_id, _, _),
    wikidata_instanceof(loc_id, 6256).

# context features: learn one weight per feature string f
@weight(f)
link(mid) :-
    locations(_, _, sentence_id, mention_num, _, _, _, loc_id, _, _),
    context_features(sentence_id, mention_num, f).




Binary file added data/sentences.tsv.gz
Binary file not shown.
1 change: 1 addition & 0 deletions db.url
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
postgresql://localhost/geo
1 change: 1 addition & 0 deletions env_local.sh.TEMPLATE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
export DEEPDIVE_HOME=`pwd`/../deepdive
export APP_HOME=`pwd`
export MEMORY="8g"
export INPUT_BATCH_SIZE=10000
export PARALLELISM=4
export DBNAME="geo"
export PGHOST="localhost"
Expand Down
39 changes: 39 additions & 0 deletions input/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Load the Reuters articles, the Wikidata extracts, and the pre-parsed
# sentence data into the PostgreSQL database used by the DeepDive app.
set -eux

#bzcat ./articles_dump.csv.bz2 | deepdive sql "COPY articles FROM STDIN CSV"
#bzcat ./sentences_dump.csv.bz2 |
#if [[ -z ${SUBSAMPLE_NUM_SENTENCES:-} ]]; then cat; else head -n ${SUBSAMPLE_NUM_SENTENCES}; fi |
#deepdive sql "COPY sentences FROM STDIN CSV"

# Resolve everything relative to this script's own directory. The previous
# double-cd re-evaluated a relative $0 after changing directory, which broke
# when the script was invoked as ./init.sh from inside input/.
cd "$(dirname "$0")"

# Project-local environment (DB name, host, etc.) lives one level up.
. ../env_local.sh

# article content
cat "$(pwd)/converted.csv" | deepdive sql "copy articles from STDIN csv"

# wiki data
# QUOTE E'\1' picks an unused byte as the quote char, effectively disabling
# CSV quoting so literal double quotes in names survive the load.
cat "$(pwd)/names.tsv" | deepdive sql "copy wikidata_names from STDIN CSV DELIMITER E'\t' QUOTE E'\1';"
cat "$(pwd)/coordinate-locations.tsv" | deepdive sql "copy wikidata_coordinate_locations from STDIN;"
cat "$(pwd)/transitive.tsv" | deepdive sql "copy wikidata_instanceof from STDIN;"

# import sentence parses (NLP output shipped with the repo)
gunzip -c "$(pwd)/../data/sentences.tsv.gz" | deepdive sql "COPY sentences from STDIN"

# intermediate view of distinct mention spans
# TODO move this to app.ddlog
# use ":-" to create a view/table. Don't know if DISTINCT is supported
# NOTE(review): assumes `locations` already has rows at init time — confirm
# the pipeline ordering.
deepdive sql 'INSERT INTO v_mentions SELECT DISTINCT sentence_id, mention_num, w_from, w_to FROM locations'

# install psql extensions needed by the earth_distance() inference rule.
# IF NOT EXISTS keeps a re-run from aborting under `set -e`.
deepdive sql "CREATE EXTENSION IF NOT EXISTS cube;"
deepdive sql "CREATE EXTENSION IF NOT EXISTS earthdistance;"
20 changes: 9 additions & 11 deletions run.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
#!/bin/bash

DIRNAME=`dirname $0`

. "${DIRNAME}/env_local.sh"

cd $DEEPDIVE_HOME
export PYTHONPATH=$DEEPDIVE_HOME/ddlib:$PYTHONPATH

### Compile and run:
sbt/sbt "run -c $APP_HOME/${APP_CONF:-application.conf} -o ${TMP_DIR}"
read -p "WARNING: Script will erase sentences in database. Are you sure? " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit
fi

dropdb geo
deepdive initdb
deepdive run
15 changes: 14 additions & 1 deletion script/fetch-wikidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,26 @@
import json
import csv
import os.path
import urllib.request

BASE_DIR, throwaway = os.path.split(os.path.realpath(__file__))
BASE_DIR = os.path.realpath(BASE_DIR + "/..")
DATA_DIR = BASE_DIR + '/data'

def get_latest_dump_name():
    """Return the filename of the newest dated Wikidata JSON dump.

    Scrapes the directory listing at dumps.wikimedia.org and picks the
    lexicographically greatest ``YYYYMMDD.json.gz`` entry, which is the most
    recent because the names embed the date.

    The previous implementation returned the anchor text of the FIRST link
    on the page (often ``../`` or another non-dump entry, and definitely not
    the latest dump) and raised ValueError on lines containing ``href`` but
    no ``>``/``<`` pair.

    Returns:
        str: a name such as ``"20150330.json.gz"``, or None when the
        listing contains no dated dump.
    """
    import re  # local import: only needed by this helper

    dump_name_re = re.compile(r'\d{8}\.json\.gz')
    names = []
    with urllib.request.urlopen('http://dumps.wikimedia.org/other/wikidata/') as f:
        for line in f:
            html = str(line, encoding='utf-8')
            names.extend(dump_name_re.findall(html))
    return max(names) if names else None

def download_wikidata():
DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/20150330.json.gz')
dump_name = get_latest_dump_name()
DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/'+dump_name)
ARCHIVE_FILENAME = 'dump.json.gz'
data_path = os.path.join(DATA_DIR, "wikidata")
archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
Expand Down
3 changes: 2 additions & 1 deletion script/get-reuters-json-csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def json_to_csv():
tsvout = csv.writer(tsvout)
for line in jsonin:
obj = json.loads(line)
tsvout.writerow([obj['id'], obj['body'], obj['title']])
if obj['body']:
tsvout.writerow([obj['id'], obj['body'].replace('\x7F', ''), obj['title']])
print("saved output as %s" % out_path)


Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-coordinate-locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def parse_json(line, w):
except KeyError:
print('ignoring keyerror', file=sys.stderr)

with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/coordinate-locations.tsv', 'w') as w:
with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/coordinate-locations.tsv', 'w') as w:
for line in f:
line = line.rstrip()
if line == '[' or line == ']':
Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-names.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def parse_json(line,w):
print(id + '\t' + lang + '\t' + 'alias' + '\t' + alias.replace('\t', ' ').replace('\n', ' '), file=w)


with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/names.tsv', 'w') as w:
with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/names.tsv', 'w') as w:
for line in f:
line = line.rstrip()
if line == '[' or line == ']':
Expand Down
2 changes: 1 addition & 1 deletion script/get-wikidata-transitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def get_items(sel, clazz, w):
print(str(id1) + '\t' + str(clazz), file=w)
last = id1

with open(DATA_DIR + '/wikidata/transitive.tsv', 'w') as w:
with open(BASE_DIR + '/input/transitive.tsv', 'w') as w:
build_map()
# compute transitive closure for each class
for clazz in clazzes:
Expand Down
12 changes: 6 additions & 6 deletions udf/extract_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Loc = collections.namedtuple('Loc', ['item_id', 'name'])

loc_ids_set = set()
with open(BASE_DIR + "/data/wikidata/transitive.tsv", 'rt') as transitive_file:
with open(BASE_DIR + "/input/transitive.tsv", 'rt') as transitive_file:
print('loading transitive.tsv', file=sys.stderr)
for line in transitive_file:
cols = line.split('\t')
Expand All @@ -24,7 +24,7 @@


cities_dict = dict()
with open(BASE_DIR + "/data/wikidata/names.tsv", 'rt') as cities_file:
with open(BASE_DIR + "/input/names.tsv", 'rt') as cities_file:
print('loading names.tsv', file=sys.stderr)
for line in cities_file:
cols = line.split('\t')
Expand Down Expand Up @@ -100,8 +100,8 @@ def generate_candidates(doc_id, sent_id, words, poses, phrases):
# if m.country_code == 'US' and m == loc:
# true_str = '1'

print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ]))
#print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ]))
print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ]))
#print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ]))

mention_num += 1

Expand All @@ -122,7 +122,7 @@ def generate_nnp_phrases(words, poses):
for line in input_files:
#print(line, file=sys.stderr)
doc_id, sent_id, words_str, poses_str = line.split('\t')
words = words_str.split(' ')
poses = poses_str.split(' ')
words = words_str.split('~^~')
poses = poses_str.split('~^~')
phrases = generate_nnp_phrases(words, poses)
generate_candidates(doc_id, sent_id, words, poses, phrases)