diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e3b24e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +run/ diff --git a/README.md b/README.md index ead2196..e5ca6be 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [Tutorial 1: Entity Linking for Locations](TUTORIAL1.md) -Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata. +Learn how to identify mentions of locations in news articles and unambiguously link them to entities in Wikidata. [DDLOG version](TUTORIAL1_ddlog.md) [Tutorial 2: Coreference Resolution within Documents](TUTORIAL2.md) Learn how to cluster mentions of the same entity within a document without the need for an entity database. diff --git a/TUTORIAL1_ddlog.md b/TUTORIAL1_ddlog.md new file mode 100644 index 0000000..dc1f336 --- /dev/null +++ b/TUTORIAL1_ddlog.md @@ -0,0 +1,461 @@ +# Tutorial: Entity Linking for Locations (in DDLOG) + +References to locations are ubiquitous in text, but many such references are ambiguous. For +example, Wikipedia lists [more than 30 locations](https://en.wikipedia.org/wiki/San_Francisco_(disambiguation)) named 'San Francisco', 10 songs with +that name, 2 movies, a magazine, and several other things as well. +In this tutorial, we develop a system that detects mentions of geographic locations +and links these unambiguously to a database of locations. + +We start with a corpus of 20,000 news articles, the [Reuters-21578 dataset](http://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.html), which +represents articles that appeared on the Reuters newswire in 1987. Our goal is to identify +mentions of locations in these articles and unambiguously link them to entities in [Wikidata](http://www.wikidata.org), +a community-edited database containing 14 million entities, including more than 2 million +geographic locations. 
+
+Using Wikidata as our database has three major advantages:
+
+* Wikidata contains not just locations but also many other types of entities in the real
+  world. This makes it easy to reuse the tools developed in this tutorial for other
+  entity linking tasks.
+* Wikidata is dense in the sense that it contains many attributes and relationships
+  between entities. As we will see, we can exploit this information to more accurately
+  disambiguate mentions of entities.
+* Wikidata has an active community enhancing the data, absorbing other available sources
+  (including Freebase) and adding open-data links to other resources. Wikidata is thus
+  growing quickly.
+
+This tutorial assumes that you are already familiar with setting up and running
+DeepDive applications.
+
+## Preparing the Reuters dataset
+
+We first download the Reuters corpus from the [UC Irvine repository](http://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.html).
+The original data is in SGML, which we convert to JSON for readability and to CSV for loading into the database. The following scripts perform these steps:
+
+    script/fetch-reuters.py
+    script/get-reuters-json-csv.py
+
+We create a symbolic link to `converted.csv` in the `input` folder so that the data can later be loaded into the database:
+
+    ln -s data/reuters/converted.csv input/converted.csv
+
+The articles are stored as strings of text. To more easily identify mentions, we would like
+to compute word boundaries by running a tokenizer, splitting sentences, and computing part-of-speech tags. DeepDive offers the
+`nlp_extractor` for this. Here we provide a precomputed parse dump, `data/sentences.tsv.gz`.
+
+
+
+## Preparing the Wikidata dataset
+
+We now download the Wikidata database as a [json dump](http://dumps.wikimedia.org/other/wikidata/).
+Again, we have a script for that:
+
+    script/fetch-wikidata.py
+
+Note that Wikidata's dumps are updated frequently, and you may need to update the path to the
+latest dump inside the script.
See the link above for the most recent dumps.
+
+After downloading, unpack it:
+
+    gunzip data/wikidata/dump.json.gz
+
+This dump contains much information that we don't need. We therefore extract the information we
+are interested in and store it in a format that we can load into our database system. First,
+we get a list of names (and aliases) for the entities in Wikidata:
+
+    script/get-wikidata-names.py
+
+This will create a file `input/names.tsv` with content like the following:
+
+```
+ 1 en label universe
+ 1 en alias cosmos
+ 1 en alias The Universe
+ 1 en alias Space
+ 8 en label happiness
+ 16 en label highway system
+ 19 en label place of birth
+ 19 en alias birthplace
+ 19 en alias born in
+ 19 en alias POB
+```
+
+The first column is Wikidata's unique identifier; to access information about an entity, add the prefix
+Q to its id and point your browser to `http://www.wikidata.org/wiki/Q[ID]`. The second column is the language, the third
+indicates whether a name is the canonical label or an alias, and the fourth column is the name itself.
+
+Our list of names covers all entities in Wikidata, but we would also like to know
+which of them refer to geographic locations. To obtain geographic locations, we must analyze the relations
+between entities in Wikidata. There is one entity named `geographic location` with id Q2221906.
+Other entities are connected to it by relations of type `instance of`, which has id P31.
+
+We can identify all entities that are instances of geographic locations by analyzing these relations,
+but we have to be careful: there exists another type of relation called `subclass of` with id P279.
+Most entities representing geographic locations are not direct instances of the entity `geographic location`,
+but rather of one of its subclasses such as `town`, `city`, or `country`. Even worse, geographic
+locations could be instances of a subclass of a subclass of `geographic location`.
To obtain all
+geographic locations, we must compute the transitive closure under the subclass relation.
+
+We first extract all triples of the relations `instance_of` (P31) and `subclass_of` (P279):
+
+    script/get-wikidata-relations.py
+
+This creates a file `data/wikidata/relations.tsv` containing triples of entity id, relation id, and entity id:
+
+```
+ 1 31 1454986
+ 8 31 331769
+ 8 31 9415
+ 19 31 18608756
+ 19 31 18608871
+ 19 31 18635217
+ 22 31 18608871
+```
+
+Next, we compute the transitive closure under `subclass_of` to obtain all instances of `geographic location` (Q2221906):
+
+    script/get-wikidata-transitive.py
+
+This script actually does more than that: it also computes the transitive closures for instances of `city`
+(Q515), `city with hundreds of thousands of inhabitants` (Q1549591), `city with millions of inhabitants`
+(Q1637706), and `country` (Q6256). As we will see, these distinctions can help us in scoring potential location disambiguations.
+The output is a file `input/transitive.tsv`:
+
+```
+ 31 2221906
+ 33 2221906
+ 45 2221906
+ 51 2221906
+ 55 2221906
+```
+
+We obtain 2,907,062 instances of class `geographic location`, 36,036 of type `city`, 401 of type `city with hundreds of thousands of inhabitants`,
+178 of type `city with millions of inhabitants`, and 2,148 of type `country`. Countries include ones that no longer exist, hence the large number.
+
+Finally, we would like to extract the latitude and longitude of each location. Again, this information will be
+useful in scoring location disambiguations.
+
+    script/get-wikidata-coordinate-locations.py
+
+This script creates a file `input/coordinate-locations.tsv` with triples of entity id, latitude, and longitude:
+
+```
+ 31 51 5
+ 33 64 26
+ 45 38.7 -9.1833333333333
+ 51 -90 0
+ 55 52.316666666667 5.55
+ 62 37.766666666667 -122.43333333333
+```
+
+We obtain this information for 2,139,073 entities.
+
+We now have all data ready for finding location mentions and linking them to Wikidata entity ids.
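For intuition, the transitive-closure computation performed by `script/get-wikidata-transitive.py` can be sketched in a few lines of Python. This is an illustrative simplification, not the actual script; the toy triples below are hypothetical stand-ins for rows of `relations.tsv`:

```python
from collections import defaultdict

def transitive_instances(triples, root, instance_of=31, subclass_of=279):
    """Return all items that are instances of `root` or of any
    (transitive) subclass of `root`."""
    subclasses = defaultdict(set)  # class id -> direct subclass ids
    instances = defaultdict(set)   # class id -> direct instance ids
    for item, rel, target in triples:
        if rel == subclass_of:
            subclasses[target].add(item)
        elif rel == instance_of:
            instances[target].add(item)
    # breadth-first traversal of the subclass hierarchy starting at `root`
    closure, frontier = {root}, [root]
    while frontier:
        for sub in subclasses[frontier.pop()]:
            if sub not in closure:
                closure.add(sub)
                frontier.append(sub)
    # union of the direct instances of every class in the closure
    return set.union(*(instances[c] for c in closure))

# toy triples (item, relation, target); 279 = subclass_of, 31 = instance_of
triples = [
    (515, 279, 2221906),   # city  subclass_of  geographic location
    (62,  31,  515),       # San Francisco  instance_of  city
    (19,  31,  18608756),  # unrelated item
]
print(transitive_instances(triples, 2221906))  # {62}
```

The real script applies the same idea separately for each of the five class ids listed above.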
+
+## Generating Candidates
+
+To find references to geographic locations, we first identify spans in the article text
+that may represent such references. We assume that geographic locations are typically
+referred to by named entities, so we compute all sequences of consecutive tokens
+with NNP part-of-speech tags. For example, for the sentence:
+
+    NNP  VBD  DT NN     IN NNP NNP       .
+    Cook gave a  speech in San Francisco .
+
+we would identify the two spans `Cook` and `San Francisco`. Our corpus contains
+202,055 such spans.
+
+A naive approach to our entity linking problem would simply return exact matches of these
+spans with names in our database. There are two problems with this:
+
+1. We may miss links, perhaps because a city has multiple names or spellings.
+
+2. We may get multiple links, because there are multiple cities with the same name.
+
+We can tackle the first problem by including alternate names from the database.
+We have therefore not only kept the label of each Wikidata entity, but also its
+aliases, which we can use for matching.
+
+The more important problem, however, is the second one: There are dozens of entities
+named `San Francisco`. And indeed, this is not a contrived example but a very general
+problem: On average we find 7 cities with the same name, across all mentions with
+matches in our database. How do we determine which one is referenced?
+
+As is typical for a DeepDive application, we are going to apply probabilistic inference
+to determine which location is most likely referenced. This requires splitting the
+problem into two tasks: generating candidates and assigning truth values to these
+candidates.
+
+Our candidates are pairs of mentions in the text and entities in the database,
+and we define a boolean random variable for each candidate to indicate whether the
+mention refers to the entity in the database.
We thus define the following type +of variable in `app.ddlog`: + +``` +@label(is_true) +link(mid) :- locations(mid, _, _, _, _, _, _, _, is_true, _). +``` + +For our candidate table, we choose the following schema: + +``` +locations ( + @key + mention_id text, + document_id text, + @references(relation="sentences", column="sentence_id") + sentence_id text, + mention_num int, + mention_str text, + @textspan_start() + w_from int, + w_to int, + loc_id int, + is_correct boolean, + features text[] +). +``` + +We add one row for each combination of named entity span and entity in our +location database. But wait – that is impossible! There are 202,055 spans and +2,139,073 geographic locations, hence 432,210,395,015 combinations. + +This means that we would need to do probabilistic inference over more than +432 billion variables, not an easy problem. + +To reduce the search space, we don't generate every possible combination of +named entity span and entry in our database, but only those that are +promising according to some heuristic. + +For simplicity, we only consider combinations for which we get an exact +string match. For example, for the mention `San Francisco` we only consider +the 30+ locations with name `San Francisco` but not any other. This may +limit our recall, since our text might contain other spellings referring +to the same entity, such as `S.F.`, `SanFran`, or `San Franzisko`. For each +entity linking problem, it is therefore important to come up with candidate +generation rules that reduce the search space but do not significantly +reduce recall. + +With our heuristic, we obtain 344,806 candidates covering 51,218 unique +mentions for which we found at least one match. This means that on average +we have to disambiguate among 7 alternative entities for each mention. + +## Probabilistic Inference + +To disambiguate mentions, we need to design features that allow the system +to differentially weight different mention-entity pairings. 
Both information
+about entities and information about the textual context of a mention may help.
+
+Information about entities in the database:
+
+* Locations that are larger or generally more important are more likely to
+  be referenced than others. For example, locations with a larger population,
+  or a country rather than a city, a city rather than a town.
+* Multiple locations referenced in the same document are more likely to be
+  close to each other. For example, if a document contains references to
+  different cities in Argentina and it also contains a mention `San Francisco`,
+  then it may be more likely that this mention refers to San Francisco, Córdoba
+  in Argentina and not San Francisco, California.
+
+Information about the context of a mention in text:
+
+* Words appearing before or after a mention may help to determine whether the mention
+  refers to a location, and which location it refers to. For example,
+  a prefix `baseball stadium in` makes it more
+  likely that a mention is indeed a location, that it's a city, that the city
+  is in the U.S., and that the city is one of those having a baseball stadium.
+* Other named entities appearing in the same sentence may help. For example, mentions
+  of `Washington` are more likely to refer to the nation's capital when the
+  president or Congress are named as well; conversely, they are more likely to
+  refer to the state when Seattle or Mount Rainier are mentioned.
+
+Let's now encode these intuitions as factors over our variables. In this section,
+we focus on factors about entities in the database and manually assign a weight
+to each factor. The following section then discusses factors about context and describes
+how we can learn the weights automatically from distantly supervised annotations.
+
+First, we would like to assign more weight to larger, more important locations.
+Population would be a great attribute to use for this, but Wikidata's population
+coverage is too sparse, so we instead use Wikidata's classification into `city`, `city with hundreds
+of thousands of inhabitants`, `city with millions of inhabitants`, and `country`.
+For each of these classes we create a factor of the following form:
+
+```
+# preference for cities
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 515).
+
+# x00K population
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 1549591).
+
+# xM population
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 1637706).
+
+# boost_countries
+@weight(5)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 6256).
+```
+
+We give larger weights to classes of larger locations; for details see [app.ddlog](app.ddlog).
+
+Next, we would like to give preference to subsequently mentioned cities that
+are geographically close to each other:
+
+```
+# prefer if subsequently mentioned cities are within 1000 km (earth_distance returns meters)
+@weight(3)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, doc_id, _, _, _, _, _, loc_id1, _, _),
+    locations(mid2, doc_id, _, _, _, _, _, loc_id2, _, _),
+    wikidata_coordinate_locations(loc_id1, lat1, lon1),
+    wikidata_coordinate_locations(loc_id2, lat2, lon2),
+    [earth_distance(ll_to_earth(lat1,lon1), ll_to_earth(lat2,lon2)) < 1000000].
+```
+
+Note: In order to compute distances between geographic locations, you must
+install the [cube](http://www.postgresql.org/docs/9.4/static/cube.html) and
+[earthdistance](http://www.postgresql.org/docs/9.4/static/earthdistance.html) extensions in PostgreSQL.
+See the [contrib documentation](http://www.postgresql.org/docs/9.4/static/contrib.html) for more
+information on how to install these extensions.
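To make the proximity heuristic concrete: `earth_distance` computes a great-circle distance, which can be approximated in plain Python with the haversine formula. This sketch is only for illustration and is not part of the application; the coordinates below are approximate:

```python
from math import radians, sin, cos, asin, sqrt

def km_distance(lat1, lon1, lat2, lon2):
    """Approximate great-circle distance in kilometers (haversine formula)."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 \
        + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))  # 6371 km: mean Earth radius

# San Francisco, California vs. San Francisco, Córdoba (Argentina)
sf_ca = (37.77, -122.43)
sf_ar = (-31.43, -62.08)
print(km_distance(*sf_ca, *sf_ar) < 1000)  # False: the two are far apart
```

A factor like the one above then rewards pairs of candidate entities in the same document whose distance falls under the threshold.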
+
+Finally, we must ensure that the system maps each mention to at most
+one location entity. We encode this constraint using a factor that gives a penalty
+when two variables of the same mention have a positive boolean value:
+
+```
+# one of n
+@weight(-10)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, _, sentence_id, mention_num, _, _, _, _, _, _),
+    locations(mid2, _, sentence_id, mention_num, _, _, _, _, _, _).
+```
+
+[comment]: # (Although there's noise in the output, many locations are resolved correctly, for example:)
+
+[comment]: # (```)
+[comment]: # (London | 84 | The SES is discussing the idea with the London and New York authorities .)
+[comment]: # (Shanghai | 8686 | It said the venture will be based in Shanghai and produce agents for use in hotels and industries .)
+[comment]: # (Tianjin | 11736 | China has signed a 130 mln dlr loan agreement with the World Bank to partly finance 12 new berths with an annual capacity of 6.28 mln tonnes at the 20 mln tonne a year capacity Tianjin port , the New China News Agency said .)
+[comment]: # (```)
+
+[comment]: # (You can verify the target locations by opening Wikidata's pages for [Q84](http://www.wikidata.org/wiki/Q84), [Q8686](http://www.wikidata.org/wiki/Q8686), and [Q11736](http://www.wikidata.org/wiki/Q11736) and Reuters' full articles.)
+
+[comment]: # (We are now going to try to remove noise. )
+Many of the incorrect matches are due to very small, little-known locations
+matching words in the text that are ambiguous and have a different meaning in the article.
+To reduce such coincidental matches, we will assign a very small negative weight to every match. This means that
+in the absence of additional evidence, the system will not generate a link. However, when there is additional
+evidence, for example multiple nearby locations matching in an article, the evidence will outweigh this
+negative prior.
+
+```
+# negative_bias
+@weight(-1)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, _, _, _).
+```
+
+[comment]: # (You can see that the precision goes up significantly when adding this factor.)
+
+Another source of errors is the same word appearing in an article multiple times
+and being mapped to different locations. Since it is unlikely that the author meant
+to refer to different locations with the same name in the same article, we can
+again assign a small penalty for this case:
+
+```
+# penalize same word mapped to different location
+# same_to_same
+@weight(-3)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, doc_id, _, mention_num1, mention_str, _, _, loc1, _, _),
+    locations(mid2, doc_id, _, mention_num2, mention_str, _, _, loc2, _, _).
+```
+
+At this point, we have a functioning entity-linking system for locations.
+Run `./run.sh` and inspect the outputs (use `deepdive sql` to open an SQL prompt):
+
+```sql
+SELECT mention_str, loc_id, sentence
+FROM locations_is_correct_inference l, sentences s
+WHERE l.sentence_id = s.sentence_id
+AND expectation > .9
+ORDER BY random()
+LIMIT 100;
+```
+
+Feel free to analyze the results and encode additional intuitions.
+
+
+## Weight learning (TODO; unfinished)
+
+So far, we have manually set weights for our factors based on intuitions. These weights,
+however, may not be optimal, and we may obtain more accurate results by learning weights
+from data. Furthermore, we would like to leverage a large number of distinct features
+about the context of a mention. It would be difficult or impossible to manually assign
+weights to such features.
+
+To learn weights, we must make two changes to our DeepDive application:
+
+1.
We must replace our manually set weights with random variables.
+
+2. We must provide annotations on a subset of the variables.
+
+While manually annotating data is expensive, we can write distant supervision rules
+to generate annotations more efficiently.
+
+Here are a variety of ideas for distant supervision rules:
+
+1. annotate unambiguous locations
+2. annotate locations that can be disambiguated by zip codes and phone area codes
+   appearing in the same document
+3. many documents contain references to companies and persons; use background
+   information from Wikidata for disambiguation
+4. find matches to other (non-location) Wikidata entities; if these share a relation
+   with a location appearing in the same document, annotate
+5. write prefix/suffix patterns that have high precision
+6. meta-information in the corpus allows disambiguation (e.g., document tags such as `U.S. national`)
+
+Our distant supervision rules use a combination of 1 and 5.
+
+We have also created an extractor that populates a table called `context_features` with
+features for phrases appearing before or after a mention, and other named entities appearing
+in the same sentence. These features are then added to our inference with the following factor:
+
+```
+@weight(f)
+link(mid) :-
+    locations(_, _, sentence_id, mention_num, _, _, _, loc_id, _, _),
+    context_features(sentence_id, mention_num, f).
+```
+
+We leave it as an exercise to expand the set of features and add additional rules
+for distant supervision.
+
+Run `./run.sh` and inspect the outputs as described in the previous section.
+
+## Plotting locations on a map
+
+You can use [Cartopy](https://github.com/SciTools/cartopy) (Python) or [Mapbox](http://www.mapbox.com) to visualize
+the locations on a map.
diff --git a/app.ddlog b/app.ddlog
new file mode 100644
index 0000000..29adb59
--- /dev/null
+++ b/app.ddlog
@@ -0,0 +1,178 @@
+articles(
+    @key
+    id int,
+    @searchable
+    body text,
+    @searchable
+    title text
+).
+ +@source +sentences( + document_id int, # which document it comes from + sentence_offset int, # which sentence (0, 1, 2...) is it in document + sentence text, # sentence content + words text[], # array of words in this sentence + lemma text[], # array of lemmatized words + pos_tags text[], # array of part-of-speech tags + ner_tags text[], # array of named entity tags (PERSON, LOCATION, etc) + char_offsets int[], # array of character offsets (begin) + dep_labels text[], # array of dependency labels + dep_parents int[], + sentence_id text # unique identifier for sentences + ). + +wikidata_names ( + item_id int, + language text, + label text, + name text +). + +wikidata_instanceof ( + item_id int, + clazz_id int +). + +wikidata_coordinate_locations ( + item_id int, + latitude float, + longitude float +). + +context_features ( + sentence_id text, + mention_num int, + features text[] +). + +@extraction +locations ( + @key + mention_id text, + document_id text, + @references(relation="sentences", column="sentence_id") + sentence_id text, + mention_num int, + mention_str text, + @textspan_start() + w_from int, + w_to int, + loc_id int, + is_correct boolean, + features text[] +). + +v_mentions( + sentence_id text, + mention_num int, + w_from int, + w_to int +). + +link?( + mention_id text). + +# process the text +#function extract_preprocess over (id int, body text) +# returns rows like sentences +# implementation "../deepdive/examples/nlp_extractor/run.sh -k id -v body -l 100 -t 1 -a tokenize,ssplit,pos" handles json lines. + +#sentences += +# extract_preprocess(id, body) :- +# articles(id, body, _). + +# extract pairs +function extract_pairs over (document_id int, sentence_id text, words text, pos_tags text) + returns rows like locations + implementation "udf/extract_pairs.py" handles tsv lines. + +locations += + extract_pairs(doc_id, id, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :- + sentences(doc_id, _, _, words, _, pos_tags, _, _, _, _, id). 
+
+# extract context features
+function extract_context_features over (sentence_id text, mention_num int, w_from int, w_to int, words text, pos_tags text)
+    returns rows like context_features
+    implementation "udf/extract_context_features.py" handles tsv lines.
+
+context_features +=
+    extract_context_features(sentence_id, mention_num, w_from, w_to, ARRAY_TO_STRING(words, "~^~"), ARRAY_TO_STRING(pos_tags, "~^~")) :-
+    sentences(_, _, _, words, _, pos_tags, _, _, _, _, sentence_id),
+    v_mentions(sentence_id, mention_num, w_from, w_to).
+
+# TODO supervise (not sure if this helps)
+#function supervise over (document_id int, sentence text, words, )
+#    returns rows like locations
+#    implementation "supervise_locations.py.save" handles tsv lines.
+
+#locations +=
+#    supervise(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id) :-
+#    locations(mention_id, sent_id, mention_num, mention_str, w_from, w_to, loc_id, _, _)
+
+# label
+@label(is_true)
+link(mid) :- locations(mid, _, _, _, _, _, _, _, is_true, _).
+
+# negative_bias
+@weight(-1)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, _, _, _).
+
+# one_of_n_features
+# TODO: what if the entity doesn't exist in the KB
+@weight(-10)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, _, sentence_id, mention_num, _, _, _, _, _, _),
+    locations(mid2, _, sentence_id, mention_num, _, _, _, _, _, _).
+
+# prefer if subsequently mentioned cities are within 1000 km (earth_distance returns meters)
+# consecutive_in_proximity
+@weight(3)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, doc_id, _, _, _, _, _, loc_id1, _, _),
+    locations(mid2, doc_id, _, _, _, _, _, loc_id2, _, _),
+    wikidata_coordinate_locations(loc_id1, lat1, lon1),
+    wikidata_coordinate_locations(loc_id2, lat2, lon2),
+    [earth_distance(ll_to_earth(lat1,lon1), ll_to_earth(lat2,lon2)) < 1000000].
+
+# penalize same word mapped to different location
+# same_to_same
+@weight(-3)
+link(mid1) ^ link(mid2) :-
+    locations(mid1, doc_id, _, mention_num1, mention_str, _, _, loc1, _, _),
+    locations(mid2, doc_id, _, mention_num2, mention_str, _, _, loc2, _, _).
+
+# prefer larger cities
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 515).
+
+# x00K population
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 1549591).
+
+# xM population
+@weight(2)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 1637706).
+
+# boost_countries
+@weight(5)
+link(mid) :-
+    locations(mid, _, _, _, _, _, _, loc_id, _, _),
+    wikidata_instanceof(loc_id, 6256).
+
+# context features
+@weight(f)
+link(mid) :-
+    locations(_, _, sentence_id, mention_num, _, _, _, loc_id, _, _),
+    context_features(sentence_id, mention_num, f).
+
+
+
diff --git a/data/sentences.tsv.gz b/data/sentences.tsv.gz
new file mode 100644
index 0000000..4f8a82b
Binary files /dev/null and b/data/sentences.tsv.gz differ
diff --git a/db.url b/db.url
new file mode 100644
index 0000000..ba4f5f2
--- /dev/null
+++ b/db.url
@@ -0,0 +1 @@
+postgresql://localhost/geo
diff --git a/env_local.sh.TEMPLATE b/env_local.sh.TEMPLATE
index 038ef51..7f87977 100644
--- a/env_local.sh.TEMPLATE
+++ b/env_local.sh.TEMPLATE
@@ -2,6 +2,7 @@
 export DEEPDIVE_HOME=`pwd`/../deepdive
 export APP_HOME=`pwd`
 export MEMORY="8g"
+export INPUT_BATCH_SIZE=10000
 export PARALLELISM=4
 export DBNAME="geo"
 export PGHOST="localhost"
diff --git a/input/init.sh b/input/init.sh
new file mode 100755
index 0000000..fcd4632
--- /dev/null
+++ b/input/init.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# A script for loading the Reuters and Wikidata data into the PostgreSQL database
+set -eux
+#cd "$(dirname "$0")"
+
+#bzcat ./articles_dump.csv.bz2 | deepdive sql "COPY articles FROM STDIN CSV"
+#bzcat ./sentences_dump.csv.bz2 |
+#if
[[ -z ${SUBSAMPLE_NUM_SENTENCES:-} ]]; then cat; else head -n ${SUBSAMPLE_NUM_SENTENCES}; fi |
+#deepdive sql "COPY sentences FROM STDIN CSV"
+
+
+
+cd "$(dirname "$0")"/..
+
+. ./env_local.sh
+
+
+cd "$(dirname "$0")"
+
+# article content
+cat "$(pwd)"/converted.csv | deepdive sql "copy articles from STDIN csv"
+
+# wikidata
+cat "$(pwd)"/names.tsv | deepdive sql "copy wikidata_names from STDIN CSV DELIMITER E'\t' QUOTE E'\1';"
+cat "$(pwd)"/coordinate-locations.tsv | deepdive sql "copy wikidata_coordinate_locations from STDIN;"
+cat "$(pwd)"/transitive.tsv | deepdive sql "copy wikidata_instanceof from STDIN;"
+
+# import sentence parses
+gunzip -c "$(pwd)"/../data/sentences.tsv.gz | deepdive sql "COPY sentences from STDIN"
+
+# intermediate view
+# TODO move this to app.ddlog
+# use ":-" to create a view/table. Don't know if DISTINCT is supported
+deepdive sql 'INSERT INTO v_mentions SELECT DISTINCT sentence_id, mention_num, w_from, w_to FROM locations'
+
+
+# install the PostgreSQL extensions needed for earth_distance
+deepdive sql "CREATE EXTENSION IF NOT EXISTS cube;"
+deepdive sql "CREATE EXTENSION IF NOT EXISTS earthdistance;"
diff --git a/run.sh b/run.sh
index 449191a..aa32462 100755
--- a/run.sh
+++ b/run.sh
@@ -1,11 +1,10 @@
-#!/bin/bash
-
-DIRNAME=`dirname $0`
-
-. "${DIRNAME}/env_local.sh"
-
-cd $DEEPDIVE_HOME
-export PYTHONPATH=$DEEPDIVE_HOME/ddlib:$PYTHONPATH
-
-### Compile and run:
-sbt/sbt "run -c $APP_HOME/${APP_CONF:-application.conf} -o ${TMP_DIR}"
+#!/usr/bin/env bash
+read -p "WARNING: this will drop and rebuild the database. Are you sure? " -n 1 -r
+echo
+if [[ !
$REPLY =~ ^[Yy]$ ]]; then
+    exit
+fi
+
+dropdb --if-exists geo
+deepdive initdb
+deepdive run
diff --git a/script/fetch-wikidata.py b/script/fetch-wikidata.py
index 81ae01e..4141b6f 100755
--- a/script/fetch-wikidata.py
+++ b/script/fetch-wikidata.py
@@ -10,13 +10,26 @@
 import json
 import csv
 import os.path
+import urllib.request
 
 BASE_DIR, throwaway = os.path.split(os.path.realpath(__file__))
 BASE_DIR = os.path.realpath(BASE_DIR + "/..")
 DATA_DIR = BASE_DIR + '/data'
 
+def get_latest_dump_name():
+    name = None
+    with urllib.request.urlopen('http://dumps.wikimedia.org/other/wikidata/') as f:
+        for line in f:
+            html = str(line, encoding='utf-8')
+            if '.json.gz' not in html:
+                continue
+            start_idx = html.index('>')+1
+            end_idx = html.index('<', start_idx)
+            name = html[start_idx:end_idx]
+    # the listing is in ascending date order, so the last match is the latest dump
+    return name
+
 def download_wikidata():
-    DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/20150330.json.gz')
+    dump_name = get_latest_dump_name()
+    DOWNLOAD_URL = ('http://dumps.wikimedia.org/other/wikidata/'+dump_name)
     ARCHIVE_FILENAME = 'dump.json.gz'
     data_path = os.path.join(DATA_DIR, "wikidata")
     archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
diff --git a/script/get-reuters-json-csv.py b/script/get-reuters-json-csv.py
index d4c2059..e65e094 100755
--- a/script/get-reuters-json-csv.py
+++ b/script/get-reuters-json-csv.py
@@ -121,7 +121,8 @@ def json_to_csv():
     tsvout = csv.writer(tsvout)
     for line in jsonin:
         obj = json.loads(line)
-        tsvout.writerow([obj['id'], obj['body'], obj['title']])
+        if obj['body']:
+            tsvout.writerow([obj['id'], obj['body'].replace('\x7F', ''), obj['title']])
 
     print("saved output as %s" % out_path)
diff --git a/script/get-wikidata-coordinate-locations.py b/script/get-wikidata-coordinate-locations.py
index 3fd31d8..c9f3743 100755
--- a/script/get-wikidata-coordinate-locations.py
+++ b/script/get-wikidata-coordinate-locations.py
@@ -28,7 +28,7 @@ def parse_json(line, w):
     except KeyError:
         print('ignoring keyerror', file=sys.stderr)
 
-with open(DATA_DIR + 
'/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/coordinate-locations.tsv', 'w') as w: +with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/coordinate-locations.tsv', 'w') as w: for line in f: line = line.rstrip() if line == '[' or line == ']': diff --git a/script/get-wikidata-names.py b/script/get-wikidata-names.py index b9ba8b6..c4b0896 100755 --- a/script/get-wikidata-names.py +++ b/script/get-wikidata-names.py @@ -26,7 +26,7 @@ def parse_json(line,w): print(id + '\t' + lang + '\t' + 'alias' + '\t' + alias.replace('\t', ' ').replace('\n', ' '), file=w) -with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(DATA_DIR + '/wikidata/names.tsv', 'w') as w: +with open(DATA_DIR + '/wikidata/dump.json', 'r') as f, open(BASE_DIR + '/input/names.tsv', 'w') as w: for line in f: line = line.rstrip() if line == '[' or line == ']': diff --git a/script/get-wikidata-transitive.py b/script/get-wikidata-transitive.py index 343c450..ceb2f21 100755 --- a/script/get-wikidata-transitive.py +++ b/script/get-wikidata-transitive.py @@ -63,7 +63,7 @@ def get_items(sel, clazz, w): print(str(id1) + '\t' + str(clazz), file=w) last = id1 -with open(DATA_DIR + '/wikidata/transitive.tsv', 'w') as w: +with open(BASE_DIR + '/input/transitive.tsv', 'w') as w: build_map() # compute transitive closure for each class for clazz in clazzes: diff --git a/udf/extract_pairs.py b/udf/extract_pairs.py index 57fe10e..e64bafd 100755 --- a/udf/extract_pairs.py +++ b/udf/extract_pairs.py @@ -12,7 +12,7 @@ Loc = collections.namedtuple('Loc', ['item_id', 'name']) loc_ids_set = set() -with open(BASE_DIR + "/data/wikidata/transitive.tsv", 'rt') as transitive_file: +with open(BASE_DIR + "/input/transitive.tsv", 'rt') as transitive_file: print('loading transitive.tsv', file=sys.stderr) for line in transitive_file: cols = line.split('\t') @@ -24,7 +24,7 @@ cities_dict = dict() -with open(BASE_DIR + "/data/wikidata/names.tsv", 'rt') as cities_file: +with open(BASE_DIR + 
"/input/names.tsv", 'rt') as cities_file: print('loading names.tsv', file=sys.stderr) for line in cities_file: cols = line.split('\t') @@ -100,8 +100,8 @@ def generate_candidates(doc_id, sent_id, words, poses, phrases): # if m.country_code == 'US' and m == loc: # true_str = '1' - print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ])) - #print('\t'.join(['\\N', mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ])) + print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), '\\N', features_str ])) + #print('\t'.join([ mention_id, str(doc_id), str(sent_id), str(mention_num), mention_str, str(phrase[0]), str(phrase[1]), str(loc.item_id), true_str, features_str ])) mention_num += 1 @@ -122,7 +122,7 @@ def generate_nnp_phrases(words, poses): for line in input_files: #print(line, file=sys.stderr) doc_id, sent_id, words_str, poses_str = line.split('\t') - words = words_str.split(' ') - poses = poses_str.split(' ') + words = words_str.split('~^~') + poses = poses_str.split('~^~') phrases = generate_nnp_phrases(words, poses) generate_candidates(doc_id, sent_id, words, poses, phrases)