Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 23 additions & 150 deletions data-loading/setup-and-load-solr.sh
Original file line number Diff line number Diff line change
@@ -1,162 +1,35 @@
#!/usr/bin/env bash

SOLR_PORT=8983

#######################################
# Probe the Solr core-admin endpoint once.
# Globals:   SOLR_PORT (read)
# Outputs:   the probed URL, then the status code reported by curl, on stdout
# Returns:   0 if the endpoint answered with HTTP 200, non-zero otherwise
#######################################
is_solr_up(){
  local url="http://localhost:$SOLR_PORT/solr/admin/cores"
  local http_code
  echo "Checking if solr is up on $url"
  # -o /dev/null discards the body; -w prints only the HTTP status code.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
  echo "$http_code"
  # Quoted comparison: an empty result (e.g. curl could not run) just means
  # "not up" instead of triggering a `test` syntax error.
  [[ "$http_code" == "200" ]]
}

# Block until is_solr_up reports success, re-checking every 3 seconds.
wait_for_solr(){
  until is_solr_up; do
    sleep 3
  done
}

# Do not touch the collection API until Solr answers.
wait_for_solr

# The URLs below are built from SOLR_PORT so they target the same instance
# that wait_for_solr has just probed.

# add collection
curl -X POST "http://localhost:${SOLR_PORT}/solr/admin/collections?action=CREATE&name=name_lookup&numShards=1&replicationFactor=1"

# do not autocreate fields
curl "http://localhost:${SOLR_PORT}/solr/name_lookup/config" -d '{"set-user-property": {"update.autoCreateFields": "false"}}'

# The schema URL is built from SOLR_PORT so it targets the same instance as
# the readiness probe.

# add lowercase text type (StandardTokenizerFactory + LowerCaseFilterFactory)
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-field-type" : {
"name": "LowerTextField",
"class": "solr.TextField",
"positionIncrementGap": "100",
"analyzer": {
"tokenizer": {
"class": "solr.StandardTokenizerFactory"
},
"filters": [{
"class": "solr.LowerCaseFilterFactory"
}]
}
}
}' "http://localhost:${SOLR_PORT}/solr/name_lookup/schema"

# add exactish text type (as described at https://stackoverflow.com/a/29105025/27310)
# (KeywordTokenizerFactory + LowerCaseFilterFactory)
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-field-type" : {
"name": "exactish",
"class": "solr.TextField",
"positionIncrementGap": "100",
"analyzer": {
"tokenizer": {
"class": "solr.KeywordTokenizerFactory"
},
"filters": [{
"class": "solr.LowerCaseFilterFactory"
}]
}
}
}' "http://localhost:${SOLR_PORT}/solr/name_lookup/schema"



# add fields
# All fields are sent in a single add-field request. The payload must be valid
# JSON: the "types" entry previously lacked the comma between "stored" and
# "multiValued". The URL is built from SOLR_PORT to match the readiness probe.
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-field": [
{
"name":"names",
"type":"LowerTextField",
"indexed":true,
"stored":true,
"multiValued":true
},
{
"name":"names_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":true
},
{
"name":"curie",
"type":"string",
"stored":true
},
{
"name":"preferred_name",
"type":"LowerTextField",
"stored":true
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"types",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"shortest_name_length",
"type":"pint",
"stored":true
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
},
{
"name":"taxa",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"taxon_specific",
"type":"boolean",
"stored":true,
"multiValued":false,
"sortMissingLast":true
},
{
"name":"clique_identifier_count",
"type":"pint",
"stored":true
}
] }' "http://localhost:${SOLR_PORT}/solr/name_lookup/schema"

# Add a copy field to copy names into names_exactish.
# The URL is built from SOLR_PORT so it targets the same instance as the
# readiness probe.
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-copy-field": {
"source": "names",
"dest": "names_exactish"
}
}' "http://localhost:${SOLR_PORT}/solr/name_lookup/schema"
# We don't use set -e because the loop test relies on failures being ignored.
set -uo pipefail

# Configuration options
# SOLR_SERVER may be supplied by the environment; the default is the
# previously hard-coded local address.
SOLR_SERVER="${SOLR_SERVER:-http://localhost:8983}"

# Step 1. Make sure the Solr service is up and running.
HEALTH_ENDPOINT="${SOLR_SERVER}/solr/admin/cores?action=STATUS"

# Print the status code of the last HTTP response wget reported for
# HEALTH_ENDPOINT (an empty line when there was none). Taking only the last
# one keeps the value a single token even when several responses are logged.
solr_health_status() {
  wget --spider --server-response "${HEALTH_ENDPOINT}" 2>&1 \
    | awk '/HTTP\// { code = $2 } END { print code }'
}

# Check first, then sleep and re-check, so the "unavailable" message is only
# printed after a probe that actually failed.
response=$(solr_health_status)
until [[ "$response" == "200" ]]; do
  echo " -- SOLR is unavailable - sleeping"
  sleep 3
  response=$(solr_health_status)
done
echo "SOLR is up and running at ${SOLR_SERVER}."

# Register a copy-field rule: preferred_name -> preferred_name_exactish.
curl --request POST \
  --header 'Content-type:application/json' \
  --data-binary '{
"add-copy-field": {
"source": "preferred_name",
"dest": "preferred_name_exactish"
}
}' \
  'http://localhost:8983/solr/name_lookup/schema'
# Step 2. Create fields for search.
# Resolve the directory holding this script so setup_solr.sh is found no
# matter what the current working directory is.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/setup_solr.sh"
echo 'Solr database has been set up.'

# Step 3. Load specified files.
# NOTE(review): $1 is expanded unquoted, so it is word-split and glob-expanded
# here; callers appear to pass a list or pattern as one argument — confirm
# before quoting it.
for f in $1; do
  echo "Loading $f..."
  # curl -d @$f needs to load the entire file into memory before uploading it, whereas
  # curl -X POST -T $f will stream it. See https://github.com/TranslatorSRI/NameResolution/issues/194
  curl -H 'Content-Type: application/json' -X POST -T "$f" \
    "$SOLR_SERVER/solr/name_lookup/update/json/docs?processor=uuid&uuid.fieldName=id&commit=true"
  # Pause between uploads (presumably to let Solr settle after the commit —
  # TODO confirm the required duration).
  sleep 60
done

# Query with rows=0 so only the match count for *:* is returned.
echo "Check solr"
curl -s --negotiate -u: "$SOLR_SERVER/solr/name_lookup/query?q=*:*&rows=0"

147 changes: 147 additions & 0 deletions data-loading/setup_solr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env bash
#
# Set up the fields and types needed by NameRes.
#
# This file should be sourced, not called directly.

# Refuse to run as a standalone program: in that case the first BASH_SOURCE
# entry is this file itself and therefore equals $0.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  echo "Must be sourced: source $0" >&2
  exit 1
fi

# Abort with a message unless the caller provided a non-empty SOLR_SERVER.
: "${SOLR_SERVER:?SOLR_SERVER must be set}"

printf "Setting up Solr database with SOLR_SERVER='%s'\n" "$SOLR_SERVER"

# Create the name_lookup collection (numShards=1, replicationFactor=1).
curl --request POST \
  "${SOLR_SERVER}/solr/admin/collections?action=CREATE&name=name_lookup&numShards=1&replicationFactor=1"

# Set update.autoCreateFields to "false" on the collection.
curl --data '{"set-user-property": {"update.autoCreateFields": "false"}}' \
  "${SOLR_SERVER}/solr/name_lookup/config"

# Field type "LowerTextField": StandardTokenizerFactory followed by
# LowerCaseFilterFactory.
curl --request POST \
  --header 'Content-type:application/json' \
  --data-binary '{
"add-field-type" : {
"name": "LowerTextField",
"class": "solr.TextField",
"positionIncrementGap": "100",
"analyzer": {
"tokenizer": {
"class": "solr.StandardTokenizerFactory"
},
"filters": [{
"class": "solr.LowerCaseFilterFactory"
}]
}
}
}' \
  "${SOLR_SERVER}/solr/name_lookup/schema"

# Field type "exactish": KeywordTokenizerFactory followed by
# LowerCaseFilterFactory (approach from https://stackoverflow.com/a/29105025/27310).
curl --request POST \
  --header 'Content-type:application/json' \
  --data-binary '{
"add-field-type" : {
"name": "exactish",
"class": "solr.TextField",
"positionIncrementGap": "100",
"analyzer": {
"tokenizer": {
"class": "solr.KeywordTokenizerFactory"
},
"filters": [{
"class": "solr.LowerCaseFilterFactory"
}]
}
}
}' \
  "${SOLR_SERVER}/solr/name_lookup/schema"



# add fields
# All fields are sent in a single add-field request. The payload must be valid
# JSON: the "types" entry previously lacked the comma between "stored" and
# "multiValued".
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-field": [
{
"name":"names",
"type":"LowerTextField",
"indexed":true,
"stored":true,
"multiValued":true
},
{
"name":"names_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":true
},
{
"name":"curie",
"type":"string",
"stored":true
},
{
"name":"preferred_name",
"type":"LowerTextField",
"stored":true
},
{
"name":"preferred_name_exactish",
"type":"exactish",
"indexed":true,
"stored":false,
"multiValued":false
},
{
"name":"types",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"shortest_name_length",
"type":"pint",
"stored":true
},
{
"name":"curie_suffix",
"type":"plong",
"docValues":true,
"stored":true,
"required":false,
"sortMissingLast":true
},
{
"name":"taxa",
"type":"string",
"stored":true,
"multiValued":true
},
{
"name":"taxon_specific",
"type":"boolean",
"stored":true,
"multiValued":false,
"sortMissingLast":true
},
{
"name":"clique_identifier_count",
"type":"pint",
"stored":true
}
] }' "$SOLR_SERVER/solr/name_lookup/schema"

# Copy-field rule: names -> names_exactish.
curl --request POST \
  --header 'Content-type:application/json' \
  --data-binary '{
"add-copy-field": {
"source": "names",
"dest": "names_exactish"
}
}' \
  "${SOLR_SERVER}/solr/name_lookup/schema"

# Copy-field rule: preferred_name -> preferred_name_exactish.
curl --request POST \
  --header 'Content-type:application/json' \
  --data-binary '{
"add-copy-field": {
"source": "preferred_name",
"dest": "preferred_name_exactish"
}
}' \
  "${SOLR_SERVER}/solr/name_lookup/schema"
16 changes: 9 additions & 7 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
services:
solr:
container_name: name_solr
nameres_solr:
container_name: nameres_solr
image: solr:9.1
mem_limit: 18G
environment:
# Change this setting to control how much memory you would like your Solr setup to have.
# Note that your Docker will need to be configured to allow this amount of memory.
SOLR_JAVA_MEM: '-Xms25G -Xmx25G'
SOLR_JAVA_MEM: '-Xmx16G'
ports:
- '8983:8983'
command: ['-DzkRun']
Expand All @@ -17,13 +18,14 @@ services:
source: ./data/solr
target: /var/solr/data

nameres:
container_name: nameres
nameres_web:
container_name: nameres_web
platform: linux/amd64
environment:
- SOLR_HOST=name_solr
- SOLR_HOST=nameres_solr
- BABEL_VERSION= # e.g. 2025mar31
- BABEL_VERSION_URL= # The URL of the Babel version URL
- LOCATION_VALUE=RENCI
- LOCATION_VALUE=localhost
- MATURITY_VALUE=development
ports:
- '2433:2433'
Expand Down
Loading