diff --git a/Dockerfile b/Dockerfile
index 024de9f2f..559ae1367 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,7 @@ WORKDIR /app
 COPY . /app
 RUN apk add --no-cache --virtual build-dependencies python make git g++ && \
+    apk add --no-cache curl jq && \
     npm install && \
     chown -R node:node . && \
     npm cache clean --force && \
diff --git a/documentation/DataLoader.md b/documentation/DataLoader.md
new file mode 100644
index 000000000..acee04e7a
--- /dev/null
+++ b/documentation/DataLoader.md
@@ -0,0 +1,66 @@
+# CouchDB Data Loader
+
+(`scripts/deleteAndLoadSnapsets.sh`)
+
+This script is used to set up the CouchDB database and is executed as a Kubernetes batch Job every time a new version of
+the universal image is deployed to the cluster (also when the cluster is initially created).
+
+It does the following:
+
+- Converts the preferences in universal into `snapset` Prefs Safes and GPII Keys,
+- Optionally deletes the existing database,
+- Creates a CouchDB database if none exists,
+- Updates the database with respect to its `design/views` document, as required,
+- Loads the latest snapsets created into the database.
+
+## Environment Variables
+
+- `COUCHDB_URL`: URL of the CouchDB database. (required)
+- `CLEAR_INDEX`: If set to `true`, the database at $COUCHDB_URL will be deleted and recreated. (optional)
+- `STATIC_DATA_DIR`: The directory where the static data to be loaded into CouchDB resides. (optional)
+- `BUILD_DATA_DIR`: The directory where the data built from the conversion step resides. (optional)
+
+The use of environment variables for data directories is useful if you want to mount the database data using a Docker
+volume and point the data loader at it.
+
+Note that since [Docker does not support array-valued environment
+variables](https://github.com/moby/moby/issues/20169), two separate environment variables are used for inputting data
+directories instead of one array that holds these directories.
+
+## Running
+
+Example using containers:
+
+```bash
+$ docker run -d -p 5984:5984 --name couchdb couchdb
+$ docker run --rm --link couchdb -e COUCHDB_URL=http://couchdb:5984/gpii \
+    -e CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh
+$ docker run -d -p 8081:8081 --name preferences --link couchdb \
+    -e NODE_ENV=gpii.config.preferencesServer.standalone.production \
+    -e PREFERENCESSERVER_LISTEN_PORT=8081 -e DATASOURCE_HOSTNAME=http://couchdb \
+    -e DATASOURCE_PORT=5984 vagrant-universal
+```
+
+Below are two versions of loading CouchDB data from a different location (e.g.
+/home/vagrant/sync/universal/testData/dbData for the static data directory and /home/vagrant/sync/universal/build/dbData
+for the build data directory). The first version has the optional `CLEAR_INDEX` set to true to erase and reset the
+database prior to other database changes:
+
+```bash
+$ docker run --name dataloader --link couchdb \
+    -v /home/vagrant/sync/universal/testData/dbData:/static_data -e STATIC_DATA_DIR=/static_data \
+    -v /home/vagrant/sync/universal/build/dbData:/build_data -e BUILD_DATA_DIR=/build_data \
+    -e COUCHDB_URL=http://couchdb:5984/gpii \
+    -e CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh
+```
+
+The second version does not set `CLEAR_INDEX`, such that any existing database is left intact prior to subsequent
+changes to it (e.g., deleting the snapsets):
+
+```bash
+$ docker run --name dataloader --link couchdb \
+    -v /home/vagrant/sync/universal/testData/dbData:/static_data -e STATIC_DATA_DIR=/static_data \
+    -v /home/vagrant/sync/universal/build/dbData:/build_data -e BUILD_DATA_DIR=/build_data \
+    -e COUCHDB_URL=http://couchdb:5984/gpii \
+    vagrant-universal scripts/deleteAndLoadSnapsets.sh
+```
diff --git a/documentation/README.md b/documentation/README.md
index 2de699630..c3952effe 100644
--- a/documentation/README.md
+++ b/documentation/README.md
@@ -9,6 +9,7 @@
  * [Preferences Server](PreferencesServer.md)
  * [Data Model for
Preferences and OAuth Data](DataModel.md)
  * [Pouch Manager](PouchManager.md)
+ * [Data Loader](DataLoader.md)
  * [MatchMakerFramework](MatchMakerFramework.md)
  * [Flat Match Maker](FlatMatchMaker.md)
  * [Apptology](Apptology.md)
diff --git a/scripts/convertPrefs.js b/scripts/convertPrefs.js
index d3429cabc..3c6a44075 100644
--- a/scripts/convertPrefs.js
+++ b/scripts/convertPrefs.js
@@ -45,7 +45,7 @@ rimraf(targetDir, function () {
     filenames.forEach(function (filename) {
         if (filename.endsWith(".json5")) {
             var gpiiKey = filename.substr(0, filename.length - 6);
-            var preferences = fs.readFileSync(inputDir + filename, "utf-8");
+            var preferences = fs.readFileSync(inputDir + "/" + filename, "utf-8");
             var currentTime = new Date().toISOString();
 
             var prefsSafeId = "prefsSafe-" + gpiiKey;
@@ -80,11 +80,11 @@ rimraf(targetDir, function () {
     });
 
     // Write the target files
-    var prefsSafesFile = targetDir + "prefsSafes.json";
+    var prefsSafesFile = targetDir + "/prefsSafes.json";
     console.log("prefsSafesFile: " + prefsSafesFile);
     fs.writeFileSync(prefsSafesFile, JSON.stringify(prefsSafes, null, 4));
 
-    var gpiiKeysFile = targetDir + "gpiiKeys.json";
+    var gpiiKeysFile = targetDir + "/gpiiKeys.json";
     fs.writeFileSync(gpiiKeysFile, JSON.stringify(gpiiKeys, null, 4));
 
     console.log("Finished converting preferences data in the source directory " + inputDir + " to the target directory " + targetDir);
diff --git a/scripts/deleteAndLoadSnapsets.sh b/scripts/deleteAndLoadSnapsets.sh
new file mode 100755
index 000000000..f27730d07
--- /dev/null
+++ b/scripts/deleteAndLoadSnapsets.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+APP_DIR=${APP_DIR:-"/app"}
+
+STATIC_DATA_DIR=${STATIC_DATA_DIR:-"${APP_DIR}/testData/dbData"}
+PREFERENCES_DATA_DIR=${PREFERENCES_DATA_DIR:-"${APP_DIR}/testData/preferences"}
+BUILD_DATA_DIR=${BUILD_DATA_DIR:-'/tmp/build/dbData'}
+
+DATALOADER_JS="${APP_DIR}/scripts/deleteAndLoadSnapsets.js"
+CONVERT_JS="${APP_DIR}/scripts/convertPrefs.js"
+
+# Print a timestamped message to stdout.
+log() {
+  echo "$(date +'%Y-%m-%d %H:%M:%S') - $1"
+}
+
+# Query every view once so CouchDB builds its indices now, not at first request.
+warm_indices() {
+  log "Warming indices..."
+
+  for view in $(curl -s "${COUCHDB_URL}/_design/views/" | jq -r '.views | keys[]'); do
+    curl -fsS "${COUCHDB_URL}/_design/views/_view/${view}" >/dev/null
+  done
+
+  log "Finished warming indices..."
+}
+
+# Verify variables
+if [ -z "${COUCHDB_URL}" ]; then
+  echo "COUCHDB_URL environment variable must be defined" >&2
+  exit 1
+fi
+
+COUCHDB_URL_SANITIZED=$(echo "${COUCHDB_URL}" | sed -e 's,\(://\)[^/]*\(@\),\1\2,g')
+
+log 'Starting'
+log "CouchDB: ${COUCHDB_URL_SANITIZED}"
+log "Clear index: ${CLEAR_INDEX}"
+log "Static: ${STATIC_DATA_DIR}"
+log "Build: ${BUILD_DATA_DIR}"
+log "Working directory: $(pwd)"
+
+# Check we can connect to CouchDB
+COUCHDB_URL_ROOT=$(echo "${COUCHDB_URL}" | sed 's/[^\/]*$//g')
+RET_CODE=$(curl --write-out '%{http_code}' --silent --output /dev/null "${COUCHDB_URL_ROOT}/_up")
+if [ "$RET_CODE" != '200' ]; then
+  log "[ERROR] Failed to connect to CouchDB: ${COUCHDB_URL_SANITIZED}"
+  exit 1
+fi
+
+# Create build dir if it does not exist
+if [ ! -d "${BUILD_DATA_DIR}" ]; then
+  mkdir -p "${BUILD_DATA_DIR}"
+fi
+
+# Convert preferences json5 to GPII keys and preferences safes.
+if [ -d "${PREFERENCES_DATA_DIR}" ]; then
+  node "${CONVERT_JS}" "${PREFERENCES_DATA_DIR}" "${BUILD_DATA_DIR}" snapset
+  rc=$?; if [ "$rc" != '0' ]; then
+    log "[ERROR] ${CONVERT_JS} failed (exit code: $rc)"
+    exit 1
+  fi
+else
+  log "PREFERENCES_DATA_DIR ($PREFERENCES_DATA_DIR) does not exist, nothing to convert"
+fi
+
+# Initialize (and possibly clear) the database.
+if [ "${CLEAR_INDEX}" = 'true' ]; then
+  log "Deleting database at ${COUCHDB_URL_SANITIZED}"
+  if ! curl -fsS -X DELETE "${COUCHDB_URL}"; then
+    log "Error deleting database"
+  fi
+fi
+
+log "Creating database at ${COUCHDB_URL_SANITIZED}"
+if ! curl -fsS -X PUT "${COUCHDB_URL}"; then
+  log "Database already exists at ${COUCHDB_URL_SANITIZED}"
+fi
+
+# Submit data
+node "${DATALOADER_JS}" "${COUCHDB_URL}" "${STATIC_DATA_DIR}" "${BUILD_DATA_DIR}"
+err=$?
+if [ "${err}" != '0' ]; then
+  log "${DATALOADER_JS} failed with ${err}, exiting"
+  exit "${err}"
+fi
+
+# Warm Data
+warm_indices
diff --git a/scripts/vagrantCloudBasedContainers.sh b/scripts/vagrantCloudBasedContainers.sh
index 49bcc8d4f..600eb4b50 100755
--- a/scripts/vagrantCloudBasedContainers.sh
+++ b/scripts/vagrantCloudBasedContainers.sh
@@ -35,16 +35,16 @@ COUCHDB_HEALTHCHECK_TIMEOUT=30
 if [ "$NO_REBUILD" == "true" ] ; then
     CLEAR_INDEX=
 else
-    CLEAR_INDEX=1
+    CLEAR_INDEX='true'
 fi
 
 UNIVERSAL_DIR="/home/vagrant/sync/universal"
 STATIC_DATA_DIR="$UNIVERSAL_DIR/testData/dbData"
 BUILD_DATA_DIR="$UNIVERSAL_DIR/build/dbData/snapset"
 
-DATALOADER_IMAGE="herrclown/gpii-dataloader"
 DATALOADER_COUCHDB_URL="http://couchdb:${COUCHDB_PORT}/gpii"
 DATASOURCE_HOSTNAME="http://couchdb"
+DATALOADER_CMD='/app/scripts/deleteAndLoadSnapsets.sh'
 
 GPII_PREFERENCES_CONFIG="gpii.config.preferencesServer.standalone.production"
 GPII_PREFERENCES_PORT=9081
@@ -82,7 +82,7 @@ docker run -d -p $COUCHDB_PORT:$COUCHDB_PORT --name couchdb $COUCHDB_IMAGE
 wget -O /dev/null --retry-connrefused --waitretry=$COUCHDB_HEALTHCHECK_DELAY --read-timeout=20 --timeout=1 --tries=$COUCHDB_HEALTHCHECK_TIMEOUT http://localhost:$COUCHDB_PORT
 
 # Load the CouchDB data
-docker run --rm --link couchdb -v $STATIC_DATA_DIR:/static_data -e STATIC_DATA_DIR=/static_data -v $BUILD_DATA_DIR:/build_data -e BUILD_DATA_DIR=/build_data -e COUCHDB_URL=$DATALOADER_COUCHDB_URL -e CLEAR_INDEX=$CLEAR_INDEX $DATALOADER_IMAGE
+docker run --rm --link couchdb -v $STATIC_DATA_DIR:/static_data -e STATIC_DATA_DIR=/static_data -v $BUILD_DATA_DIR:/build_data -e BUILD_DATA_DIR=/build_data -e COUCHDB_URL=$DATALOADER_COUCHDB_URL -e CLEAR_INDEX=$CLEAR_INDEX $UNIVERSAL_IMAGE $DATALOADER_CMD
 
 # Wait for the CouchDB views become accessible. Accessing the view URL forced the view index to build which take time.
 # The URL returns 500 when the index is not ready, so use "--retry-on-http-error" option to continue retries at 500 response code.