diff --git a/Dockerfile b/Dockerfile index 024de9f2f..559ae1367 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,7 @@ WORKDIR /app COPY . /app RUN apk add --no-cache --virtual build-dependencies python make git g++ && \ + apk add --no-cache curl jq && \ npm install && \ chown -R node:node . && \ npm cache clean --force && \ diff --git a/documentation/DataLoader.md b/documentation/DataLoader.md new file mode 100644 index 000000000..aec227398 --- /dev/null +++ b/documentation/DataLoader.md @@ -0,0 +1,81 @@ +# CouchDB Data Loader + +(`scripts/deleteAndLoadSnapsets.sh`) + +This script is used to set up the CouchDB database and is executed as a Kubernetes batch Job every time a new version of the +universal image is deployed to the cluster (and also when the cluster is initially created). + +It does the following: + +1. Converts the preferences in universal into `snapset` Prefs Safes and their associated GPII Keys, +2. Optionally deletes the existing database, +3. Creates a CouchDB database if none exists, +4. Updates the database with respect to its `_design/views` document, as required, +5. Deletes the `snapset` Prefs Safes and their associated GPII Keys, if any, currently in the database, +6. Loads the latest snapsets and associated keys created at step 1 into the database. + +Steps 4, 5, and 6 are handled by, and documented further in, [`scripts/deleteAndLoadSnapsets.js`](https://github.com/GPII/universal/blob/master/scripts/deleteAndLoadSnapsets.js#L11). + +## Environment Variables + +With the exception of `GPII_COUCHDB_URL` and `GPII_CLEAR_INDEX`, the following environment variables have default values defined within +`scripts/deleteAndLoadSnapsets.sh`. The database URL, `GPII_COUCHDB_URL`, must be set outside of the script. Developers +can set these variables as needed for testing and experimentation. + +The use of environment variables for data directories is also useful if you want to mount the database data using a Docker +volume and point the data loader at it. 
+ +WARNING: setting `GPII_CLEAR_INDEX` to `true` will erase all the contents of the database. Use with caution, and with +your own database for development. In a staging or production environment, these variables are set appropriately for +those contexts; in particular `GPII_CLEAR_INDEX` will not be set. + +- `GPII_COUCHDB_URL`: URL of the CouchDB database. (required) +- `GPII_CLEAR_INDEX`: If set to `true`, the database at `$GPII_COUCHDB_URL` will be deleted and replaced with an empty + database. (optional) +- `GPII_STATIC_DATA_DIR`: The directory where the static data to be loaded into CouchDB resides. (optional) +- `GPII_PREFERENCES_DATA_DIR`: The directory containing the "raw" preferences that are converted into `snapset` Prefs + Safes and their associated GPII Keys (step 1 above). (optional) +- `GPII_BUILD_DATA_DIR`: The directory where the data built from the conversion step resides. (optional) +- `GPII_APP_DIR`: The main directory, typically `universal`. (optional) + +Note that since [Docker doesn't support array-valued environment +variables](https://github.com/moby/moby/issues/20169), separate environment variables are used for the data +directories instead of one array that holds them. + +## Running + +Example using containers: + +```bash +$ docker run -d -p 5984:5984 --name couchdb couchdb +$ docker run --rm --link couchdb -e GPII_COUCHDB_URL=http://couchdb:5984/gpii \ + -e GPII_CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh +$ docker run -d -p 8081:8081 --name preferences --link couchdb \ + -e NODE_ENV=gpii.config.preferencesServer.standalone.production \ + -e PREFERENCESSERVER_LISTEN_PORT=8081 -e DATASOURCE_HOSTNAME=http://couchdb \ + -e DATASOURCE_PORT=5984 vagrant-universal +``` + +Below are two versions of loading CouchDB data from a different location (e.g. +/home/vagrant/sync/universal/testData/dbData for the static data directory and /home/vagrant/sync/universal/build/dbData for +the build data directory). 
The first version has the optional `GPII_CLEAR_INDEX` set to true to erase and reset the +database prior to other database changes: + +```bash +$ docker run --name dataloader --link couchdb \ + -v /home/vagrant/sync/universal/testData/dbData:/static_data -e GPII_STATIC_DATA_DIR=/static_data \ + -v /home/vagrant/sync/universal/build/dbData:/build_data -e GPII_BUILD_DATA_DIR=/build_data \ + -e GPII_COUCHDB_URL=http://couchdb:5984/gpii \ + -e GPII_CLEAR_INDEX=true vagrant-universal scripts/deleteAndLoadSnapsets.sh +``` + +The second version does not set `GPII_CLEAR_INDEX` such that any existing database is left intact prior to subsequent +changes to it (e.g., deleting the snapsets): + +```bash +$ docker run --name dataloader --link couchdb \ + -v /home/vagrant/sync/universal/testData/dbData:/static_data -e GPII_STATIC_DATA_DIR=/static_data \ + -v /home/vagrant/sync/universal/build/dbData:/build_data -e GPII_BUILD_DATA_DIR=/build_data \ + -e GPII_COUCHDB_URL=http://couchdb:5984/gpii \ + vagrant-universal scripts/deleteAndLoadSnapsets.sh +``` diff --git a/documentation/README.md b/documentation/README.md index 2de699630..c3952effe 100644 --- a/documentation/README.md +++ b/documentation/README.md @@ -9,6 +9,7 @@ * [Preferences Server](PreferencesServer.md) * [Data Model for Preferences and OAuth Data](DataModel.md) * [Pouch Manager](PouchManager.md) + * [Data Loader](DataLoader.md) * [MatchMakerFramework](MatchMakerFramework.md) * [Flat Match Maker](FlatMatchMaker.md) * [Apptology](Apptology.md) diff --git a/gpii/node_modules/pouchManager/test/pouchManagerTests.js b/gpii/node_modules/pouchManager/test/pouchManagerTests.js index d30b60047..9cf92c139 100644 --- a/gpii/node_modules/pouchManager/test/pouchManagerTests.js +++ b/gpii/node_modules/pouchManager/test/pouchManagerTests.js @@ -254,8 +254,9 @@ fluid.defaults("gpii.tests.pouchManager.testEnvironment", { baseDir: { expander: { funcName: "fluid.stringTemplate", - args: ["%base/pouchManagerTests", { - base: 
"@expand:{settingsDir}.getBaseSettingsDir()" + args: ["%base/pouchManagerTests-%id", { + base: "@expand:{settingsDir}.getBaseSettingsDir()", + id: "{that}.id" }] } }, diff --git a/scripts/convertPrefs.js b/scripts/convertPrefs.js index d3429cabc..5daa8ef59 100644 --- a/scripts/convertPrefs.js +++ b/scripts/convertPrefs.js @@ -27,7 +27,7 @@ var inputDir = process.argv[2]; var targetDir = process.argv[3]; var prefsSafeType = process.argv[4] || "user"; -if (prefsSafeType !== "snapset" && prefsSafeType !== "user") { +if (process.argv.length < 4 || (prefsSafeType !== "snapset" && prefsSafeType !== "user")) { console.log("Usage: node scripts/convertPrefs.js InputFolder OutputFolder PrefsSafeType"); console.log(" where PrefsSafeType, is one of 'snapset' or 'user' (defaults to 'user')"); process.exit(1); @@ -45,7 +45,7 @@ rimraf(targetDir, function () { filenames.forEach(function (filename) { if (filename.endsWith(".json5")) { var gpiiKey = filename.substr(0, filename.length - 6); - var preferences = fs.readFileSync(inputDir + filename, "utf-8"); + var preferences = fs.readFileSync(inputDir + "/" + filename, "utf-8"); var currentTime = new Date().toISOString(); var prefsSafeId = "prefsSafe-" + gpiiKey; @@ -80,11 +80,11 @@ rimraf(targetDir, function () { }); // Write the target files - var prefsSafesFile = targetDir + "prefsSafes.json"; + var prefsSafesFile = targetDir + "/prefsSafes.json"; console.log("prefsSafesFile: " + prefsSafesFile); fs.writeFileSync(prefsSafesFile, JSON.stringify(prefsSafes, null, 4)); - var gpiiKeysFile = targetDir + "gpiiKeys.json"; + var gpiiKeysFile = targetDir + "/gpiiKeys.json"; fs.writeFileSync(gpiiKeysFile, JSON.stringify(gpiiKeys, null, 4)); console.log("Finished converting preferences data in the source directory " + inputDir + " to the target directory " + targetDir); diff --git a/scripts/deleteAndLoadSnapsets.sh b/scripts/deleteAndLoadSnapsets.sh new file mode 100755 index 000000000..4677f7a5a --- /dev/null +++ 
b/scripts/deleteAndLoadSnapsets.sh
@@ -0,0 +1,114 @@
+#!/bin/sh
+#
+# CouchDB data loader: converts the raw preferences into snapset Prefs Safes
+# and GPII Keys, optionally deletes the database, creates it, runs
+# deleteAndLoadSnapsets.js against it, and finally requests every view once.
+#
+# Environment:
+#   GPII_COUCHDB_URL           URL of the CouchDB database (required)
+#   GPII_CLEAR_INDEX           'true' deletes the database first (optional)
+#   GPII_APP_DIR               application root (default: /app)
+#   GPII_STATIC_DATA_DIR       default: ${GPII_APP_DIR}/testData/dbData
+#   GPII_PREFERENCES_DATA_DIR  default: ${GPII_APP_DIR}/testData/preferences
+#   GPII_BUILD_DATA_DIR        default: /tmp/build/dbData
+#
+# The shebang is /bin/sh, so keep every construct POSIX sh compatible.
+
+GPII_APP_DIR=${GPII_APP_DIR:-"/app"}
+
+GPII_STATIC_DATA_DIR=${GPII_STATIC_DATA_DIR:-"${GPII_APP_DIR}/testData/dbData"}
+GPII_PREFERENCES_DATA_DIR=${GPII_PREFERENCES_DATA_DIR:-"${GPII_APP_DIR}/testData/preferences"}
+GPII_BUILD_DATA_DIR=${GPII_BUILD_DATA_DIR:-'/tmp/build/dbData'}
+
+DATALOADER_JS="${GPII_APP_DIR}/scripts/deleteAndLoadSnapsets.js"
+CONVERT_JS="${GPII_APP_DIR}/scripts/convertPrefs.js"
+
+# Print a message to stdout, prefixed with a timestamp.
+#   $1 - message text
+log() {
+  echo "$(date +'%Y-%m-%d %H:%M:%S') - $1"
+}
+
+# Request every view listed in the _design/views document once. The responses
+# are discarded; a failed request is reported by curl (-S) but is not fatal.
+warm_indices(){
+  log "Warming indices..."
+
+  # -f: on an HTTP error, emit nothing rather than feeding an error body to jq.
+  for view in $(curl -fsS "${GPII_COUCHDB_URL}/_design/views/" | jq -r '.views | keys[]'); do
+    curl -fsS "${GPII_COUCHDB_URL}/_design/views/_view/${view}" >/dev/null
+  done
+
+  log "Finished warming indices..."
+}
+
+# Verify variables
+if [ -z "${GPII_COUCHDB_URL}" ]; then
+  echo "GPII_COUCHDB_URL environment variable must be defined"
+  exit 1
+fi
+
+# Drop whatever sits between '://' and '@' so credentials never reach the logs.
+GPII_COUCHDB_URL_SANITIZED=$(echo "${GPII_COUCHDB_URL}" | sed -e 's,\(://\)[^/]*\(@\),\1\2,g')
+
+log 'Starting'
+log "CouchDB: ${GPII_COUCHDB_URL_SANITIZED}"
+log "Clear index: ${GPII_CLEAR_INDEX}"
+log "Static: ${GPII_STATIC_DATA_DIR}"
+log "Build: ${GPII_BUILD_DATA_DIR}"
+log "Working directory: $(pwd)"
+
+# Check we can connect to CouchDB
+# The sed below removes the last path segment but keeps the trailing slash,
+# so '_up' is appended without another '/'.
+GPII_COUCHDB_URL_ROOT=$(echo "${GPII_COUCHDB_URL}" | sed 's/[^\/]*$//g')
+RET_CODE=$(curl --write-out '%{http_code}' --silent --output /dev/null "${GPII_COUCHDB_URL_ROOT}_up")
+if [ "$RET_CODE" != '200' ]; then
+  log "[ERROR] Failed to connect to CouchDB: ${GPII_COUCHDB_URL_SANITIZED}"
+  exit 1
+fi
+
+# Create build dir if it does not exist
+if [ ! -d "${GPII_BUILD_DATA_DIR}" ]; then
+  mkdir -p "${GPII_BUILD_DATA_DIR}"
+fi
+
+# Convert preferences json5 to GPII keys and preferences safes
+if [ -d "${GPII_PREFERENCES_DATA_DIR}" ]; then
+  node "${CONVERT_JS}" "${GPII_PREFERENCES_DATA_DIR}" "${GPII_BUILD_DATA_DIR}" snapset
+  # Save the status immediately: the '[' test below overwrites $?.
+  err=$?
+  if [ "${err}" != '0' ]; then
+    log "[ERROR] ${CONVERT_JS} failed (exit code: ${err})"
+    exit 1
+  fi
+else
+  log "GPII_PREFERENCES_DATA_DIR ($GPII_PREFERENCES_DATA_DIR) does not exist, nothing to convert"
+fi
+
+# Initialize (possibly clear) data base
+# '=' rather than '==': only '=' is specified for test(1) under /bin/sh.
+if [ "${GPII_CLEAR_INDEX}" = 'true' ]; then
+  log "Deleting database at ${GPII_COUCHDB_URL_SANITIZED}"
+  if ! curl -fsS -X DELETE "${GPII_COUCHDB_URL}"; then
+    log "Error deleting database"
+  fi
+fi
+
+log "Creating database at ${GPII_COUCHDB_URL_SANITIZED}"
+# NOTE(review): any PUT failure (not only "already exists") takes this branch
+# and is logged the same way; the run continues regardless.
+if ! curl -fsS -X PUT "${GPII_COUCHDB_URL}"; then
+  log "Database already exists at ${GPII_COUCHDB_URL_SANITIZED}"
+fi
+
+# Submit data
+node "${DATALOADER_JS}" "${GPII_COUCHDB_URL}" "${GPII_STATIC_DATA_DIR}" "${GPII_BUILD_DATA_DIR}"
+err=$?
+if [ "${err}" != '0' ]; then
+  log "${DATALOADER_JS} failed with ${err}, exiting"
+  exit "${err}"
+fi
+
+# Warm Data
+warm_indices
diff --git a/scripts/vagrantCloudBasedContainers.sh b/scripts/vagrantCloudBasedContainers.sh index 49bcc8d4f..2a9c1eb37 100755 --- a/scripts/vagrantCloudBasedContainers.sh +++ b/scripts/vagrantCloudBasedContainers.sh @@ -35,16 +35,16 @@ COUCHDB_HEALTHCHECK_TIMEOUT=30 if [ "$NO_REBUILD" == "true" ] ; then CLEAR_INDEX= else - CLEAR_INDEX=1 + CLEAR_INDEX='true' fi UNIVERSAL_DIR="/home/vagrant/sync/universal" STATIC_DATA_DIR="$UNIVERSAL_DIR/testData/dbData" BUILD_DATA_DIR="$UNIVERSAL_DIR/build/dbData/snapset" -DATALOADER_IMAGE="herrclown/gpii-dataloader" DATALOADER_COUCHDB_URL="http://couchdb:${COUCHDB_PORT}/gpii" DATASOURCE_HOSTNAME="http://couchdb" +DATALOADER_CMD="/app/scripts/deleteAndLoadSnapsets.sh" GPII_PREFERENCES_CONFIG="gpii.config.preferencesServer.standalone.production" GPII_PREFERENCES_PORT=9081 @@ -82,7 +82,7 @@ docker run -d -p $COUCHDB_PORT:$COUCHDB_PORT --name couchdb $COUCHDB_IMAGE wget -O /dev/null --retry-connrefused --waitretry=$COUCHDB_HEALTHCHECK_DELAY --read-timeout=20 --timeout=1 --tries=$COUCHDB_HEALTHCHECK_TIMEOUT http://localhost:$COUCHDB_PORT # Load the CouchDB data -docker run --rm
--link couchdb -v $STATIC_DATA_DIR:/static_data -e STATIC_DATA_DIR=/static_data -v $BUILD_DATA_DIR:/build_data -e BUILD_DATA_DIR=/build_data -e COUCHDB_URL=$DATALOADER_COUCHDB_URL -e CLEAR_INDEX=$CLEAR_INDEX $DATALOADER_IMAGE +docker run --rm --link couchdb -v $STATIC_DATA_DIR:/static_data -e GPII_STATIC_DATA_DIR=/static_data -v $BUILD_DATA_DIR:/build_data -e GPII_BUILD_DATA_DIR=/build_data -e GPII_COUCHDB_URL=$DATALOADER_COUCHDB_URL -e GPII_CLEAR_INDEX=$CLEAR_INDEX $UNIVERSAL_IMAGE $DATALOADER_CMD # Wait for the CouchDB views become accessible. Accessing the view URL forced the view index to build which take time. # The URL returns 500 when the index is not ready, so use "--retry-on-http-error" option to continue retries at 500 response code.