From 8145f43fc5dc08fdff32c7cc25417e1b1d37390a Mon Sep 17 00:00:00 2001 From: Ondra Chaloupka Date: Tue, 17 Apr 2018 11:18:33 +0200 Subject: [PATCH 1/3] [CLOUD-2261] handling partition by querying api this is a change from the older style where flock function was used to manage message migration or transaction recovery this style uses openshift api querying --- .gitignore | 1 + os-amq-launch/added/launch.sh | 2 +- os-datavirt/added/openshift-launch.sh | 5 +- .../added/launch/openshift-migrate-common.sh | 14 +- .../added/launch/openshift-node-name.sh | 20 +- os-eap-probes/added/readinessProbe.sh | 2 +- os-eap64-launch/added/openshift-launch.sh | 5 +- os-eap7-launch/added/openshift-launch.sh | 5 +- os-jdg-launch/added/openshift-launch.sh | 5 +- os-jdg7-launch/added/openshift-launch.sh | 6 +- os-partition/added/partitionPV.sh | 349 ++++++++---------- os-partition/added/queryapi/query.py | 157 ++++++++ os-partition/configure.sh | 9 + .../{install_as_root => install_as_root.sh} | 3 +- os-partition/module.yaml | 6 +- os-sso71/added/openshift-launch.sh | 4 +- os-sso72/added/openshift-launch.sh | 4 +- 17 files changed, 359 insertions(+), 238 deletions(-) create mode 100644 os-partition/added/queryapi/query.py create mode 100755 os-partition/configure.sh rename os-partition/{install_as_root => install_as_root.sh} (80%) diff --git a/.gitignore b/.gitignore index 13bbe5b9..14859fc5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .idea/ +.project diff --git a/os-amq-launch/added/launch.sh b/os-amq-launch/added/launch.sh index d3f28c7c..0584e2ca 100755 --- a/os-amq-launch/added/launch.sh +++ b/os-amq-launch/added/launch.sh @@ -57,7 +57,7 @@ if [ "$AMQ_SPLIT" = "true" ]; then DATA_DIR="${AMQ_HOME}/data" mkdir -p "${DATA_DIR}" - partitionPV "${DATA_DIR}" "${AMQ_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else exec $AMQ_HOME/bin/activemq console fi diff --git a/os-datavirt/added/openshift-launch.sh b/os-datavirt/added/openshift-launch.sh index 4a066014..0dd87779 100755 --- 
a/os-datavirt/added/openshift-launch.sh +++ b/os-datavirt/added/openshift-launch.sh @@ -5,9 +5,6 @@ source ${JBOSS_HOME}/bin/launch/openshift-common.sh function runServer() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -28,7 +25,7 @@ if [ "${SPLIT_DATA^^}" = "TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-eap-migration/added/launch/openshift-migrate-common.sh b/os-eap-migration/added/launch/openshift-migrate-common.sh index d0867df8..2cb03afe 100644 --- a/os-eap-migration/added/launch/openshift-migrate-common.sh +++ b/os-eap-migration/added/launch/openshift-migrate-common.sh @@ -7,9 +7,6 @@ source /opt/partition/partitionPV.sh function runMigration() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" cp -f ${STANDALONE_XML_COPY} ${STANDALONE_XML} source $JBOSS_HOME/bin/launch/configure.sh @@ -29,8 +26,15 @@ function runMigration() { local success=false local message="Finished, migration pod has been terminated" ${JBOSS_HOME}/bin/readinessProbe.sh + local probeStatus=$? + + if [ $probeStatus -eq 0 ] && [ "$(type -t probePodLog)" = 'function' ]; then + # -- checking if server.log is clean from errors (only if function of the particular name exists) + probePodLog # calling function from partitionPV.sh + probeStatus=$? + fi - if [ $? -eq 0 ] ; then + if [ $probeStatus -eq 0 ] ; then echo "$(date): Server started, checking for transactions" local startTime=$(date +'%s') local endTime=$((startTime + ${RECOVERY_TIMEOUT} + 1)) @@ -66,7 +70,7 @@ function runMigration() { if [ "${success}" = "true" ] ; then message="Finished, recovery terminated successfully" else - message="Finished, Recovery DID NOT complete, check log for details. Recovery will be reattempted." 
+ message="Finished, Recovery DID NOT complete, check log for details. Recovery will be reattempted." fi fi diff --git a/os-eap-node-name/added/launch/openshift-node-name.sh b/os-eap-node-name/added/launch/openshift-node-name.sh index c692f650..f594d0dc 100644 --- a/os-eap-node-name/added/launch/openshift-node-name.sh +++ b/os-eap-node-name/added/launch/openshift-node-name.sh @@ -1,12 +1,18 @@ +function init_pod_name() { + # when POD_NAME is non-zero length using that given name + + # docker sets up container_uuid + [ -z "${POD_NAME}" ] && POD_NAME="${container_uuid}" + # openshift sets up the node id as host name + [ -z "${POD_NAME}" ] && POD_NAME="${HOSTNAME}" + # TODO: fail when pod name is not set here? +} + function init_node_name() { if [ -z "${JBOSS_NODE_NAME}" ] ; then - if [ -n "${NODE_NAME}" ]; then - JBOSS_NODE_NAME="${NODE_NAME}" - elif [ -n "${container_uuid}" ]; then - JBOSS_NODE_NAME="${container_uuid}" - else - JBOSS_NODE_NAME="${HOSTNAME}" - fi + init_pod_name + + JBOSS_NODE_NAME="${POD_NAME}" # CLOUD-427: truncate to 23 characters max (from the end backwards) if [ ${#JBOSS_NODE_NAME} -gt 23 ]; then diff --git a/os-eap-probes/added/readinessProbe.sh b/os-eap-probes/added/readinessProbe.sh index 694b0166..679b2072 100644 --- a/os-eap-probes/added/readinessProbe.sh +++ b/os-eap-probes/added/readinessProbe.sh @@ -8,7 +8,7 @@ LOG=/tmp/readiness-log COUNT=30 SLEEP=5 -DEBUG=false +DEBUG=${SCRIPT_DEBUG:-false} PROBE_IMPL=probe.eap.dmr.EapProbe if [ $# -gt 0 ] ; then diff --git a/os-eap64-launch/added/openshift-launch.sh b/os-eap64-launch/added/openshift-launch.sh index 6268ad23..bebf8d1d 100755 --- a/os-eap64-launch/added/openshift-launch.sh +++ b/os-eap64-launch/added/openshift-launch.sh @@ -7,9 +7,6 @@ source ${JBOSS_HOME}/bin/launch/openshift-common.sh function runServer() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -30,7 +27,7 @@ if [ "${SPLIT_DATA^^}" = 
"TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-eap7-launch/added/openshift-launch.sh b/os-eap7-launch/added/openshift-launch.sh index 941c235b..e6ade86b 100755 --- a/os-eap7-launch/added/openshift-launch.sh +++ b/os-eap7-launch/added/openshift-launch.sh @@ -13,9 +13,6 @@ function clean_shutdown() { function runServer() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -47,7 +44,7 @@ if [ "${SPLIT_DATA^^}" = "TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-jdg-launch/added/openshift-launch.sh b/os-jdg-launch/added/openshift-launch.sh index 25b36880..3e6b51c0 100755 --- a/os-jdg-launch/added/openshift-launch.sh +++ b/os-jdg-launch/added/openshift-launch.sh @@ -4,9 +4,6 @@ source ${JBOSS_HOME}/bin/launch/openshift-common.sh function runServer() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -27,7 +24,7 @@ if [ "${SPLIT_DATA^^}" = "TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-jdg7-launch/added/openshift-launch.sh b/os-jdg7-launch/added/openshift-launch.sh index b2872661..7b17eb5e 100755 --- a/os-jdg7-launch/added/openshift-launch.sh +++ b/os-jdg7-launch/added/openshift-launch.sh @@ -5,9 +5,6 @@ source $JBOSS_HOME/bin/launch/logging.sh function runServer() { local instanceDir=$1 - local count=$2 - - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -24,14 
+21,13 @@ function init_data_dir() { } SPLIT_DATA=${SPLIT_DATA:-$DATAGRID_SPLIT} -SPLIT_LOCK_TIMEOUT=${SPLIT_LOCK_TIMEOUT:-$DATAGRID_LOCK_TIMEOUT} if [ "${SPLIT_DATA^^}" = "TRUE" ]; then source /opt/partition/partitionPV.sh DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-partition/added/partitionPV.sh b/os-partition/added/partitionPV.sh index 4cee9077..34411fab 100644 --- a/os-partition/added/partitionPV.sh +++ b/os-partition/added/partitionPV.sh @@ -1,241 +1,204 @@ +source ${JBOSS_HOME}/bin/launch/openshift-node-name.sh +[ "${SCRIPT_DEBUG}" = "true" ] && DEBUG_QUERY_API_PARAM="-l debug" + +# parameters +# - needle to search in array +# - array passed as: "${ARRAY_VAR[@]}" +function arrContains() { + local element match="$1" + shift + for element; do + [[ "$element" == "$match" ]] && return 0 + done + return 1 +} + # parameters # - base directory -# - lock timeout function partitionPV() { - LOCK_DIR="$1" - LOCK_TIMEOUT="${2:-30}" - - mkdir -p "${LOCK_DIR}" - - LOCK_FD=200 - WAITING_FD=201 - - COUNT=1 - - while : ; do - INSTANCE_DIR="${LOCK_DIR}/split-$COUNT" - mkdir -p "${INSTANCE_DIR}" - - echo "Attempting to obtain lock for directory: ($INSTANCE_DIR)" - - TERMINATING_FILE="${INSTANCE_DIR}/terminating" - RUNNING_FILE="${INSTANCE_DIR}/running" - ( - flock -n $WAITING_FD - if [ $? -eq 0 ]; then - # Nobody waiting, try to grab the lock - - flock -n $LOCK_FD - LOCK_STATUS=$? - if [ $LOCK_STATUS -ne 0 ] ; then - # Second attempt with a potential wait period - TERMINATING=$(cat "${TERMINATING_FILE}" 2>/dev/null) - if [ -z "$TERMINATING" ] ; then - # Not terminating, grab the lock without waiting - flock -n $LOCK_FD - else - # Terminating, grab the lock with timeout - echo "Existing server instance is terminating, waiting to acquire the lock" - flock -w $LOCK_TIMEOUT $LOCK_FD - fi - LOCK_STATUS=$? 
- fi - if [ $LOCK_STATUS -eq 0 ] ; then - echo "Successfully locked directory: ($INSTANCE_DIR)" + local podsDir="$1" + local applicationPodDir - > "$TERMINATING_FILE" - echo "$HOSTNAME" > "$RUNNING_FILE" - flock -u $WAITING_FD + mkdir -p "${podsDir}" - SERVER_DATA_DIR="${INSTANCE_DIR}/serverData" - mkdir -p "${SERVER_DATA_DIR}" + init_pod_name + local applicationPodDir="${podsDir}/${POD_NAME}" - if [ ! -f "${SERVER_DATA_DIR}/../data_initialized" ]; then - init_data_dir ${SERVER_DATA_DIR} - touch "${SERVER_DATA_DIR}/../data_initialized" - fi + local waitCounter=0 + # 2) while any file matching, sleep + while true; do + local isRecoveryInProgress=false + # is there an existing RECOVERY descriptor that means a recovery is in progress + find "${podsDir}" -maxdepth 1 -type f -name "${POD_NAME}-RECOVERY-*" 2>/dev/null | grep -q . + [ $? -eq 0 ] && isRecoveryInProgress=true - runServer "${SERVER_DATA_DIR}" "${COUNT}" & - - PID=$! - - trap "echo Received TERM ; echo \"$HOSTNAME\" > \"$TERMINATING_FILE\" ; kill -TERM $PID" TERM + # we are free to start the app container + if ! $isRecoveryInProgress; then + break + fi - wait $PID 2>/dev/null - STATUS=$? - trap - TERM - wait $PID 2>/dev/null + if $isRecoveryInProgress; then + echo "Waiting to start pod ${POD_NAME} as recovery process '$(echo ${podsDir}/${POD_NAME}-RECOVERY-*)' is currently cleaning data directory." + fi - > "${RUNNING_FILE}" + sleep 1 + echo "`date`: waiting for recovery process to clean the environment for the pod to start" + done - echo "Server terminated with status $STATUS ($(kill -l $STATUS 2>/dev/null))" + # 3) create /pods/ + SERVER_DATA_DIR="${applicationPodDir}/serverData" + mkdir -p "${SERVER_DATA_DIR}" - if [ "$STATUS" -eq 255 ] ; then - echo "Server returned 255, changing to 254" - STATUS=254 - fi + if [ ! 
-f "${SERVER_DATA_DIR}/../data_initialized" ]; then + init_data_dir ${SERVER_DATA_DIR} + touch "${SERVER_DATA_DIR}/../data_initialized" + fi - # If not TERM then update the terminating file to force a check - if [ "$STATUS" -ne 143 ] ; then - echo "$HOSTNAME" > "$TERMINATING_FILE" - fi + # 4) launch EAP with node name as pod name + NODE_NAME="${POD_NAME}" runServer "${SERVER_DATA_DIR}" & - echo "Releasing lock: ($INSTANCE_DIR)" - exit $STATUS - fi - else - echo "Failed to obtain lock for directory: ($INSTANCE_DIR)" - fi + PID=$! - exit 255 - ) 200> "${INSTANCE_DIR}/lock" 201> "${INSTANCE_DIR}/waiting" & + trap "echo Received TERM of pid ${PID} of pod name ${POD_NAME}; kill -TERM $PID" TERM - PID=$! + wait $PID 2>/dev/null + STATUS=$? + trap - TERM + wait $PID 2>/dev/null - trap "kill -TERM $PID" TERM + echo "Server terminated with status $STATUS ($(kill -l $STATUS 2>/dev/null))" - wait $PID 2>/dev/null - STATUS=$? - trap - TERM - wait $PID 2>/dev/null + if [ "$STATUS" -eq 255 ] ; then + echo "Server returned 255, changing to 254" + STATUS=254 + fi - if [ $STATUS -ne 255 ] ; then - break; - fi - COUNT=$(expr $COUNT + 1) - done + exit $STATUS } # parameters # - base directory -# - migration timeout # - migration pause between cycles function migratePV() { - LOCK_DIR="$1" - MIGRATION_TIMEOUT="${2:-30}" - MIGRATION_PAUSE="${3:-30}" + local podsDir="$1" + local applicationPodDir + MIGRATION_PAUSE="${2:-30}" MIGRATED=false - mkdir -p "${LOCK_DIR}" + init_pod_name + local recoveryPodName="${POD_NAME}" - LOCK_FD=200 - MIGRATING_FD=201 + while true ; do - COUNT=1 + # 1) Periodically, for each /pods/ + for applicationPodDir in "${podsDir}"/*; do + # check if the found file is type of directory, if not directory move to the next item + [ ! 
-d "$applicationPodDir" ] && continue - while : ; do - INSTANCE_DIR="${LOCK_DIR}/split-$COUNT" - if [ -d "$INSTANCE_DIR" ] ; then - mkdir -p "${INSTANCE_DIR}" + # 1.a) create /pods/-RECOVERY- + local applicationPodName="$(basename ${applicationPodDir})" + touch "${podsDir}/${applicationPodName}-RECOVERY-${recoveryPodName}" + STATUS=42 # expecting there could be error on getting living pods - TERMINATING_FILE="${INSTANCE_DIR}/terminating" - RUNNING_FILE="${INSTANCE_DIR}/running" + # 1.a.i) if is not in the cluster + echo "examining existence of living pod for directory: '${applicationPodDir}'" + unset LIVING_PODS + LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue + STATUS=-1 # here we have data about living pods and the recovery marker can be removed if the pod is living + if ! arrContains ${applicationPodName} "${LIVING_PODS[@]}"; then - RUNNING=$(cat "${RUNNING_FILE}" 2>/dev/null) - if [ -n "$RUNNING" ] ; then ( - flock -n $LOCK_FD - if [ $? -eq 0 ] ; then - RUNNING=$(cat "${RUNNING_FILE}" 2>/dev/null) - if [ -n "$RUNNING" ] ; then - echo "Process has terminated abnorminally, forcing a termination check" - echo "FORCED" > "${TERMINATING_FILE}" - > "${RUNNING_FILE}" - fi - fi - ) 200> "${INSTANCE_DIR}/lock" - fi + # 1.a.ii) run recovery until empty (including orphan checks and empty object store hierarchy deletion) + SERVER_DATA_DIR="${applicationPodDir}/serverData" + JBOSS_NODE_NAME="$applicationPodName" runMigration "${SERVER_DATA_DIR}" & - TERMINATING=$(cat "${TERMINATING_FILE}" 2>/dev/null) - if [ -n "$TERMINATING" ] ; then - echo "Attempting to migrate directory: ($INSTANCE_DIR)" + PID=$! - ( - flock -n $MIGRATING_FD - if [ $? 
-eq 0 ]; then - TERMINATING_TIME=$(stat -c "%Y" "${TERMINATING_FILE}") - CURRENT_TIME=$(date +"%s") - TIMEOUT=$(expr $MIGRATION_TIMEOUT + $TERMINATING_TIME - $CURRENT_TIME) - echo "Waiting for grace period to expire, remaining timeout is ${TIMEOUT} seconds" - while : ; do - TERMINATING=$(cat "${TERMINATING_FILE}" 2>/dev/null) - if [ -z "$TERMINATING" ] ; then - echo "Migration cancelled, no longer terminating in directory: ($INSTANCE_DIR)" - break - else - TIMEOUT=$(expr $TIMEOUT - 1) - if [ "$TIMEOUT" -gt 0 ] ; then - sleep 1 - else - break - fi - fi - done - - if [ "$TIMEOUT" -le 0 ] ; then - echo "Attempting to obtain lock for directory: ($INSTANCE_DIR)" - - flock -n $LOCK_FD - LOCK_STATUS=$? - - if [ $LOCK_STATUS -eq 0 ] ; then - echo "Successfully locked directory: ($INSTANCE_DIR)" - MIGRATED=true - - flock -u $MIGRATING_FD - - SERVER_DATA_DIR="${INSTANCE_DIR}/serverData" - MIGRATION_DIR="${SERVER_DATA_DIR}/migration" - mkdir -p "${MIGRATION_DIR}" - cd "${MIGRATION_DIR}" - - runMigration "${SERVER_DATA_DIR}" "${COUNT}" & - - PID=$! - - trap "echo Received TERM ; echo \"$HOSTNAME\" > \"$TERMINATING_FILE\" ; kill -TERM $PID" TERM - - wait $PID 2>/dev/null - STATUS=$? - trap - TERM - wait $PID 2>/dev/null - - echo "Migration terminated with status $STATUS ($(kill -l $STATUS))" - - if [ "$STATUS" -eq 0 ] ; then - > "$TERMINATING_FILE" - elif [ "$STATUS" -eq 255 ] ; then - echo "Server returned 255, changing to 254" - STATUS=254 - fi - - echo "Releasing lock: ($INSTANCE_DIR)" - exit $STATUS - fi - fi - fi + trap "echo Received TERM ; kill -TERM $PID" TERM + + wait $PID 2>/dev/null + STATUS=$? + trap - TERM + wait $PID 2>/dev/null - exit 255 - ) 200> "${INSTANCE_DIR}/lock" 201> "${INSTANCE_DIR}/migrating" & + echo "Migration terminated with status $STATUS ($(kill -l $STATUS))" + + if [ "$STATUS" -eq 255 ] ; then + echo "Server returned 255, changing to 254" + STATUS=254 + fi + exit $STATUS + ) & PID=$! trap "kill -TERM $PID" TERM wait $PID 2>/dev/null + STATUS=$? 
trap - TERM wait $PID 2>/dev/null + + if [ $STATUS -eq 0 ]; then + # 1.a.iii) Delete /pods/ when recovery was succesful + echo "`date`: Migration succesfully finished for application directory ${applicationPodDir} thus removing it by recovery pod ${recoveryPodName}" + rm -rf "${applicationPodDir}" + fi fi - COUNT=$(expr $COUNT + 1) - else - if [ "$MIGRATED" = "false" ] ; then - echo "Finished Migration Check cycle, pausing for ${MIGRATION_PAUSE} seconds before resuming" - COUNT=1 - sleep "${MIGRATION_PAUSE}" - else - MIGRATED=false + + # 1.b.) Deleting the recovery marker + if [ $STATUS -eq 0 ] || [ $STATUS -eq -1 ]; then + # STATUS is 0: we are free from in-doubt transactions, -1: there is a running pod of the same name (do the recovery on his own if needed) + rm -f "${podsDir}/${applicationPodName}-RECOVERY-${recoveryPodName}" fi - fi + + # 2) Periodically, for files /pods/-RECOVERY-, for failed recovery pods + for recoveryPodFilePathToCheck in "${podsDir}/"*-RECOVERY-*; do + local recoveryPodFileToCheck="$(basename ${recoveryPodFilePathToCheck})" + local recoveryPodNameToCheck=${recoveryPodFileToCheck#*RECOVERY-} + + unset LIVING_PODS + LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue + + if ! arrContains ${recoveryPodNameToCheck} "${LIVING_PODS[@]}"; then + # recovery pod is dead, garbage collecting + rm -f "${recoveryPodFilePathToCheck}" + fi + done + + done + + echo "`date`: Finished Migration Check cycle, pausing for ${MIGRATION_PAUSE} seconds before resuming" + sleep "${MIGRATION_PAUSE}" done } + +# parameters +# - pod name (optional) +function probePodLog() { + init_pod_name + local podNameToProbe=${1:-$POD_NAME} + + local logOutput=$(python ${JBOSS_HOME}/bin/queryapi/query.py -q log ${podNameToProbe}) + local probeStatus=$? 
+ + if [ $probeStatus -ne 0 ]; then + echo "Cannot contact OpenShift API to get log for pod ${POD_NAME}" + return 1 + fi + + local isPeriodicRecoveryError=false + local patternToCheck="ERROR.*Periodic Recovery" + while read line; do + [[ $line =~ $patternToCheck ]] && isPeriodicRecoveryError=true && break + done <<< "$logOutput" + if $isPeriodicRecoveryError; then # ERROR string was found in the log output + echo "Server at ${NAMESPACE}/${POD_NAME} started with errors" + return 1 + fi + + return 0 +} diff --git a/os-partition/added/queryapi/query.py b/os-partition/added/queryapi/query.py new file mode 100644 index 00000000..0190e52b --- /dev/null +++ b/os-partition/added/queryapi/query.py @@ -0,0 +1,157 @@ +""" +Copyright 2018 Red Hat, Inc. + +Red Hat licenses this file to you under the Apache License, version +2.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. See the License for the specific language governing +permissions and limitations under the License. +""" + +import argparse +import json +import logging +import urllib2 + +from enum import Enum + + +class QueryType(Enum): + """ + Represents what could be queried. + PODS: list of pods + LOG: log from particular pod + """ + + PODS = 'pods' + PODS_LIVING = 'pods_living' + LOG = 'log' + + def __str__(self): + return self.value + +class OutputFormat(Enum): + """ + Represents output format of this script. 
+ RAW: no formatting + LIST_SPACE: if possible values are delimited with space and returned + LIST_COMMA: comma separated list + """ + + RAW = "raw" + LIST_SPACE = "list_space" + LIST_COMMA = "list_comma" + + def __str__(self): + return self.value + + +class OpenShiftQuery(): + """ + Utility class to help query OpenShift api. Declares constant + to get token and uri of the query. Having methods doing the query etc. + """ + + API_URL = 'https://openshift.default.svc' + TOKEN_FILE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token' + NAMESPACE_FILE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + CERT_FILE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + STATUS_LIVING_PODS = ['Pending', 'Running', 'Unknown'] + + @staticmethod + def __readFile(fileToRead): + with open(fileToRead, 'r') as readingfile: + return readingfile.read().strip() + + @staticmethod + def getToken(): + return OpenShiftQuery.__readFile(OpenShiftQuery.TOKEN_FILE_PATH) + + @staticmethod + def getNameSpace(): + return OpenShiftQuery.__readFile(OpenShiftQuery.NAMESPACE_FILE_PATH) + + @staticmethod + def queryApi(urlSuffix): + request = urllib2.Request(OpenShiftQuery.API_URL + urlSuffix, + headers = {'Authorization': 'Bearer ' + OpenShiftQuery.getToken(), "Accept": 'application/json'}) + logger.debug('query for: "%s"', request.get_full_url()) + try: + return urllib2.urlopen(request, cafile = OpenShiftQuery.CERT_FILE_PATH).read() + except: + logger.critical('Cannot query OpenShift API for "%s"', request.get_full_url()) + raise + + + +def getPodsJsonData(): + jsonText = OpenShiftQuery.queryApi('/api/v1/namespaces/{}/pods'.format(OpenShiftQuery.getNameSpace())) + return json.loads(jsonText) + +def getPods(): + jsonPodsData = getPodsJsonData() + pods = [] + for pod in jsonPodsData["items"]: + logger.debug('query pod %s of status %s', pod["metadata"]["name"], pod["status"]["phase"]) + pods.append(pod["metadata"]["name"]) + return pods + +def getLivingPods(): + 
jsonPodsData = getPodsJsonData() + + pods = [] + for pod in jsonPodsData["items"]: + logger.debug('query pod %s of status %s', pod["metadata"]["name"], pod["status"]["phase"]) + if pod["status"]["phase"] in OpenShiftQuery.STATUS_LIVING_PODS: + pods.append(pod["metadata"]["name"]) + return pods + +def getLog(podName): + jsonText = OpenShiftQuery.queryApi('/api/v1/namespaces/{}/pods/{}/log' + .format(OpenShiftQuery.getNameSpace(), podName)) + return jsonText + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = "Queries OpenShift API, gathering the json and parsing it to get specific info from it") + parser.add_argument("-q", "--query", required = False, type = QueryType, default = QueryType.PODS, choices=list(QueryType), help = "Query type/what to query") + parser.add_argument("-f", "--format", required = False, type = OutputFormat, default = OutputFormat.RAW, choices=list(OutputFormat), help = "Output format") + parser.add_argument("-l", "--loglevel", default="CRITICAL", help="Log level", + choices=["debug", "DEBUG", "info", "INFO", "warning", "WARNING", "error", "ERROR", "critical", "CRITICAL"]) + parser.add_argument("args", nargs = argparse.REMAINDER, help = "Arguments of the query (each query type has different)") + + args = parser.parse_args() + + # don't spam warnings (e.g. 
when not verifying ssl connections) + logging.captureWarnings(True) + logging.basicConfig(level = args.loglevel.upper()) + logger = logging.getLogger(__name__) + + logger.debug("Starting query openshift api with args: %s", args) + + if args.query == QueryType.PODS: + queryResult = getPods() + elif args.query == QueryType.PODS_LIVING: + queryResult = getLivingPods() + elif args.query == QueryType.LOG: + if len(args.args) < 0 or args.args[0] is None: + logger.critical('query of type "log" requires one argument to be an existing pod name') + exit(1) + queryResult = getLog(args.args[0]) + else: + logger.critical('No handler for query type %s', args.query) + exit(1) + + if args.format == OutputFormat.LIST_SPACE: + print ' '.join(queryResult) + elif args.format == OutputFormat.LIST_COMMA: + print ','.join(queryResult) + else: # RAW format + print queryResult + + exit(0) diff --git a/os-partition/configure.sh b/os-partition/configure.sh new file mode 100755 index 00000000..10425a9f --- /dev/null +++ b/os-partition/configure.sh @@ -0,0 +1,9 @@ +#!/bin/sh +set -e + +SCRIPT_DIR=$(dirname $0) +ADDED_DIR=${SCRIPT_DIR}/added + +mkdir -p ${JBOSS_HOME}/bin/queryapi +cp -r ${ADDED_DIR}/queryapi/* ${JBOSS_HOME}/bin/queryapi +chmod -R ugo+x $JBOSS_HOME/bin/queryapi diff --git a/os-partition/install_as_root b/os-partition/install_as_root.sh similarity index 80% rename from os-partition/install_as_root rename to os-partition/install_as_root.sh index 413fa457..c4e3a3cd 100755 --- a/os-partition/install_as_root +++ b/os-partition/install_as_root.sh @@ -9,5 +9,4 @@ test -d /opt/partition || mkdir /opt/partition cp "$ADDED_DIR"/partitionPV.sh \ /opt/partition/ -chmod 755 /opt/partition/partitionPV.sh \ - +chmod 755 /opt/partition/partitionPV.sh diff --git a/os-partition/module.yaml b/os-partition/module.yaml index 1ebed806..0aa42892 100644 --- a/os-partition/module.yaml +++ b/os-partition/module.yaml @@ -1,6 +1,8 @@ schema_version: 1 name: os-partition version: '1.0' -description: Legacy 
os-partition script package. +description: Lock-free os-partition script package. execute: -- script: install_as_root +- script: configure.sh + user: '185' +- script: install_as_root.sh diff --git a/os-sso71/added/openshift-launch.sh b/os-sso71/added/openshift-launch.sh index cc8db872..9ec6672a 100755 --- a/os-sso71/added/openshift-launch.sh +++ b/os-sso71/added/openshift-launch.sh @@ -13,8 +13,6 @@ function clean_shutdown() { function runServer() { local instanceDir=$1 - local count=$2 - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -45,7 +43,7 @@ if [ "${SPLIT_DATA^^}" = "TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh diff --git a/os-sso72/added/openshift-launch.sh b/os-sso72/added/openshift-launch.sh index cc8db872..9ec6672a 100755 --- a/os-sso72/added/openshift-launch.sh +++ b/os-sso72/added/openshift-launch.sh @@ -13,8 +13,6 @@ function clean_shutdown() { function runServer() { local instanceDir=$1 - local count=$2 - export NODE_NAME="${NODE_NAME:-node}-${count}" source $JBOSS_HOME/bin/launch/configure.sh @@ -45,7 +43,7 @@ if [ "${SPLIT_DATA^^}" = "TRUE" ]; then DATA_DIR="${JBOSS_HOME}/standalone/partitioned_data" - partitionPV "${DATA_DIR}" "${SPLIT_LOCK_TIMEOUT:-30}" + partitionPV "${DATA_DIR}" else source $JBOSS_HOME/bin/launch/configure.sh From fca2b706981604d0521fa5c5c533bb277a8e8840 Mon Sep 17 00:00:00 2001 From: rcernich Date: Tue, 5 Jun 2018 19:31:45 -0600 Subject: [PATCH 2/3] CLOUD-2261 tweaks to pass unit tests Signed-off-by: rcernich --- .../added/launch/openshift-node-name.sh | 20 +++++--------- os-eap7-launch/added/openshift-launch.sh | 2 +- os-partition/added/partitionPV.sh | 26 ++++++++++++++----- os-partition/added/{queryapi => }/query.py | 1 + os-partition/install_as_root.sh | 5 ++-- os-partition/module.yaml | 2 -- 6 files changed, 31 
insertions(+), 25 deletions(-) rename os-partition/added/{queryapi => }/query.py (99%) diff --git a/os-eap-node-name/added/launch/openshift-node-name.sh b/os-eap-node-name/added/launch/openshift-node-name.sh index f594d0dc..c692f650 100644 --- a/os-eap-node-name/added/launch/openshift-node-name.sh +++ b/os-eap-node-name/added/launch/openshift-node-name.sh @@ -1,18 +1,12 @@ -function init_pod_name() { - # when POD_NAME is non-zero length using that given name - - # docker sets up container_uuid - [ -z "${POD_NAME}" ] && POD_NAME="${container_uuid}" - # openshift sets up the node id as host name - [ -z "${POD_NAME}" ] && POD_NAME="${HOSTNAME}" - # TODO: fail when pod name is not set here? -} - function init_node_name() { if [ -z "${JBOSS_NODE_NAME}" ] ; then - init_pod_name - - JBOSS_NODE_NAME="${POD_NAME}" + if [ -n "${NODE_NAME}" ]; then + JBOSS_NODE_NAME="${NODE_NAME}" + elif [ -n "${container_uuid}" ]; then + JBOSS_NODE_NAME="${container_uuid}" + else + JBOSS_NODE_NAME="${HOSTNAME}" + fi # CLOUD-427: truncate to 23 characters max (from the end backwards) if [ ${#JBOSS_NODE_NAME} -gt 23 ]; then diff --git a/os-eap7-launch/added/openshift-launch.sh b/os-eap7-launch/added/openshift-launch.sh index e6ade86b..2c615a9f 100755 --- a/os-eap7-launch/added/openshift-launch.sh +++ b/os-eap7-launch/added/openshift-launch.sh @@ -7,7 +7,7 @@ source $JBOSS_HOME/bin/launch/logging.sh # TERM signal handler function clean_shutdown() { log_error "*** JBossAS wrapper process ($$) received TERM signal ***" - $JBOSS_HOME/bin/jboss-cli.sh -c ":shutdown(timeout=60)" + $JBOSS_HOME/bin/jboss-cli.sh -c "shutdown --timeout=60" wait $! 
} diff --git a/os-partition/added/partitionPV.sh b/os-partition/added/partitionPV.sh index 34411fab..59266706 100644 --- a/os-partition/added/partitionPV.sh +++ b/os-partition/added/partitionPV.sh @@ -1,4 +1,3 @@ -source ${JBOSS_HOME}/bin/launch/openshift-node-name.sh [ "${SCRIPT_DEBUG}" = "true" ] && DEBUG_QUERY_API_PARAM="-l debug" # parameters @@ -55,7 +54,9 @@ function partitionPV() { fi # 4) launch EAP with node name as pod name - NODE_NAME="${POD_NAME}" runServer "${SERVER_DATA_DIR}" & + #NODE_NAME="${POD_NAME}" runServer "${SERVER_DATA_DIR}" & + # node name cannot be longer than 23 chars + runServer "${SERVER_DATA_DIR}" & PID=$! @@ -76,6 +77,15 @@ function partitionPV() { exit $STATUS } +function init_pod_name() { + # when POD_NAME is non-zero length using that given name + + # docker sets up container_uuid + [ -z "${POD_NAME}" ] && POD_NAME="${container_uuid}" + # openshift sets up the node id as host name + [ -z "${POD_NAME}" ] && POD_NAME="${HOSTNAME}" + # TODO: fail when pod name is not set here? +} # parameters # - base directory @@ -104,7 +114,7 @@ function migratePV() { # 1.a.i) if is not in the cluster echo "examining existence of living pod for directory: '${applicationPodDir}'" unset LIVING_PODS - LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + LIVING_PODS=($(${BASH_SOURCE[0]}/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue STATUS=-1 # here we have data about living pods and the recovery marker can be removed if the pod is living if ! 
arrContains ${applicationPodName} "${LIVING_PODS[@]}"; then @@ -112,7 +122,11 @@ function migratePV() { ( # 1.a.ii) run recovery until empty (including orphan checks and empty object store hierarchy deletion) SERVER_DATA_DIR="${applicationPodDir}/serverData" - JBOSS_NODE_NAME="$applicationPodName" runMigration "${SERVER_DATA_DIR}" & + JBOSS_NODE_NAME="$applicationPodName" + if [ ${#JBOSS_NODE_NAME} -gt 23 ]; then + JBOSS_NODE_NAME=${JBOSS_NODE_NAME: -23} + fi + runMigration "${SERVER_DATA_DIR}" & PID=$! @@ -160,7 +174,7 @@ function migratePV() { local recoveryPodNameToCheck=${recoveryPodFileToCheck#*RECOVERY-} unset LIVING_PODS - LIVING_PODS=($(python ${JBOSS_HOME}/bin/queryapi/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + LIVING_PODS=($(${BASH_SOURCE[0]}/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue if ! arrContains ${recoveryPodNameToCheck} "${LIVING_PODS[@]}"; then @@ -182,7 +196,7 @@ function probePodLog() { init_pod_name local podNameToProbe=${1:-$POD_NAME} - local logOutput=$(python ${JBOSS_HOME}/bin/queryapi/query.py -q log ${podNameToProbe}) + local logOutput=$(${BASH_SOURCE[0]}/query.py -q log ${podNameToProbe}) local probeStatus=$? if [ $probeStatus -ne 0 ]; then diff --git a/os-partition/added/queryapi/query.py b/os-partition/added/query.py similarity index 99% rename from os-partition/added/queryapi/query.py rename to os-partition/added/query.py index 0190e52b..c9291bc7 100644 --- a/os-partition/added/queryapi/query.py +++ b/os-partition/added/query.py @@ -1,3 +1,4 @@ +#!/bin/python """ Copyright 2018 Red Hat, Inc. 
diff --git a/os-partition/install_as_root.sh b/os-partition/install_as_root.sh index c4e3a3cd..7fa743f8 100755 --- a/os-partition/install_as_root.sh +++ b/os-partition/install_as_root.sh @@ -6,7 +6,6 @@ ADDED_DIR=${SCRIPT_DIR}/added test -d /opt/partition || mkdir /opt/partition -cp "$ADDED_DIR"/partitionPV.sh \ - /opt/partition/ +cp "$ADDED_DIR"/* /opt/partition/ -chmod 755 /opt/partition/partitionPV.sh +chmod 755 /opt/partition/* diff --git a/os-partition/module.yaml b/os-partition/module.yaml index 0aa42892..7beccac9 100644 --- a/os-partition/module.yaml +++ b/os-partition/module.yaml @@ -3,6 +3,4 @@ name: os-partition version: '1.0' description: Lock-free os-partition script package. execute: -- script: configure.sh - user: '185' - script: install_as_root.sh From c0ad3d349465ed1970bf117f1d48192e4495749d Mon Sep 17 00:00:00 2001 From: rcernich Date: Wed, 6 Jun 2018 11:18:12 -0600 Subject: [PATCH 3/3] CLOUD-2261 more tweaks to pass unit tests Signed-off-by: rcernich --- os-partition/added/partitionPV.sh | 6 +++--- tests/features/datagrid/datagrid_split.feature | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/os-partition/added/partitionPV.sh b/os-partition/added/partitionPV.sh index 59266706..54732020 100644 --- a/os-partition/added/partitionPV.sh +++ b/os-partition/added/partitionPV.sh @@ -114,7 +114,7 @@ function migratePV() { # 1.a.i) if is not in the cluster echo "examining existence of living pod for directory: '${applicationPodDir}'" unset LIVING_PODS - LIVING_PODS=($(${BASH_SOURCE[0]}/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + LIVING_PODS=($($(dirname ${BASH_SOURCE[0]})/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue STATUS=-1 # here we have data about living pods and the recovery marker can be removed if the pod is living if ! 
arrContains ${applicationPodName} "${LIVING_PODS[@]}"; then @@ -174,7 +174,7 @@ function migratePV() { local recoveryPodNameToCheck=${recoveryPodFileToCheck#*RECOVERY-} unset LIVING_PODS - LIVING_PODS=($(${BASH_SOURCE[0]}/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) + LIVING_PODS=($($(dirname ${BASH_SOURCE[0]})/query.py -q pods_living -f list_space ${DEBUG_QUERY_API_PARAM})) [ $? -ne 0 ] && echo "ERROR: Can't get list of living pods" && continue if ! arrContains ${recoveryPodNameToCheck} "${LIVING_PODS[@]}"; then @@ -196,7 +196,7 @@ function probePodLog() { init_pod_name local podNameToProbe=${1:-$POD_NAME} - local logOutput=$(${BASH_SOURCE[0]}/query.py -q log ${podNameToProbe}) + local logOutput=$($(dirname ${BASH_SOURCE[0]})/query.py -q log ${podNameToProbe}) local probeStatus=$? if [ $probeStatus -ne 0 ]; then diff --git a/tests/features/datagrid/datagrid_split.feature b/tests/features/datagrid/datagrid_split.feature index 6148b55c..ea2c1b5c 100644 --- a/tests/features/datagrid/datagrid_split.feature +++ b/tests/features/datagrid/datagrid_split.feature @@ -4,7 +4,7 @@ Feature: Openshift DataGrid SPLIT tests Scenario: Ensure split doesn't happen with regular configuration When container is ready Then container log should match regex .*Data Grid.*started.* - And available container log should not contain Attempting to obtain lock for directory: + And available container log should contain jboss.server.data.dir = /opt/datagrid/standalone/data @jboss-datagrid-6 @jboss-datagrid-7 Scenario: Ensure split happens with SPLIT_DATA @@ -12,7 +12,7 @@ Feature: Openshift DataGrid SPLIT tests | variable | value | | SPLIT_DATA | TRUE | Then container log should match regex .*Data Grid.*started.* - And available container log should contain Attempting to obtain lock for directory: + And available container log should contain jboss.server.data.dir = /opt/datagrid/standalone/partitioned_data/ @jboss-datagrid-7 Scenario: Ensure split happens with DATAGRID_SPLIT 
@@ -20,4 +20,4 @@ Feature: Openshift DataGrid SPLIT tests | variable | value | | DATAGRID_SPLIT | TRUE | Then container log should match regex .*Data Grid.*started.* - And available container log should contain Attempting to obtain lock for directory: + And available container log should contain jboss.server.data.dir = /opt/datagrid/standalone/partitioned_data/