From 613c878568ffb4b1a32cfe8e15e454b1b9a76c37 Mon Sep 17 00:00:00 2001
From: Daniel Eisenberg
Date: Mon, 10 Feb 2025 21:30:14 +0200
Subject: [PATCH] code sync

---
 build_system/docker/entry.sh                  |   8 +-
 nodes/orchestration/package/CMakeLists.txt    |   1 +
 .../watchdog/revert_orchestrator_version.sh   | 101 ++++++++++++++++++
 nodes/orchestration/package/watchdog/watchdog |  81 ++++++++++++++
 4 files changed, 187 insertions(+), 4 deletions(-)
 create mode 100755 nodes/orchestration/package/watchdog/revert_orchestrator_version.sh

diff --git a/build_system/docker/entry.sh b/build_system/docker/entry.sh
index 6c8c696..e74c7a6 100644
--- a/build_system/docker/entry.sh
+++ b/build_system/docker/entry.sh
@@ -98,19 +98,19 @@ while true; do
         init=true
         /etc/cp/watchdog/cp-nano-watchdog >/dev/null 2>&1 &
         sleep 5
-        active_watchdog_pid=$(pgrep -f -x -o "/bin/bash /etc/cp/watchdog/cp-nano-watchdog")
+        active_watchdog_pid=$(pgrep -f -x -o "/bin/(bash|sh) /etc/cp/watchdog/cp-nano-watchdog")
     fi
 
-    current_watchdog_pid=$(pgrep -f -x -o "/bin/bash /etc/cp/watchdog/cp-nano-watchdog")
+    current_watchdog_pid=$(pgrep -f -x -o "/bin/(bash|sh) /etc/cp/watchdog/cp-nano-watchdog")
     if [ ! -f /tmp/restart_watchdog ] && [ "$current_watchdog_pid" != "$active_watchdog_pid" ]; then
         echo "Error: Watchdog exited abnormally"
         exit 1
     elif [ -f /tmp/restart_watchdog ]; then
         rm -f /tmp/restart_watchdog
-        kill -9 "$(pgrep -f -x -o "/bin/bash /etc/cp/watchdog/cp-nano-watchdog")"
+        kill -9 "$(pgrep -f -x -o "/bin/(bash|sh) /etc/cp/watchdog/cp-nano-watchdog")"
         /etc/cp/watchdog/cp-nano-watchdog >/dev/null 2>&1 &
         sleep 5
-        active_watchdog_pid=$(pgrep -f -x -o "/bin/bash /etc/cp/watchdog/cp-nano-watchdog")
+        active_watchdog_pid=$(pgrep -f -x -o "/bin/(bash|sh) /etc/cp/watchdog/cp-nano-watchdog")
     fi
 
     sleep 5
diff --git a/nodes/orchestration/package/CMakeLists.txt b/nodes/orchestration/package/CMakeLists.txt
index 6c9c4f1..f1558dd 100755
--- a/nodes/orchestration/package/CMakeLists.txt
+++ b/nodes/orchestration/package/CMakeLists.txt
@@ -26,6 +26,7 @@ install(FILES configuration/cp-nano-orchestration-debug-conf.json DESTINATION ./
 install(FILES watchdog/watchdog DESTINATION ./orchestration/watchdog/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
 install(FILES watchdog/wait-for-networking-inspection-modules.sh DESTINATION ./orchestration/watchdog/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
 install(FILES watchdog/access_pre_init DESTINATION ./orchestration/watchdog/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
+install(FILES watchdog/revert_orchestrator_version.sh DESTINATION ./orchestration/watchdog/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
 install(FILES local-default-policy.yaml DESTINATION ./orchestration/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
 install(FILES open-appsec-cloud-mgmt DESTINATION ./orchestration/ PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ)
 
diff --git a/nodes/orchestration/package/watchdog/revert_orchestrator_version.sh b/nodes/orchestration/package/watchdog/revert_orchestrator_version.sh
new file mode 100755
index 0000000..c411541
--- /dev/null
+++ b/nodes/orchestration/package/watchdog/revert_orchestrator_version.sh
@@ -0,0 +1,101 @@
+#!/bin/sh
+#
+# Reverts the orchestration service to the last known working orchestrator.
+# Usage: revert_orchestrator_version.sh <log file>
+#
+# The version (second field) recorded in revert/upgrade_status is appended to
+# revert/forbidden_versions, revert/last_known_manifest is copied over the
+# configured manifest and revert/last_known_working_orchestrator is run with
+# "--install".
+
+SCRIPT_FOLDER=$(dirname "$0")
+PARENT_FOLDER=$(dirname "$SCRIPT_FOLDER")
+FILESYSTEM_PATH=$PARENT_FOLDER
+UPGRADE_STATUS_FILE=${FILESYSTEM_PATH}/revert/upgrade_status
+FORBIDDEN_VERSIONS_FILE=${FILESYSTEM_PATH}/revert/forbidden_versions
+LAST_KNOWN_WORKING_ORCHESTRATOR=${FILESYSTEM_PATH}/revert/last_known_working_orchestrator
+LOG_FILE=$1
+CONFIG_FILE="${FILESYSTEM_PATH}/conf/cp-nano-orchestration-conf.json"
+ENV_DETAILS_FILE="${FILESYSTEM_PATH}/conf/environment-details.cfg"
+
+# Prints the "value" that follows <key> after <section> in $CONFIG_FILE, or
+# <default> when the key is not found.
+#   $1 - section name, $2 - key name, $3 - default value
+get_configuration_with_default()
+{
+    section="$1"
+    key="$2"
+    default_value="$3"
+
+    local value
+    value=$(awk -v section="$section" -v k="$key" -v def_value="$default_value" '
+        BEGIN {
+            found_section=0;
+            found_key=0;
+        }
+        $0 ~ "\"" section "\"" { found_section=1; next; }
+        found_section && $0 ~ "\"" k "\"" {
+            found_key=1;
+            next;
+        }
+        found_key && $0 ~ /"value"/ {
+            # Two-argument match() with RSTART/RLENGTH is POSIX awk; the
+            # three-argument (array) form is a gawk-only extension.
+            if (match($0, /"value"[[:space:]]*:[[:space:]]*"?/)) {
+                val = substr($0, RSTART + RLENGTH);
+                sub(/[",}].*$/, "", val);
+                if (val != "")
+                    print val;
+            }
+            exit;
+        }
+        found_section && $0 ~ /^\}/ { found_section=0; found_key=0; }
+        END {
+            if (!found_key) print def_value;
+        }
+    ' "$CONFIG_FILE")
+
+    echo "$value"
+}
+
+# Appends a timestamped message to $LOG_FILE.
+#   $1 - name of the caller, $2 - message text
+log()
+{
+    curr_date_time=$(date +%Y-%m-%dT%H:%M:%S)
+    callee_function=${1}
+    echo "[${curr_date_time}@${callee_function}] ${2}" >>"${LOG_FILE}"
+}
+
+# Remember the version being reverted and keep a copy of its status file.
+if [ -f "$UPGRADE_STATUS_FILE" ]; then
+    awk '{print $2}' "$UPGRADE_STATUS_FILE" >> "$FORBIDDEN_VERSIONS_FILE"
+    cp "$UPGRADE_STATUS_FILE" "${FILESYSTEM_PATH}/revert/failed_upgrade_info"
+fi
+
+if [ -f "$LAST_KNOWN_WORKING_ORCHESTRATOR" ]; then
+    manifest_file_path=$(get_configuration_with_default "orchestration" "Manifest file path" "${FILESYSTEM_PATH}/conf/manifest.json")
+    cp "${FILESYSTEM_PATH}/revert/last_known_manifest" "$manifest_file_path"
+
+    # Make the orchestrator executable before its first invocation below.
+    chmod +x "${LAST_KNOWN_WORKING_ORCHESTRATOR}"
+    to_version=$(awk '{print $2}' "$UPGRADE_STATUS_FILE")
+    last_known_orch_version=$("$LAST_KNOWN_WORKING_ORCHESTRATOR" --version)
+    log "revert_orchestrator_version.sh" "Reverting orchestration version $to_version to last known working orchestrator (version: $last_known_orch_version)"
+    installation_flags="--install"
+
+    trusted_ca_directory=$(get_configuration_with_default "message" "Trusted CA directory" "")
+    if [ -n "$trusted_ca_directory" ]; then
+        installation_flags="${installation_flags} --certs-dir ${trusted_ca_directory}"
+    fi
+    if grep -q '^CP_VS_ID=' "$ENV_DETAILS_FILE"; then
+        cp_vs_id=$(grep '^CP_VS_ID=' "$ENV_DETAILS_FILE" | cut -d'=' -f2)
+        installation_flags="${installation_flags} --vs_id ${cp_vs_id}"
+    fi
+
+    # installation_flags is deliberately unquoted so each flag is a separate word.
+    "$LAST_KNOWN_WORKING_ORCHESTRATOR" ${installation_flags}
+else
+    log "revert_orchestrator_version.sh" "Last known working orchestrator not found"
+    exit 1
+fi
diff --git a/nodes/orchestration/package/watchdog/watchdog b/nodes/orchestration/package/watchdog/watchdog
index c8fa4c3..ababdf7 100755
--- a/nodes/orchestration/package/watchdog/watchdog
+++ b/nodes/orchestration/package/watchdog/watchdog
@@ -36,6 +36,8 @@ TMP_VOL_SRVS_FILE_PRE_DEL=watchdog/wd.volatile_services.del
 SRVS_HALTED=watchdog/wd.services.halt
 SERVICE_LOG_FILE_TTL_MINUTES=10080
 PIDOF_CMD_EXISTS=0
+CONFIG_FILE="${FILESYSTEM_PATH}/conf/cp-nano-orchestration-conf.json"
+SETTINGS_FILE="${FILESYSTEM_PATH}/conf/settings.json"
 
 env_details_file=conf/environment-details.cfg
 
@@ -48,6 +50,48 @@ VS_EVAL_PREFIX=
 var_service_startup=
 var_upgarde=false
 
+# Prints the value stored for the agent setting <key> in $SETTINGS_FILE, or
+# <default> when no non-empty, non-"null" value is found.
+# Requires a grep that supports PCRE (-P).
+#   $1 - setting key, $2 - default value
+get_profile_agent_setting_with_default() {
+    key="$1"
+    default_value="$2"
+    value=$(grep -oP "\"key\":\s*\"$key\".*?\"value\":\s*\"[^\"]+\"" "$SETTINGS_FILE" | sed -E 's/.*"value":\s*"([^"]+)".*/\1/')
+    if [ "$value" = "null" ] || [ -z "$value" ]; then
+        echo "$default_value"
+    else
+        echo "$value"
+    fi
+}
+
+MAX_ORCH_RESTARTS=$(get_profile_agent_setting_with_default "maxOrchestrationRestartsWithinThreeMin" "10")
+MAX_AGE_MINUTES=$(get_profile_agent_setting_with_default "upgradeProcessTimeoutMin" "90")
+MAX_AGE_SECONDS=$((MAX_AGE_MINUTES * 60))
+
+# Records one orchestrator restart in $orch_counters, a list of three
+# per-interval restart counts (newest first). The counts are shifted by one
+# slot for every $interval_duration seconds elapsed since $last_update.
+update_orchestrations_counters()
+{
+    current_time=$(date +%s)
+    elapsed_time=$((current_time - last_update))
+    intervals_passed=$((elapsed_time / interval_duration))
+
+    if [ "$intervals_passed" -gt 0 ]; then
+        shifts=$((intervals_passed > 3 ? 3 : intervals_passed))
+        for _ in $(seq 1 "$shifts"); do
+            orch_counters="0 $(echo "$orch_counters" | cut -d' ' -f1-2)"
+        done
+        last_update=$((last_update + intervals_passed * interval_duration))
+    fi
+
+    first=$(echo "$orch_counters" | cut -d' ' -f1)
+    rest=$(echo "$orch_counters" | cut -d' ' -f2-)
+    first=$((first + 1))
+    orch_counters="$first $rest"
+}
+
 get_basename()
 {
     is_basename="$(command -v basename)"
@@ -830,6 +874,18 @@ load_services()
             else
                 var_service_startup=false
             fi
+
+            # While revert/upgrade_status exists, count orchestrator restarts
+            # and run the revert script once MAX_ORCH_RESTARTS is reached.
+            crashes_revert=$(get_profile_agent_setting_with_default "allowCrashesRevert" "true")
+            if [ "$crashes_revert" = "true" ] && [ "$(get_basename $service)" = "cp-nano-orchestration" ] && [ -f ${FILESYSTEM_PATH}/revert/upgrade_status ]; then
+                update_orchestrations_counters
+                total_orch_restarts=$(echo "$orch_counters" | awk '{print $1 + $2 + $3}')
+                log "load_services" "orchestrator restart no. ${total_orch_restarts}"
+                if [ "$total_orch_restarts" -ge "$MAX_ORCH_RESTARTS" ]; then
+                    ${SCRIPT_FOLDER}/revert_orchestrator_version.sh ${LOG_FILE_PATH}/$LOG_FILE
+                fi
+            fi
             run_service $service $gaia_ld_path
             increment_watchdog_process_restart_counter
             echo "running" > $AGENT_RUN_STATUS_FILE
@@ -1010,6 +1066,13 @@ else
 fi
 IS_SERVICE_STARTED=false
 echo "" >${FILESYSTEM_PATH}/$SRVS_HALTED
+
+# State used by update_orchestrations_counters and by the main loop below.
+last_update=$(date +%s)
+interval_duration=60
+orch_counters="0 0 0"
+iteration_count=0
+
 while $(true); do
     if [ -z $IS_CONTAINER_ENV ] && [ -f ${FILESYSTEM_PATH}/orchestration/restart_watchdog ]; then
         rm -f ${FILESYSTEM_PATH}/orchestration/restart_watchdog
@@ -1028,5 +1091,23 @@ while $(true); do
 
     rotate_service_log
     daily_log_files_cleanup
+
+    # Every 10th iteration: when allowFileAgeRevert is "true", revert if
+    # revert/upgrade_status was last modified more than MAX_AGE_SECONDS ago.
+    iteration_count=$((iteration_count + 1))
+    if [ $((iteration_count % 10)) -eq 0 ]; then
+        file_age_revert=$(get_profile_agent_setting_with_default "allowFileAgeRevert" "false")
+        if [ "$file_age_revert" = "true" ] && [ -f ${FILESYSTEM_PATH}/revert/upgrade_status ]; then
+            file_mtime=$(stat -c %Y "${FILESYSTEM_PATH}/revert/upgrade_status")
+            current_time=$(date +%s)
+            file_age=$((current_time - file_mtime))
+
+            if [ "$file_age" -gt "$MAX_AGE_SECONDS" ]; then
+                log "monitor_upgrade_status_file_age" "The file has existed for more than $MAX_AGE_MINUTES minutes."
+                ${SCRIPT_FOLDER}/revert_orchestrator_version.sh ${LOG_FILE_PATH}/$LOG_FILE
+            fi
+        fi
+    fi
+
     sleep 5
 done