#!/bin/sh

# Watchdog configuration.
# All paths are derived from the location of this script: the parent of the
# script's folder is used as the filesystem root (FILESYSTEM_PATH).

SCRIPT_FOLDER=$(dirname "$0")
PARENT_FOLDER=$(dirname "$SCRIPT_FOLDER")
FILESYSTEM_PATH=$PARENT_FOLDER
VS_ID_PATTERN="vs[0-9]\+$"
VS_ID=
TMP_FOLDER="/tmp"
USR_LIB_PATH="/usr/lib"
NGEN_LIB_PATH=${USR_LIB_PATH}/cpnano/

# When the parent folder name ends with "vs<number>", take the part after the
# last "vs" as the VS id and switch to per-VS temp and library folders.
if echo "$PARENT_FOLDER" | grep -q "$VS_ID_PATTERN"; then
    VS_ID="${PARENT_FOLDER##*vs}"
    TMP_FOLDER="/tmp/${VS_ID}"
    mkdir -p "${TMP_FOLDER}"
    NGEN_LIB_PATH=${USR_LIB_PATH}/cpnano/vs${VS_ID}/
    if [ -f "/etc/bashrc" ]; then
        . /etc/bashrc
        # NOTE(review): vsenv is expected to be provided by /etc/bashrc - confirm
        vsenv ${VS_ID}
    fi
fi

LOG_FILE_PATH="/var/log"
INIT_D_PATH="/etc/init.d"
WATCHDOG_PROCESS_RESTART_COUNTER="${FILESYSTEM_PATH}/watchdog/watchdog_process_restart"

# The file names below are relative: LOG_FILE to LOG_FILE_PATH, the watchdog/*
# files to FILESYSTEM_PATH.
LOG_FILE=nano_agent/cp-nano-watchdog.dbg
AGENT_RUN_STATUS_FILE=${TMP_FOLDER}/agent-status.txt
SRVS_FILE=watchdog/wd.services
STARTUP_SRVS_FILE=watchdog/wd.services.startup
TMP_SRVS_FILE=watchdog/wd.temp
VOL_SRVS_FILE=watchdog/wd.volatile_services
SRVS_CONTAINER_FILE=watchdog/wd.container_services_startup
SRVS_TO_RESTART_FILE=watchdog/wd.services.restart
SRVS_TO_STOP_FILE=watchdog/wd.services.stop
TMP_SRVS_TO_STOP_FILE=watchdog/wd.stop.temp
TMP_VOL_SRVS_FILE_PRE_STOP=watchdog/wd.volatile_services.stop
TMP_VOL_SRVS_FILE_PRE_DEL=watchdog/wd.volatile_services.del
SRVS_HALTED=watchdog/wd.services.halt
SERVICE_LOG_FILE_TTL_MINUTES=10080
PIDOF_CMD_EXISTS=0
CONFIG_FILE="${FILESYSTEM_PATH}/conf/cp-nano-orchestration-conf.json"
SETTINGS_FILE="${FILESYSTEM_PATH}/conf/settings.json"

env_details_file=conf/environment-details.cfg

#default raw log file size in K-Bytes before rotation
DEFAULT_MAX_FILE_SIZE=4096
#default amount of archived log files to rotate
DEFAULT_MAX_ROTATION=10

VS_EVAL_PREFIX=

var_service_startup=
var_upgarde=false

# Print the value of an agent setting stored in SETTINGS_FILE, or a default.
#
# The file is searched for a `"key": "<key>" ... "value": "<value>"` pair on a
# single line and the quoted value is extracted.
#
# Arguments:
#   $1 - setting key to look up (interpolated into the regex as-is)
#   $2 - default value
# Outputs:
#   The setting value on stdout; the default when the file is missing, the key
#   is not found, or the value is empty or the literal string "null".
get_profile_agent_setting_with_default()
{
    key="$1"
    default_value="$2"
    value=
    # Skip the lookup when the settings file does not exist, so callers that
    # poll this function do not emit a grep error on every call.
    if [ -f "$SETTINGS_FILE" ]; then
        # head -n 1: if the key appears more than once keep only the first
        # match, so the result is always a single line.
        value=$(grep -oP "\"key\":\s*\"$key\".*?\"value\":\s*\"[^\"]+\"" "$SETTINGS_FILE" 2>/dev/null |
            sed -E 's/.*"value":\s*"([^"]+)".*/\1/' |
            head -n 1)
    fi
    if [ "$value" = "null" ] || [ -z "$value" ]; then
        echo "$default_value"
    else
        echo "$value"
    fi
}
MAX_ORCH_RESTARTS=$(get_profile_agent_setting_with_default "maxOrchestrationRestartsWithinThreeMin" "10")
MAX_AGE_MINUTES=$(get_profile_agent_setting_with_default "upgradeProcessTimeoutMin" "90")
MAX_AGE_SECONDS=$((MAX_AGE_MINUTES * 60))

# Record one orchestration restart in a sliding window of three counters.
# orch_counters holds three space-separated counters, newest first. For every
# interval_duration seconds elapsed since last_update (capped at 3) a fresh "0"
# is pushed at the front and the oldest counter is dropped; then the newest
# counter is incremented.
# Globals: last_update, interval_duration (read); orch_counters, last_update (written)
update_orchestrations_counters()
{
    current_time=$(date +%s)
    elapsed_time=$((current_time - last_update))
    intervals_passed=$((elapsed_time / interval_duration))
    if [ "$intervals_passed" -gt 0 ]; then
        shifts=$((intervals_passed > 3 ? 3 : intervals_passed))
        for _ in $(seq 1 "$shifts"); do
            orch_counters="0 $(echo "$orch_counters" | cut -d' ' -f1-2)"
        done
        last_update=$((last_update + intervals_passed * interval_duration))
    fi
    first=$(echo "$orch_counters" | cut -d' ' -f1)
    rest=$(echo "$orch_counters" | cut -d' ' -f2-)
    first=$((first + 1))
    orch_counters="$first $rest"
}

# Print the last path component of $1.
# Uses basename when available, otherwise falls back to rev/cut.
get_basename()
{
    is_basename="$(command -v basename)"
    # The expansion must be quoted: an unquoted empty value turns the test into
    # `[ -n ]`, which is always true and would make the fallback unreachable.
    if [ -n "${is_basename}" ]; then
        basename "$1"
    else
        echo "$1" | rev | cut -d / -f 1 | rev
    fi
}

# Source the environment files (if present) and apply the CP_* overrides to
# the path variables; also set the log rotation defaults when not provided.
load_paths()
{
    [ -f /etc/environment ] && . "/etc/environment"
    if [ -f "${FILESYSTEM_PATH}/$env_details_file" ]; then
        . ${FILESYSTEM_PATH}/$env_details_file
    fi

    if [ -n "${CP_ENV_FILESYSTEM}" ]; then
        FILESYSTEM_PATH=$CP_ENV_FILESYSTEM
    fi
    if [ -n "${CP_ENV_LOG_FILE}" ]; then
        LOG_FILE_PATH=$CP_ENV_LOG_FILE
    fi
    if [ -n "${CP_VS_ID}" ]; then
        VS_ID=${CP_VS_ID}
        VS_EVAL_PREFIX="ip netns exec CTX0000${VS_ID} env"
        NGEN_LIB_PATH=${USR_LIB_PATH}/cpnano/vs${VS_ID}/
    fi
    if [ -n "${CP_USR_LIB_PATH}" ]; then
        USR_LIB_PATH=$CP_USR_LIB_PATH
        if [ -z "${VS_ID}" ]; then
            NGEN_LIB_PATH=${USR_LIB_PATH}/cpnano/
        else
            NGEN_LIB_PATH=${USR_LIB_PATH}/cpnano/vs${VS_ID}/
        fi
    fi
    if [ -n "${CP_INIT_D_PATH}" ]; then
        INIT_D_PATH=$CP_INIT_D_PATH
    fi
    if [ -z "${MAX_FILE_SIZE}" ]; then
        MAX_FILE_SIZE=$DEFAULT_MAX_FILE_SIZE
    fi
    if [ -z "${MAX_ROTATION}" ]; then
        MAX_ROTATION=$DEFAULT_MAX_ROTATION
    fi
}

load_paths

pidof_cmd="pidof -x"
if command -v pidof > /dev/null 2>&1; then
    PIDOF_CMD_EXISTS=1
fi

# Platform detection: a previously stored value in watchdog/platform wins,
# otherwise the command-line flags and finally `uname` decide.
ARCH="x86"
if [ -f ${FILESYSTEM_PATH}/watchdog/platform ]; then
    ARCH=$(cat ${FILESYSTEM_PATH}/watchdog/platform)
else
    for m in "$@"; do
        if [ "$m" = --arm32_openwrt ]; then
            ARCH="arm"
            MAX_FILE_SIZE=50
            echo "arm" >${FILESYSTEM_PATH}/watchdog/platform
        elif [ "$m" = --gaia ]; then
            ARCH="gaia"
            echo "gaia" >${FILESYSTEM_PATH}/watchdog/platform
        fi
    done

    if [ ! -f ${FILESYSTEM_PATH}/watchdog/platform ]; then
        if [ -n "$(uname -a | awk '{print $(NF -1) }' | grep arm)" ]; then
            ARCH="arm"
            echo "arm" >${FILESYSTEM_PATH}/watchdog/platform
        fi
    fi
fi

# pidof replacement used on alpine: prints the first `ps -ef` column of every
# line matching $1, separated by spaces.
alpine_pid()
{
    ps -ef | grep $1 | grep -v grep | awk '{printf $1 " "}'
}

ls -l /etc/ | grep release > /dev/null 2>&1
retval=$?

if [ $retval -eq 0 ]; then
    if [ ! -z "$(cat /etc/*release | grep alpine)" ]; then
        pidof_cmd="alpine_pid"
        ARCH="alpine"
        echo "alpine" >${FILESYSTEM_PATH}/watchdog/platform
    fi
fi

is_smb_release=0
if [ -f /pfrm2.0/bin/cposd ]; then
    is_smb_release=1
    ARCH="smb"
    echo "smb" >${FILESYSTEM_PATH}/watchdog/platform
    pidof_cmd="/pfrm2.0/bin/nano_pidof"
    MAX_FILE_SIZE=50
    if [ ! -d /storage/tmp/orchestration_downloads -o ! -h /tmp/orchestration_downloads ]; then
        rm -rf /tmp/orchestration_downloads
        mkdir -p /storage/tmp/orchestration_downloads
        ln -sf /storage/tmp/orchestration_downloads /tmp/orchestration_downloads
    fi
    SUB_HW_VER=$(fw_printenv -n sub_hw_ver)
    # Clear TMPDIR(set by nano-egg in SMB),
    # so the nano-agent will use the default tmp dir(/tmp).
    export TMPDIR=
fi

# Append a timestamped line to the watchdog log file.
# Arguments: $1 - name of the calling function, $2 - message
log()
{
    curr_date_time=$(date +%Y-%m-%dT%H:%M:%S)
    callee_function=${1}
    echo "[${curr_date_time}@${callee_function}] ${2}" >>${LOG_FILE_PATH}/$LOG_FILE
}

# Signal handler: stop every registered service and every volatile service
# instance, mark the agent as "down" and exit with status 0.
sigterm()
{
    # Should be replaced with proper ending
    log "sigterm" "Terminating cp-nano-agent watchdog service"
    for service in $(cat ${FILESYSTEM_PATH}/${SRVS_FILE}); do
        if [ $ARCH = "arm" ]; then
            cmd_pid=$(ps | awk -v srv=${service} '{if($5==srv) print $1}')
            if [ "${cmd_pid:-null}" = null ] && [ $PIDOF_CMD_EXISTS -eq 1 ]; then
                base_name=$(get_basename $service)
                cmd_pid=$(pidof $base_name)
            fi
        elif [ $ARCH = "alpine" ]; then
            cmd_pid=$(ps -ef | awk -v srv=${service} '{if($4 ~ srv || $3 ~ srv || $6 ~ srv) print $1}')
        else
            cmd_pid=$(ps -eo pid,cmd | awk -v srv=${service} '{if($2 == srv || $3 == srv) print $1}')
        fi
        if ! [ "${cmd_pid:-null}" = null ]; then
            log "sigterm" "stopping $service (pid=$cmd_pid)"
            kill_processes_by_pid $cmd_pid
        fi
    done

    if [ -f ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ]; then
        mv ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        for service_line in $(cat ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}); do
            service=$(echo $service_line | cut -f1 -d ';')
            family=$(echo $service_line | cut -f2 -d ';')
            instance_id=$(echo $service_line | cut -f3 -d ';')
            stop_instance $service $instance_id $family
        done
        rm ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        # wait for kill_process_by_pid that is run async by stop_instance
        wait
    fi

    echo "down" > $AGENT_RUN_STATUS_FILE
    log "sigterm" "cp-nano-agent watchdog service was successfully stopped "
    exit 0
}

# Alias of sigterm.
stop()
{
    sigterm
}

trap 'sigterm' TERM
trap 'sigterm' INT

# Start a registered (single instance) service in the background.
# Arguments: $1 - service executable path, $2 - optional extra library path
# The optional "<service>.cfg" file is sourced and may set execution_flags,
# srv_debug_file and srv_err_file. stderr of the service goes to srv_err_file.
run_service()
{
    service=$1
    execution_flags=
    srv_debug_file=
    srv_err_file=
    gaia_ld_path=$2
    run_in_vs=

    log "run_service" "Running the service: $service"
    if [ -f ${service}.cfg ]; then
        . "${service}.cfg"
    fi

    if [ -z "${srv_debug_file}" ]; then
        base_name=$(get_basename $service)
        srv_debug_file=${LOG_FILE_PATH}/nano_agent/${base_name}.dbg
    fi
    if [ -z "${srv_err_file}" ]; then
        base_name=$(get_basename $service)
        srv_err_file=${LOG_FILE_PATH}/nano_agent/${base_name}.err
    fi
    if ! [ -z "${gaia_ld_path}" ]; then
        if [ -n "$VS_ID" ]; then
            NGEN_LIB_PATH="${USR_LIB_PATH}/cpnano/vs${VS_ID}/:${gaia_ld_path}"
        else
            NGEN_LIB_PATH="${USR_LIB_PATH}/cpnano/:${gaia_ld_path}"
        fi
    fi

    if [ -n "$VS_ID" ]; then
        run_in_vs="ip netns exec CTX0000${VS_ID}"
    fi

    # These three services are started without the watchdog path arguments.
    if [ "${service}" = "${FILESYSTEM_PATH}/agentCache/cp-nano-agent-cache" ] ||
        [ "${service}" = "${FILESYSTEM_PATH}/agentIntelligence/redis/redis-server" ] ||
        [ "${service}" = "${FILESYSTEM_PATH}/crowdsecAux/cp-nano-crowdsec-aux" ]; then
        LD_LIBRARY_PATH=${NGEN_LIB_PATH} ${run_in_vs} ${service} ${execution_flags} 2>${srv_err_file} 1>/dev/null &
    else
        LD_LIBRARY_PATH=${NGEN_LIB_PATH} ${run_in_vs} ${service} ${execution_flags} --filesystem_path=${FILESYSTEM_PATH} --log_files_path=${LOG_FILE_PATH} --service_startup=${var_service_startup} 2>${srv_err_file} 1>/dev/null &
    fi
}

# Start one instance of a volatile (multi instance) service in the background.
# Arguments: $1 - "<service>;<family>;<instance id>" line,
#            $2 - optional extra library path
run_volatile_service()
{
    service_line=$1
    service=$(echo $service_line | cut -f1 -d ';')
    family=$(echo $service_line | cut -f2 -d ';')
    instance_id=$(echo $service_line | cut -f3 -d ';')
    already_running="0"
    execution_flags=
    srv_debug_file=
    srv_err_file=
    gaia_ld_path=$2

    log "run_volatile_service" "Running the service: ${service} ($family : $instance_id)"
    if [ -f ${service}.cfg ]; then
        . "${service}.cfg"
    fi

    if [ -z "$family" ]; then
        debug_file_suffix=${instance_id}
    else
        debug_file_suffix=${family}_${instance_id}
    fi

    if [ -z "${srv_debug_file}" ]; then
        base_name=$(get_basename $service)
        srv_debug_file=${LOG_FILE_PATH}/nano_agent/$base_name.dbg${debug_file_suffix}
    fi
    if [ -z "${srv_err_file}" ]; then
        base_name=$(get_basename $service)
        srv_err_file=${LOG_FILE_PATH}/nano_agent/$base_name.err${debug_file_suffix}
    fi
    if ! [ -z "${gaia_ld_path}" ]; then
        if [ -n "$VS_ID" ]; then
            NGEN_LIB_PATH="${USR_LIB_PATH}/cpnano/vs${VS_ID}/:${gaia_ld_path}"
        else
            NGEN_LIB_PATH="${USR_LIB_PATH}/cpnano/:${gaia_ld_path}"
        fi
        # The SMB flag set by this script is is_smb_release; is_smb is still
        # honoured in case a sourced environment/cfg file provides it.
        if { [ "$is_smb" = "1" ] || [ "$is_smb_release" = "1" ]; } && [ "$SUB_HW_VER" = "THX2" ]; then
            NGEN_LIB_PATH="/lib64:/pfrm2.0/lib64:${NGEN_LIB_PATH}"
        fi
    fi

    if [ -n "$VS_ID" ]; then
        run_in_vs="ip netns exec CTX0000${VS_ID}"
    fi

    family_arg=""
    if [ -n "${family}" ]; then
        family_arg="--family=${family}"
    fi

    # The error file is always derived from the service name here, overriding
    # any srv_err_file set by the cfg file.
    base_name=$(get_basename $service)
    srv_err_file=${LOG_FILE_PATH}/nano_agent/$base_name.err${debug_file_suffix}
    # printf instead of `echo -en`, whose flags are not portable under /bin/sh.
    printf '[%s]' "$(date)" >> ${srv_debug_file}
    LD_LIBRARY_PATH=${NGEN_LIB_PATH} ${run_in_vs} ${service} ${execution_flags} --filesystem_path=${FILESYSTEM_PATH} --log_files_path=${LOG_FILE_PATH} --service_startup=${var_service_startup} ${family_arg} --id=${instance_id} 2>${srv_err_file} &
}

# Register a service with the watchdog.
# Arguments: $1 - service name (required),
#            $2 - family size; when empty the service is single instance,
#            $3 - family name
# Exits with 1 when no service is given, 0 when already registered.
register()
{
    log "register" "enter"
    load_paths
    service_name=$1
    family_size=$2
    family_name=$3

    if [ -z "$service_name" ]; then
        log "register" "Error! no service provided for registration"
        echo "Error! no service provided for registration"
        exit 1
    fi

    if [ ! -z "$IS_CONTAINER_ENV" ]; then
        touch ${FILESYSTEM_PATH}/${SRVS_CONTAINER_FILE}
        echo "$service_name" >>${FILESYSTEM_PATH}/${SRVS_CONTAINER_FILE}
    fi

    if [ -z "$family_size" ]; then
        #handle single instance services
        if ! [ -z "$(cat ${FILESYSTEM_PATH}/${SRVS_FILE} | grep ^${service_name}$)" ]; then
            log "register" "Warning! service '$service_name' is already registered"
            echo "Warning! service '$service_name' is already registered"
            exit 0
        fi
        # A service whose name contains "orchestration" is put first in the list.
        if echo "$service_name" | grep -q "orchestration"; then
            temp_file=${FILESYSTEM_PATH}/${SRVS_FILE}.tmp
            echo "$service_name" | cat - ${FILESYSTEM_PATH}/${SRVS_FILE} > $temp_file
            mv $temp_file ${FILESYSTEM_PATH}/${SRVS_FILE}
        else
            echo "$service_name" >>${FILESYSTEM_PATH}/${SRVS_FILE}
        fi
        if [ $var_upgarde = false ]; then
            log "register" "The service $service_name is running for the first time."
            echo "$service_name" >>${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}
        fi
    else
        touch ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}
        # handle multiple instances services
        family_prev_size=$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep "^$service_name;${family_name};" | wc -l)
        if [ $family_size -eq $family_prev_size ]; then
            log "register" "Service '$service_name' already registered with $family_size instances for family '${family_name}'"
            echo "Service '$service_name' already registered with $family_size instances for family '${family_name}'"
            exit 0
        fi
        mv ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        if [ $family_prev_size -gt $family_size ]; then
            # Instances 1..family_size stay registered (see the loop below), so
            # only the surplus instances family_size+1..family_prev_size are stopped.
            for i in $(seq ${family_prev_size} -1 $((family_size + 1))); do
                stop_instance $service_name ${i} ${family_name}
            done
        fi
        # wait for kill_process_by_pid that is run async by stop_instance
        wait

        grep -v -e "^${service_name};${family_name};" ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} >${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL}
        for i in $(seq 1 ${family_size}); do
            echo "$service_name;$family_name;$i" >>${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL}
            if [ $var_upgarde = false ]; then
                log "register" "The service $service_name is running for the first time."
                echo "$service_name" >>${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}
            fi
        done
        mv ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL} ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}
        rm ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
    fi
}

# Stop one instance of a volatile service (the kill runs in the background).
# Arguments: $1 - service name, $2 - instance id, $3 - family name
# IMPORTANT - assumes the volatile services file (VOL_SRVS_FILE) was moved to
# TMP_VOL_SRVS_FILE_PRE_STOP by the caller in order to avoid a race condition
stop_instance()
{
    service_name=$1
    instance_id=$2
    family_name=$3
    service=$(cat ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} | grep "^${service_name};${family_name};${instance_id}$")
    if [ -z "$service" ]; then
        log "stop_instance" "Warning! service '$service_name' with ID $instance_id in family $family_name is already un-registered"
        return
    fi

    family_arg=""
    if [ -n "${family_name}" ]; then
        family_arg="--family=${family_name}"
    fi

    if [ $ARCH = "arm" ]; then
        cmd_pid=$(ps | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${1} '{if($5==srv) print $1}')
    elif [ $ARCH = "alpine" ]; then
        cmd_pid=$(ps -ef | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${1} '{if($4 ~ srv || $3 ~ srv) print $1}')
    else
        cmd_pid=$(ps -eo pid,cmd,args | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${1} '{if($2 ~ srv || $3 ~ srv) print $1}')
    fi
    if ! [ "${cmd_pid:-null}" = null ]; then
        log "stop_instance" "Stopping registered service '$service_name', family $family_name, instance $instance_id with pid=$cmd_pid"
        echo "Stopping registered service '$service_name', family $family_name, instance $instance_id with pid=$cmd_pid"
        kill_processes_by_pid $cmd_pid &
    fi
}

# Terminate the given pids: SIGTERM first, then after a grace period SIGKILL
# for every pid that still has a /proc entry.
kill_processes_by_pid()
{
    for pid in "$@"; do
        kill -15 ${pid}
    done

    sleep 0.5

    # Give the processes two more seconds if at least one is still alive.
    for pid in "$@"; do
        if [ -d /proc/$pid ]; then
            sleep 2
            break
        fi
    done

    for pid in "$@"; do
        if [ -d /proc/$pid ]; then
            kill -9 ${pid}
            log "kill_processes_by_pid" "Terminating ${pid} with sigkill signal"
        fi
    done
}

# Kill every service listed in the restart-request file and empty the file;
# the killed services are started again by the next load_services pass.
kill_services_if_needed()
{
    for service_name in $(cat ${FILESYSTEM_PATH}/${SRVS_TO_RESTART_FILE}); do
        kill_processes_by_pid $(${pidof_cmd} ${service_name})
    done
    echo "" >${FILESYSTEM_PATH}/${SRVS_TO_RESTART_FILE}
    sleep 0.3
}

# Queue a restart request for a registered service ($1).
trigger_restart_service()
{
    service_name=$1
    vol_service=$([ -f ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ] && cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep "^${service_name};")
    service=$([ -f ${FILESYSTEM_PATH}/${SRVS_FILE} ] && cat ${FILESYSTEM_PATH}/${SRVS_FILE} | grep "^${service_name}$")
    if [ -z "$service" ] && [ -z "$vol_service" ]; then
        log "trigger_restart_service" "Requested to restart service '$service_name', but it is not registered"
        echo "Requested to restart service '$service_name', but it is not registered"
        return
    fi
    echo "$service_name" >>${FILESYSTEM_PATH}/$SRVS_TO_RESTART_FILE
}

# Remove a service from the watchdog lists and, unless told otherwise, stop it.
# Arguments: $1 - service name (required),
#            $2 - "without_kill" to keep the process running, or "--all" to
#                 remove every volatile entry mentioning the service,
#            $3 - family name
# Exits with 1 when no service is given, 0 when already un-registered.
unregister()
{
    service_name=$1
    to_kill=$2
    family_name=$3
    kill_flag=1
    if [ -z "$service_name" ]; then
        log "unregister" "Error! no service provided for un-registration"
        echo "Error! no service provided for un-registration"
        exit 1
    fi

    if test "$to_kill" = "without_kill"; then
        kill_flag=0
    fi

    touch ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}
    if [ -z "$family_name" ]; then
        # number of volatile entries of this service that have no family
        no_family=$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep "^$service_name;;" | wc -l)
    fi

    if test "$to_kill" = "--all"; then
        # unregister volatile service family
        if [ -z "$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep ${service_name})" ]; then
            log "unregister" "Warning! service '$service_name' is already un-registered"
            exit 0
        fi
        mv ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        grep -v -e "${service_name}" ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} >${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL}
        family_size=$(cat ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} | grep "^$service_name;${family_name};" | wc -l)
        mv ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL} ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}
        kill_processes_by_pid $(${pidof_cmd} ${service_name})
        rm ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
    elif [ -n "$family_name" ] || [ $no_family -ne 0 ]; then
        # unregister volatile service family
        if [ -z "$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep ^${service_name}\;${family_name}\;)" ]; then
            log "unregister" "Warning! family '$family_name' of service '$service_name' is already un-registered"
            exit 0
        fi
        mv ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        grep -v -e "^${service_name};${family_name};" ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} >${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL}
        family_size=$(cat ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP} | grep "^$service_name;${family_name};" | wc -l)
        mv ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_DEL} ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}
        if [ ${kill_flag} -eq 1 ]; then
            for i in $(seq 1 $family_size); do
                stop_instance $service_name $i $family_name
            done
        fi
        rm ${FILESYSTEM_PATH}/${TMP_VOL_SRVS_FILE_PRE_STOP}
        wait
    else
        # unregister standard service
        service="$(cat ${FILESYSTEM_PATH}/${SRVS_FILE} | grep ^${service_name}$)"
        if [ -z "$service" ]; then
            log "unregister" "Warning! service '$service_name' is already un-registered"
            exit 0
        fi

        # While listed in the stop file the service is skipped by load_services,
        # so it is not respawned between the kill and the list update below.
        if [ -z "$(cat ${FILESYSTEM_PATH}/${SRVS_TO_STOP_FILE} | grep ^${service_name}$)" ]; then
            echo $service >> ${FILESYSTEM_PATH}/${SRVS_TO_STOP_FILE}
        fi

        if [ $ARCH = "arm" ]; then
            cmd_pid=$(ps | awk -v srv=${service} '{if($5==srv) print $1}')
            if [ "${cmd_pid:-null}" = null ] && [ $PIDOF_CMD_EXISTS -eq 1 ]; then
                base_name=$(get_basename $service)
                cmd_pid=$(pidof $base_name)
            fi
        elif [ $ARCH = "alpine" ]; then
            cmd_pid=$(ps -ef | awk -v srv=${service} '{if($4 ~ srv || $3 ~ srv || $6 ~ srv) print $1}')
        else
            cmd_pid=$(ps -eo pid,cmd | awk -v srv=${service} '{if($2 == srv || $3 == srv) print $1}')
        fi
        if ! [ "${cmd_pid:-null}" = null ]; then
            log "unregister" "Unregistering $service (pid=$cmd_pid)"
            echo "Unregistering $service (pid=$cmd_pid)"
            if [ ${kill_flag} -eq 1 ]; then
                log "unregister" "Stopping registered service '$service' with pid=$cmd_pid"
                kill_processes_by_pid $cmd_pid
            fi
        fi
        grep -v -e "^$service_name$" ${FILESYSTEM_PATH}/${SRVS_FILE} >${FILESYSTEM_PATH}/${TMP_SRVS_FILE}
        mv ${FILESYSTEM_PATH}/${TMP_SRVS_FILE} ${FILESYSTEM_PATH}/${SRVS_FILE}

        grep -v -e "^$service_name$" ${FILESYSTEM_PATH}/${SRVS_TO_STOP_FILE} >${FILESYSTEM_PATH}/${TMP_SRVS_TO_STOP_FILE}
        mv ${FILESYSTEM_PATH}/${TMP_SRVS_TO_STOP_FILE} ${FILESYSTEM_PATH}/${SRVS_TO_STOP_FILE}
    fi
}

# Rotate a single log file ($1) when its size reported by `du` exceeds
# MAX_FILE_SIZE: the content is copied to "<file>.0", the file is emptied,
# existing "<file>.N.gz" archives are shifted to N+1 and the copy is gzipped.
rotate_service_file()
{
    log_file_name=$1

    touch $log_file_name
    file_size=$(du -a $log_file_name | tr -s '\t' ' ' | cut -d' ' -f1)
    if [ $file_size -gt $MAX_FILE_SIZE ]; then
        # log takes (function name, message)
        log "rotate_service_file" "Run log rotation on ${log_file_name}"
        cp $log_file_name "$log_file_name.0" && echo >$log_file_name
        for file_num in $(seq $((MAX_ROTATION - 1)) -1 0); do
            if [ -f "$log_file_name.$file_num.gz" ]; then
                mv "$log_file_name.$file_num.gz" "$log_file_name.$((file_num + 1)).gz"
            fi
        done
        gzip "$log_file_name.0"
    fi
}

# Rotate the .dbg/.log files of every registered and volatile service, the
# init-agent debug file (if present) and the watchdog's own log file.
rotate_service_log()
{
    if [ -f ${FILESYSTEM_PATH}/${SRVS_FILE} ]; then
        for service in $(cat ${FILESYSTEM_PATH}/${SRVS_FILE}); do
            base_name=$(get_basename $service)
            srv_debug_file=${LOG_FILE_PATH}/nano_agent/$base_name.dbg
            srv_log_file=${LOG_FILE_PATH}/nano_agent/$base_name.log
            rotate_service_file ${srv_debug_file}
            rotate_service_file ${srv_log_file}
        done
    fi

    if [ -f ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ]; then
        for service_line in $(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}); do
            service=$(echo $service_line | cut -f1 -d ';')
            family=$(echo $service_line | cut -f2 -d ';')
            instance_id=$(echo $service_line | cut -f3 -d ';')
            if [ -z "$family" ]; then
                debug_file_suffix=${instance_id}
            else
                debug_file_suffix=${family}_${instance_id}
            fi

            base_name=$(get_basename $service)
            srv_debug_file=${LOG_FILE_PATH}/nano_agent/$base_name.dbg${debug_file_suffix}
            srv_log_file=${LOG_FILE_PATH}/nano_agent/$base_name.log${debug_file_suffix}
            rotate_service_file ${srv_debug_file}
            rotate_service_file ${srv_log_file}
        done
    fi

    if [ -f ${LOG_FILE_PATH}/nano_agent/cp-nano-init-agent.dbg ]; then
        rotate_service_file ${LOG_FILE_PATH}/nano_agent/cp-nano-init-agent.dbg
    fi

    rotate_service_file ${LOG_FILE_PATH}/$LOG_FILE
}

# Delete file $1 when it was last modified more than $2 minutes ago.
remove_file_if_expired()
{
    file=$1
    file_ttl_minutes=$2
    if test "$(find $file -mmin +$file_ttl_minutes)" = "$file"; then
        rm $file
    fi
}

# Remove expired log files of service families that are no longer listed in
# the volatile services file, and buffered "cpz" logs whose id is 10 or more.
remove_old_service_logs()
{
    if [ -f ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ]; then
        for service_line in $(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}); do
            service=$(echo $service_line | cut -f1 -d ';')
            service=$(get_basename $service)
            family=$(echo $service_line | cut -f2 -d ';')
            if [ -z "$family" ]; then
                continue
            fi

            for service_log_file in $(ls -d ${LOG_FILE_PATH}/nano_agent/* | grep $service); do
                # second dot-separated field of the path, e.g. "dbg<family>_<id>"
                family_and_id=$(echo $service_log_file | cut -f2 -d '.')
                if [ -z $(echo $family_and_id | grep '_') ]; then
                    continue
                fi
                family=$(echo $family_and_id | cut -f1 -d '_')
                family=${family#dbg}
                family=${family#log}
                relevant_services=$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep "$service;$family")
                if [ -z "$relevant_services" ]; then
                    remove_file_if_expired $service_log_file $SERVICE_LOG_FILE_TTL_MINUTES
                    for buffered_log in $(ls -d ${LOG_FILE_PATH}/nano_agent/event_buffer/* | grep $family); do
                        remove_file_if_expired $buffered_log $SERVICE_LOG_FILE_TTL_MINUTES
                    done
                fi
            done
        done
    fi

    for buffered_log in $(ls -d ${LOG_FILE_PATH}/nano_agent/event_buffer/* | grep cpz); do
        log_id=$(echo $buffered_log | cut -f2 -d '.')
        if [ $log_id -ge 10 ]; then
            rm $buffered_log
        fi
    done
}

# Run remove_old_service_logs at most once a day; a flag file next to the
# volatile services file records the last run.
daily_log_files_cleanup()
{
    minutes_in_day=1440
    last_cleanup_flag_file=${FILESYSTEM_PATH}/${VOL_SRVS_FILE}.log_cleanup
    remove_file_if_expired ${last_cleanup_flag_file} $minutes_in_day
    if [ ! -f ${last_cleanup_flag_file} ]; then
        remove_old_service_logs
        touch ${last_cleanup_flag_file}
    fi
}

# Print "true" when a process of service $1 is found, "false" otherwise.
is_service_running()
{
    service=$1
    if [ $ARCH = "arm" ]; then
        cmd_pid=$(ps w | grep $service | head -n-1 | awk '{ print $1 }')
        if [ "${cmd_pid:-null}" = null ] && [ $PIDOF_CMD_EXISTS -eq 1 ]; then
            base_name=$(get_basename $service)
            cmd_pid=$(pidof $base_name)
        fi
    elif [ $ARCH = "alpine" ]; then
        cmd_pid=$(ps -ef | awk -v srv="$service$" '{if(($4 ~ srv || $3 ~ srv || $6 ~ srv) && ($4 != "awk" && $4 != "grep" )) print $1}')
    else
        cmd_pid=$(ps -eo pid,cmd | awk -v srv="$service$" '{if($2 ~ srv || $3 ~ srv) print $1}')
    fi

    if [ "${cmd_pid:-null}" = null ]; then
        echo false
    else
        echo true
    fi
}

# is_volatile_service_running arguments: (service name, instance ID, family name)
# Prints "true" when a matching process is found, "false" otherwise.
is_volatile_service_running()
{
    service=$1
    instance_id=$2
    family_name=$3

    family_arg=""
    if [ -n "${family_name}" ]; then
        family_arg="--family=${family_name}"
    fi

    if [ $ARCH = "arm" ]; then
        cmd_pid=$(ps | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${service} '{if($5==srv) print $1}')
    elif [ $ARCH = "alpine" ]; then
        cmd_pid=$(ps -ef | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${service} '{if($4 ~ srv || $3 ~ srv) print $1}')
    else
        cmd_pid=$(ps -eo pid,cmd | grep -- "${family_arg}" | grep -- "--id=$instance_id" | awk -v srv=${service} '{if($2 ~ srv || $3 ~ srv) print $1}')
    fi

    if [ "${cmd_pid:-null}" = null ]; then
        echo false
    else
        echo true
    fi
}

# Add one to the persistent counter of processes started by the watchdog.
increment_watchdog_process_restart_counter()
{
    if [ -f $WATCHDOG_PROCESS_RESTART_COUNTER ]; then
        counter=$(cat ${WATCHDOG_PROCESS_RESTART_COUNTER})
    else
        counter=0
    fi
    counter=$((counter+1))
    echo ${counter}> ${WATCHDOG_PROCESS_RESTART_COUNTER}
}

# Start every volatile service instance that is registered, not halted and
# not currently running.
load_volatile_services()
{
    if [ -f ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} ]; then
        for service_line in $(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE}); do
            service=$(echo $service_line | cut -f1 -d ';')
            family=$(echo $service_line | cut -f2 -d ';')
            instance_id=$(echo $service_line | cut -f3 -d ';')
            already_running="0"
            gaia_ld_path=

            if [ -n "$(cat ${FILESYSTEM_PATH}/$SRVS_HALTED | grep $service)" ]; then
                continue
            fi

            if [ -f ${service}.cfg ]; then
                . "${service}.cfg"
            fi

            if test "$(is_volatile_service_running $service $instance_id $family)" = "false"; then
                # A service listed in the startup file is started with
                # --service_startup=true once, then removed from that file.
                if [ -n "$(cat ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE} | grep $service)" ]; then
                    var_service_startup=true
                    grep -v -e "^$service$" ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE} >${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}.tmp
                    mv ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}.tmp ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}
                else
                    var_service_startup=false
                fi
                run_volatile_service $service_line $gaia_ld_path
                increment_watchdog_process_restart_counter
                echo "running" > $AGENT_RUN_STATUS_FILE
                already_running="1"
            fi
            if test "$already_running" = "0" && [ -f $AGENT_RUN_STATUS_FILE ]; then
                echo "already running" > $AGENT_RUN_STATUS_FILE
            fi
        done
    fi
}

# Start every registered service that is not halted, not pending stop and not
# currently running. For the orchestration service, restarts are counted while
# an upgrade status file exists and the revert script is invoked once the
# limit (MAX_ORCH_RESTARTS) is reached.
load_services()
{
    load_paths
    all_running=true
    is_startup_mode=false
    if [ -f ${FILESYSTEM_PATH}/watchdog/wd.startup ]; then
        rm -f ${FILESYSTEM_PATH}/watchdog/wd.startup
        echo "" >${FILESYSTEM_PATH}/$SRVS_HALTED
        is_startup_mode=true
    fi
    already_running="0"
    for service in $(cat ${FILESYSTEM_PATH}/${SRVS_FILE}); do
        if test "$is_startup_mode" = "false" && [ -n "$(cat ${FILESYSTEM_PATH}/$SRVS_HALTED | grep $service)" ]; then
            continue
        fi
        if [ -n "$(cat ${FILESYSTEM_PATH}/${SRVS_TO_STOP_FILE} | grep $service)" ]; then
            continue
        fi
        if [ -f ${service}.cfg ]; then
            . "${service}.cfg"
        fi

        if test "$(is_service_running $service)" = "false"; then
            all_running=false
            if [ ! -z "$IS_CONTAINER_ENV" ] && [ -f ${FILESYSTEM_PATH}/$SRVS_CONTAINER_FILE ]; then
                if grep -q "$service" ${FILESYSTEM_PATH}/$SRVS_CONTAINER_FILE; then
                    # Service names are paths, so "/" cannot be the regex
                    # delimiter; use "|" as the --start handler does.
                    sed -i "\|$service|d" ${FILESYSTEM_PATH}/$SRVS_CONTAINER_FILE
                    is_startup_mode=true
                fi
            fi
            # In container mode a service that stops after startup is fatal.
            if [ ! -z "$IS_CONTAINER_ENV" ] && test "$is_startup_mode" = "false"; then
                log "load_services" "Error: Nano service $service stopped running"
                echo "Error: Nano service $service stopped running"
                exit 1
            fi
            log "load_services" "Respawn ${service}"
            if [ -n "$(cat ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE} | grep $service)" ]; then
                var_service_startup=true
                grep -v -e "^$service$" ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE} >${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}.tmp
                mv ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}.tmp ${FILESYSTEM_PATH}/${STARTUP_SRVS_FILE}
            else
                var_service_startup=false
            fi

            crashes_revert=$(get_profile_agent_setting_with_default "allowCrashesRevert" "true")
            if [ "$crashes_revert" = "true" ] && [ "$(get_basename $service)" = "cp-nano-orchestration" ] && [ -f ${FILESYSTEM_PATH}/revert/upgrade_status ]; then
                update_orchestrations_counters
                total_orch_restarts=$(echo "$orch_counters" | awk '{print $1 + $2 + $3}')
                log "load_services" "orchestrator restart no. ${total_orch_restarts}"

                if [ "$total_orch_restarts" -ge "$MAX_ORCH_RESTARTS" ]; then
                    ${SCRIPT_FOLDER}/revert_orchestrator_version.sh ${LOG_FILE_PATH}/$LOG_FILE
                fi
            fi
            run_service $service $gaia_ld_path
            increment_watchdog_process_restart_counter
            echo "running" > $AGENT_RUN_STATUS_FILE
            already_running="1"
        fi
        if test "$already_running" = "0" && [ -f $AGENT_RUN_STATUS_FILE ]; then
            echo "already running" > $AGENT_RUN_STATUS_FILE
        fi
    done
    if test "$all_running" = "false"; then
        rm -f /tmp/wd.all_running
    else
        touch /tmp/wd.all_running
    fi
}

# Print the registration (and, with --verbose, running) status of a service.
# Arguments: --service <name> [--family <family>] [--id <instance id>] [--verbose]
# Exits with 1 when no service name is given.
get_service_status()
{
    service=''
    fid=''
    uid=''
    verbose=false
    registration_status="not-registered"
    running_status="not-running"
    while true; do
        if test "$1" = "--service"; then
            shift
            service=$1
        elif test "$1" = "--family"; then
            shift
            fid=$1
        elif test "$1" = "--id"; then
            shift
            uid=$1
        elif test "$1" = "--verbose"; then
            verbose=true
        elif [ -z "$1" ]; then
            break
        fi
        shift
    done
    if [ -z "$service" ]; then
        log "get_service_status" "Error: service name was not provided"
        echo "Error: service name was not provided"
        exit 1
    fi
    if [ -z "$uid" ]; then
        is_running=$(is_service_running ${service})
        if [ "$is_running" = "true" ]; then
            running_status="running"
        fi
        if ! [ -z "$(cat ${FILESYSTEM_PATH}/${SRVS_FILE} | grep ${service}$)" ]; then
            registration_status="registered"
        fi
        if [ "$verbose" = "true" ]; then
            echo "service '$service' is ${registration_status} and ${running_status}"
        else
            log "get_service_status" "service '$service' is ${registration_status}"
            echo "service '$service' is ${registration_status}"
        fi
    else
        if [ "$(is_volatile_service_running ${service} ${uid} ${fid})" = "true" ]; then
            running_status="running"
        fi
        family_size=$(cat ${FILESYSTEM_PATH}/${VOL_SRVS_FILE} | grep "${service};${fid};" | wc -l)
        if ! { [ -z "$family_size" ] || [ $family_size -lt $uid ]; }; then
            registration_status="registered"
        fi
        # handle multiple instances services
        if [ "$verbose" = "true" ]; then
            log "get_service_status" "service '$service' (Family '$fid', uid '$uid') is ${registration_status} and ${running_status}"
            echo "service '$service' (Family '$fid', uid '$uid') is ${registration_status} and ${running_status}"
        else
            log "get_service_status" "service '$service' (Family '$fid', uid '$uid') is ${registration_status}"
            echo "service '$service' (Family '$fid', uid '$uid') is ${registration_status}"
        fi
    fi
}

#read_config
load_paths

# Command-line dispatch: every recognised command exits here; with no
# recognised command the script falls through to the watchdog loop below.
if test "$1" = "--status" || test "$1" = "-s"; then
    shift
    get_service_status "${@}"
    exit 0
elif test "$1" = "--restart_count" || test "$1" = "-rc"; then
    if [ -f $WATCHDOG_PROCESS_RESTART_COUNTER ]; then
        counter=$(cat ${WATCHDOG_PROCESS_RESTART_COUNTER})
    else
        # The space before ">" matters: "0>" would be parsed as a redirection
        # of file descriptor 0 and leave the counter file empty.
        echo 0 > ${WATCHDOG_PROCESS_RESTART_COUNTER}
        counter=0
    fi
    echo ${counter}
    exit 0
elif test "$1" = "--register" || test "$1" = "-r"; then
    if test "$2" = "--upgrade"; then
        var_upgarde=true
        shift
    fi
    if test "$3" = "--family" || test "$3" = "-f"; then
        family_name=$4
        if test "$5" = "--count" || test "$5" = "-c"; then
            family_size=$6
        else
            log "main" "Registering a family requires size argument"
        fi
    elif test "$3" = "--count" || test "$3" = "-c"; then
        family_size=$4
    fi
    register $2 $family_size $family_name
    exit 0
elif test "$1" = "--un-register" || test "$1" = "-u"; then
    if test "$3" = "--family" || test "$3" = "-f"; then
        family_name=$4
        kill_arg="kill"
    else
        kill_arg=$3
        if test "$4" = "--family" || test "$4" = "-f"; then
            family_name=$5
            kill_arg="kill"
        fi
    fi
    unregister $2 $kill_arg $family_name
    exit 0
elif test "$1" = "--restart"; then
    trigger_restart_service $2
    exit 0
elif test "$1" = "--stop" || test "$1" = "-q"; then
    if test "$2" = "--persistent" || test "$2" = "-p"; then
        echo "$3" >>${FILESYSTEM_PATH}/$SRVS_HALTED
        unregister $3
        exit 0
    fi
    echo "$2" >>${FILESYSTEM_PATH}/$SRVS_HALTED
    kill_processes_by_pid $(${pidof_cmd} ${2})
    retry_counter=0
    while [ $retry_counter -lt 10 ]; do
        if [ -z "$(${pidof_cmd} ${2})" ]; then
            exit 0
        fi
        sleep 0.3
        retry_counter=$(($retry_counter + 1))
    done
    if [ -n "$(${pidof_cmd} ${2})" ]; then
        log "main" "Service $2 is in 'stopped' state but have not exited for 3 seconds"
        exit 1
    fi
    exit 0
# NOTE(review): "-r" is already consumed by the --register branch above, so
# only "--start" can reach this branch - confirm the intended short flag.
elif test "$1" = "--start" || test "$1" = "-r"; then
    if test "$2" = "--persistent" || test "$2" = "-p"; then
        register $3
        shift
    fi
    # exit 3: the service is not registered at all
    if [ -z "$(cat ${FILESYSTEM_PATH}/$SRVS_FILE | grep $2)" ] && [ -z "$(cat ${FILESYSTEM_PATH}/$VOL_SRVS_FILE | grep $2)" ]; then
        exit 3
    fi
    if [ -n "$(cat ${FILESYSTEM_PATH}/$SRVS_HALTED | grep $2)" ]; then
        sed -i "\|$2|d" ${FILESYSTEM_PATH}/$SRVS_HALTED
        exit $?
    fi
    # exit 2: the service is registered but was not halted
    exit 2
fi

if [ -z "$IS_CONTAINER_ENV" ]; then
    log "main" "Starting cp-nano-agent watchdog as service mode"
else
    log "main" "Starting cp-nano-agent watchdog as container mode"
fi

IS_SERVICE_STARTED=false
echo "" >${FILESYSTEM_PATH}/$SRVS_HALTED

# State of the orchestration restart window (see update_orchestrations_counters).
last_update=$(date +%s)
interval_duration=60
orch_counters="0 0 0"

iteration_count=0

# Watchdog loop: one pass every 5 seconds.
while true; do
    if [ -z "$IS_CONTAINER_ENV" ] && [ -f ${FILESYSTEM_PATH}/orchestration/restart_watchdog ]; then
        rm -f ${FILESYSTEM_PATH}/orchestration/restart_watchdog
        if [ $ARCH = "arm" ]; then
            cp_exec "$INIT_D_PATH/nano_agent.init restart"
        else
            service nano_agent restart
        fi
    fi

    # Restart requests are handled from the second pass onwards.
    $IS_SERVICE_STARTED && kill_services_if_needed
    IS_SERVICE_STARTED=true

    load_services
    load_volatile_services

    rotate_service_log
    daily_log_files_cleanup

    file_age_revert=$(get_profile_agent_setting_with_default "allowFileAgeRevert" "false")
    iteration_count=$((iteration_count + 1))
    # Every 10th pass: revert when the upgrade status file is older than
    # MAX_AGE_SECONDS.
    if [ $((iteration_count % 10)) -eq 0 ]; then
        if [ "$file_age_revert" = "true" ] && [ -f ${FILESYSTEM_PATH}/revert/upgrade_status ]; then
            file_mtime=$(stat -c %Y "${FILESYSTEM_PATH}/revert/upgrade_status")
            current_time=$(date +%s)
            file_age=$((current_time - file_mtime))
            if [ "$file_age" -gt "$MAX_AGE_SECONDS" ]; then
                log "monitor_upgrade_status_file_age" "The file has existed for more than $MAX_AGE_MINUTES minutes."
                ${SCRIPT_FOLDER}/revert_orchestrator_version.sh ${LOG_FILE_PATH}/$LOG_FILE
            fi
        fi
    fi

    sleep 5
done