From fc240bdff60aae7133a532c7752c6253ce8f65ca Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen
Date: Mon, 4 Aug 2025 16:53:09 +0200
Subject: [PATCH 1/2] db2: add "skip_basic_sql_health_check" parameter to avoid failing on systems with high load

---
 heartbeat/db2 | 63 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/heartbeat/db2 b/heartbeat/db2
index 1cd66f15a..da6c9d5f1 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -40,10 +40,12 @@
 # Parameter defaults
 
 OCF_RESKEY_instance_default=""
+OCF_RESKEY_skip_basic_sql_health_check_default="false"
 OCF_RESKEY_admin_default=""
 OCF_RESKEY_dbpartitionnum_default="0"
 
 : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
+: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
 : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
 : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
 
@@ -102,6 +104,15 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
 List of databases to be managed
 
 
+
+
+Skip basic health check SQL query.
+
+Only set to "true" to avoid issues during high load.
+
+Skip basic health check SQL query
+
+
 
 
 DEPRECATED: The admin user of the instance.
@@ -695,31 +706,33 @@ db2_monitor() {
         # set master preference accordingly
         case "$hadr" in
             PRIMARY/*|Primary/*|Standard/*)
-            # perform a basic health check
-            CMD="if db2 connect to $db;
-            then
-                db2 select \* from sysibm.sysversions ; rc=\$?;
-                db2 terminate;
-            else
-                rc=\$?;
-            fi;
-            exit \$rc"
-
-            if ! output=$(runasdb2 $CMD)
-            then
-                case "$output" in
-                    SQL1776N*)
-                    # can't connect/select on standby, may be spurious turing takeover
-                    ;;
-
-                    *)
-                    ocf_log err "DB2 database $instance($db2node)/$db is not working"
-                    ocf_log err "DB2 message: $output"
-
-                    # dead primary, remove master score
-                    master_score -D -l reboot
-                    return $OCF_ERR_GENERIC
-                esac
+            if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then
+                # perform a basic health check
+                CMD="if db2 connect to $db;
+                then
+                    db2 select \* from sysibm.sysversions ; rc=\$?;
+                    db2 terminate;
+                else
+                    rc=\$?;
+                fi;
+                exit \$rc"
+
+                if ! output=$(runasdb2 $CMD)
+                then
+                    case "$output" in
+                        SQL1776N*)
+                        # can't connect/select on standby, may be spurious during takeover
+                        ;;
+
+                        *)
+                        ocf_log err "DB2 database $instance($db2node)/$db is not working"
+                        ocf_log err "DB2 message: $output"
+
+                        # dead primary, remove master score
+                        master_score -D -l reboot
+                        return $OCF_ERR_GENERIC
+                    esac
+                fi
             fi
 
             ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"

From ded016f84d3fb77dc0542e3f4226774526910d97 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen
Date: Thu, 7 Aug 2025 13:55:11 +0200
Subject: [PATCH 2/2] db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors" parameters to be able to avoid failing on first try

---
 heartbeat/db2 | 95 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 9 deletions(-)

diff --git a/heartbeat/db2 b/heartbeat/db2
index da6c9d5f1..fe1d9b892 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -41,11 +41,17 @@
 
 OCF_RESKEY_instance_default=""
 OCF_RESKEY_skip_basic_sql_health_check_default="false"
+OCF_RESKEY_monitor_retries_default="1"
+OCF_RESKEY_monitor_sleep_default="1"
+OCF_RESKEY_monitor_retry_all_errors_default="false"
 OCF_RESKEY_admin_default=""
 OCF_RESKEY_dbpartitionnum_default="0"
 
 : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
 : ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
+: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}}
+: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}}
+: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}}
 : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
 : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
 
@@ -108,11 +114,33 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
 
 Skip basic health check SQL query.
 
-Only set to "true" to avoid issues during high load.
+Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters aren't
+enough to avoid issues under high load.
 
 Skip basic health check SQL query
 
 
+
+
+Monitor retries before failing.
+
+Monitor retries
+
+
+
+
+Monitor sleep between tries.
+
+Monitor sleep
+
+
+
+
+Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions.
+
+Retry monitor for all errors
+
+
 
 
 DEPRECATED: The admin user of the instance.
@@ -666,6 +694,13 @@ db2_hadr_status() {
     local output
+    local rc
 
+    # Capture db2pd's exit status right away: any command run in between
+    # (such as ocf_log) would overwrite $?.
+    # NOTE(review): the race-condition messages matched by awk below are
+    # only inspected when db2pd exits 0 - confirm db2pd's exit status for them.
     output=$(runasdb2 db2pd -hadr -db $db)
-    if [ $? != 0 ]
+    rc=$?
+    ocf_log debug "db2_hadr_status: $output"
+    if [ $rc != 0 ]
     then
         echo "Down/Off"
@@ -676,7 +711,41 @@ db2_hadr_status() {
     awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
         /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
         /^HADR is not active/ {print "Standard/Standalone"; exit; }
-        /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }'
+        /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }
+        /^Option -hadr requires -db or -alldbs option and active database./ { exit 255 }
+        /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 }
+        /^Changing data structure forced command termination./ { exit 255 }'
+}
+
+#
+# db2_monitor_retry: run db2_monitor up to monitor_retries + 1 times,
+# sleeping monitor_sleep between tries. rc 255 (db2pd race condition) is
+# always retried; other errors only if monitor_retry_all_errors is true.
+# A final rc of 255 is reported as OCF_ERR_GENERIC.
+#
+db2_monitor_retry() {
+    local tries=$(($OCF_RESKEY_monitor_retries + 1))
+    local try rc
+
+    for try in $(seq $tries); do
+        ocf_log debug "monitor try $try of $tries"
+        db2_monitor
+        rc=$?
+        [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc."
+        if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then
+            break
+        fi
+        [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep
+    done
+
+    [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC
+
+    if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then
+        # instance is dead remove master score
+        master_score -D -l reboot
+    fi
+
+    return $rc
 }
 
 #
@@ -690,9 +759,7 @@ db2_monitor() {
     db2_instance_status
     rc=$?
     if [ $rc -ne $OCF_SUCCESS ]; then
-        # instance is dead remove master score
-        master_score -D -l reboot
-        exit $rc
+        return $rc
     fi
 
     [ $db2node = 0 ] || return 0
@@ -700,8 +767,18 @@ db2_monitor() {
 
     for db in $dblist
     do
-        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
+        hadr=$(db2_hadr_status $db)
+        rc=$?
         ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
+        if [ "$rc" -eq 255 ]; then
+            if [ "$__OCF_ACTION" = "monitor" ]; then
+                return $rc
+            else
+                return $OCF_ERR_GENERIC
+            fi
+        elif [ "$rc" -ne 0 ]; then
+            return $OCF_ERR_GENERIC
+        fi
 
         # set master preference accordingly
         case "$hadr" in
@@ -915,9 +992,9 @@ case "$__OCF_ACTION" in
     exit $?
     ;;
 
-    monitor)	
+    monitor)
     db2_validate
-    db2_monitor
+    db2_monitor_retry
     exit $?
     ;;
 