Back to home page

Enduro/X

 
 

    


0001 #!/bin/bash
0002 ##
0003 ## @brief Same test as run-local.sh, except that here we test situations that might
0004 ##  happen when a suspend / failover / resume occurs in a cluster of virtual
0005 ##  machines.
0006 ##
0007 ## @file run-suspend.sh
0008 ##
0009 ## -----------------------------------------------------------------------------
0010 ## Enduro/X Middleware Platform for Distributed Transaction Processing
0011 ## Copyright (C) 2009-2016, ATR Baltic, Ltd. All Rights Reserved.
0012 ## Copyright (C) 2017-2023, Mavimax, Ltd. All Rights Reserved.
0013 ## This software is released under one of the following licenses:
0014 ## AGPL (with Java and Go exceptions) or Mavimax's license for commercial use.
0015 ## See LICENSE file for full text.
0016 ## -----------------------------------------------------------------------------
0017 ## AGPL license:
0018 ## 
0019 ## This program is free software; you can redistribute it and/or modify it under
0020 ## the terms of the GNU Affero General Public License, version 3 as published
0021 ## by the Free Software Foundation;
0022 ##
0023 ## This program is distributed in the hope that it will be useful, but WITHOUT ANY
0024 ## WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
0025 ## PARTICULAR PURPOSE. See the GNU Affero General Public License, version 3
0026 ## for more details.
0027 ##
0028 ## You should have received a copy of the GNU Affero General Public License along 
0029 ## with this program; if not, write to the Free Software Foundation, Inc., 
0030 ## 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0031 ##
0032 ## -----------------------------------------------------------------------------
0033 ## A commercial use license is available from Mavimax, Ltd
0034 ## contact@mavimax.com
0035 ## -----------------------------------------------------------------------------
0036 ##
0037 
export TESTNAME="test104_tmqfailover"

# The script may be launched either from inside the test folder or from its
# parent folder. The check is done with a quoted pattern match on $PWD, so a
# working directory containing whitespace cannot break the test expression
# and wrongly send us to the "started from parent folder" branch.
if [[ "$PWD" == *"$TESTNAME"* ]]; then
    # Already inside the test folder - nothing to do
    :
else
    # started from parent folder
    pushd .
    echo "Doing cd"
    cd "$TESTNAME"
fi
0050 
. ../testenv.sh

export TESTDIR="$NDRX_APPHOME/atmitest/$TESTNAME"
export PATH="$PATH:$TESTDIR"
export NDRX_ULOG="$TESTDIR"
# There is a 30 sec recovery period. During that time the last tmqueue might
# detect that some message is found on disk and it would restart; however
# tmsrv might get a timeout on tmqueue (currently messages are not kept in a
# dead Q). Thus the recovery time depends on the TOUT setting, so keep it low
# for now...
export NDRX_TOUT=10
export NDRX_SILENT=Y

################################################################################
# NDRX_SGREFRESH is exported once, with the value that was effective before
# (an earlier export of 10 was overwritten by 30 before any command ran).
#
# NOTE(review): the timings below were documented for a value of 6, while the
# exported value is 30 - confirm which one is intended:
#   6 gives:
#   lock expire if not refreshed in 6 seconds
#   lock take over by other node if file unlocked: 12 sec
#   exsinglesv periodic scans / locks 2 sec
export NDRX_SGREFRESH=30
################################################################################
0071 ################################################################################
0072 
# Choose the shared library extension: "dylib" on Darwin, "so" everywhere else
case "$(uname)" in
    Darwin)
        export NDRX_LIBEXT="dylib"
        ;;
    *)
        export NDRX_LIBEXT="so"
        ;;
esac
0078 
UNAME=$(uname)

#
# Get the crash lib: export the plugin name and append the test086_tmqlimit
# folder to the library search path variable used on this platform
#
if [[ "$UNAME" == "Darwin" ]]; then
    export NDRX_PLUGINS=libt86_lcf.dylib
    export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$TESTDIR/../test086_tmqlimit"
elif [[ "$UNAME" == "AIX" ]]; then
    export NDRX_PLUGINS=libt86_lcf.so
    export LIBPATH="$LIBPATH:$TESTDIR/../test086_tmqlimit"
else
    export NDRX_PLUGINS=libt86_lcf.so
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$TESTDIR/../test086_tmqlimit"
fi
0101 
0102 #
0103 # Common configuration between the domains
0104 #
export NDRX_CCONFIG1="$TESTDIR/app-common.ini"

# Slow disks on RPI (sd card..) + avoid wearing out...
# The os-release file is only inspected when it exists.
# NOTE(review): the Raspbian branch exports the very same file as the default
# above - confirm whether a dedicated "no fsync" configuration was intended.
if [[ -e /etc/os-release ]] && grep -q "Raspbian" /etc/os-release; then
    echo "Raspbian detected - no fsync"
    export NDRX_CCONFIG1="$TESTDIR/app-common.ini"
fi
0115 
0116 #
0117 # Domain 1 - here client will live
0118 #
set_dom1() {
    echo "Setting domain 1"
    # Load the shared domain 1 settings (path is relative to the current dir)
    . ../dom1.sh

    # Test specific files for domain 1, all located in $TESTDIR
    NDRX_CONFIG="${TESTDIR}/ndrxconfig-dom1-sus.xml"
    NDRX_DMNLOG="${TESTDIR}/ndrxd-dom1.log"
    NDRX_LOG="${TESTDIR}/ndrx-dom1.log"
    NDRX_CCONFIG="${TESTDIR}/app-dom1-sus.ini"
    export NDRX_CONFIG NDRX_DMNLOG NDRX_LOG NDRX_CCONFIG
    # Debug config override is intentionally left disabled:
    #export NDRX_DEBUG_CONF=$TESTDIR/debug-dom1.conf
}
0128 
0129 #
0130 # Domain 2 - here server will live
0131 #
set_dom2() {
    echo "Setting domain 2"
    # Load the shared domain 2 settings (path is relative to the current dir)
    . ../dom2.sh

    # Test specific files for domain 2, all located in $TESTDIR
    NDRX_CONFIG="${TESTDIR}/ndrxconfig-dom2-sus.xml"
    NDRX_DMNLOG="${TESTDIR}/ndrxd-dom2.log"
    NDRX_LOG="${TESTDIR}/ndrx-dom2.log"
    NDRX_CCONFIG="${TESTDIR}/app-dom2-sus.ini"
    export NDRX_CONFIG NDRX_DMNLOG NDRX_LOG NDRX_CCONFIG
    # Debug config override is intentionally left disabled:
    #export NDRX_DEBUG_CONF=$TESTDIR/debug-dom2.conf
}
0141 
0142 #
0143 # Generic exit function
0144 #
# Stop both domains, kill leftover test clients and terminate the script.
# Arguments: $1 - exit status to terminate with
go_out() {
    local rc=$1
    local dom_setter

    echo "Test exiting with: $rc"

    # Shut down domain 1 first, then domain 2
    for dom_setter in set_dom1 set_dom2; do
        "$dom_setter"
        xadmin stop -y
        xadmin down -y
    done

    # If some alive stuff left...
    xadmin killall atmiclt104

    # Undo the pushd done at start-up, if any (errors are silenced)
    popd 2>/dev/null
    exit $rc
}
0162 
0163 # clean up some old stuff
# Remove logs, lock files and ULOG files left by previous runs
# (errors, e.g. for patterns with no match, are silenced)
rm *.log lock_* ULOG* 2>/dev/null

# Re-create the folders where to store the data
for data_dir in RM1 RM2 qdata; do
    rm -rf "$data_dir" 2>/dev/null
    mkdir "$data_dir"
done

# Any bridges that are live must be killed!
xadmin killall tpbridge
0174 
# Boot domain 1; leave the test with status 1 if start-up fails
set_dom1
xadmin down -y
if ! xadmin start -y; then
    go_out 1
fi

# Boot domain 2; leave the test with status 2 if start-up fails
set_dom2
xadmin down -y
if ! xadmin start -y; then
    go_out 2
fi

echo "Sleep 15 for link"
sleep 15

# Print the domain 1 listings (psg, psc) after the wait
set_dom1
xadmin psg
xadmin psc
0189 
################################################################################
echo ">>> Loop enqueue + crash"
################################################################################
NUM=400

# Enqueue NUM messages one by one. On every 40th iteration (the very first one
# included) a lock loss is simulated on both domains, after which the domains
# are restored to normal operation.
for ((counter = 0; counter < NUM; counter++)); do
    echo "Loop [$counter]"

    # enqueue a single message identified by the loop counter
    ./atmiclt104 enq $counter
    RET=$?

    if (( RET != 0 )); then
        echo "./atmiclt104 enq $counter failed"
        go_out $RET
    fi

    # plock loss simulation
    if (( counter % 40 == 0 )); then

        echo "Node freeze test...."

        # A variant that detected the active domain and stopped exsingleckr
        # only there is disabled; both domains are processed unconditionally.
        set_dom1
        DOM_NUM=2

        # Inject the lock loss, stop exsingleckr and issue dsleep 15 per domain
        for dom_setter in set_dom1 set_dom2; do
            "$dom_setter"
            xadmin lcf lockloss -A5 -a
            xadmin lcf
            xadmin stop -s exsingleckr
            xadmin dsleep 15
        done

        echo "Let to failover tmsrvs... (sleep 15)"
        sleep 15

        # Clear the lock loss setting and start exsingleckr again per domain
        echo "Restore domains to normal...$(date)"
        for dom_setter in set_dom1 set_dom2; do
            "$dom_setter"
            xadmin lcf lockloss -A0 -a
            xadmin lcf
            xadmin start -s exsingleckr
        done
        # issued with the domain 2 environment still active
        xadmin killall exsinglesv

        # longer as more threads?
        sleep 30
        echo "Continue with normal...$(date)"
        # listings for domain 2 first, then for domain 1
        xadmin psc
        xadmin ppm
        set_dom1
        xadmin psc
        xadmin ppm

    fi
done
0284 
################################################################################
echo ">>> Validate $NUM messages"
################################################################################

# disable auto from Qs....
for queue_name in Q1 Q2; do
    xadmin mqch -n2 -i 200 "-q${queue_name},autoq=n"
done
sleep 5
xadmin mqlq
# if forwarder started, did timeout to tmsrv and rollback might timed-out too
# thus 5+5 + 2 for waiting for release
sleep 12
# let the Q complete...
xadmin mqlc
xadmin mqlq

# validate that all messages are in place by dequeuing NUM of them
./atmiclt104 deq $NUM
RET=$?

if (( RET != 0 )); then
    echo "./atmiclt104 deq $counter failed"
    go_out $RET
fi
0310 
################################################################################
echo ">>> Corrupted ping file -> all groups down"
################################################################################
# the 0 content would fail the CRC32 test thus all groups of all nodes shall go down...
dd if=/dev/zero of=$TESTDIR/lock_GRP2_2 bs=1 count=2048

# let the exsinglesv to detect the situation
set_dom1
xadmin stop -s exsingleckr
set_dom2
xadmin stop -s exsingleckr
sleep 5

# Domain 1: no tmqueue line with 'runok runok' may be left in the ppm output
set_dom1
# avoid exsinglesv feed from services:
xadmin ppm
CNT=$(xadmin ppm | grep tmqueue | grep -c 'runok runok')
if [[ "$CNT" -ne 0 ]]; then
    echo "Expected tmqueue down (0) on dom1, but got $CNT"
    go_out -1
fi

# Domain 2: the same expectation
set_dom2
xadmin ppm
CNT=$(xadmin ppm | grep tmqueue | grep -c 'runok runok')
if [[ "$CNT" -ne 0 ]]; then
    echo "Expected tmqueue down (0) on dom2, but got $CNT"
    go_out -1
fi
0340 
# Catch test errors: any TESTERROR marker in the log files overrides the
# exit status with -2
if grep -q TESTERROR *.log; then
    echo "Test error detected!"
    RET=-2
fi

# Shut everything down and exit with the final status
go_out $RET
0348 
0349 
0350 # vim: set ts=4 sw=4 et smartindent:
0351