Back to home page

Enduro/X

 
 

    


0001 /**
0002  * @brief ndrxd monitor & recover process.
0003  *
0004  * @file tprecover.c
0005  */
0006 /* -----------------------------------------------------------------------------
0007  * Enduro/X Middleware Platform for Distributed Transaction Processing
0008  * Copyright (C) 2009-2016, ATR Baltic, Ltd. All Rights Reserved.
0009  * Copyright (C) 2017-2023, Mavimax, Ltd. All Rights Reserved.
0010  * This software is released under one of the following licenses:
0011  * AGPL (with Java and Go exceptions) or Mavimax's license for commercial use.
0012  * See LICENSE file for full text.
0013  * -----------------------------------------------------------------------------
0014  * AGPL license:
0015  *
0016  * This program is free software; you can redistribute it and/or modify it under
0017  * the terms of the GNU Affero General Public License, version 3 as published
0018  * by the Free Software Foundation;
0019  *
0020  * This program is distributed in the hope that it will be useful, but WITHOUT ANY
0021  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
0022  * PARTICULAR PURPOSE. See the GNU Affero General Public License, version 3
0023  * for more details.
0024  *
0025  * You should have received a copy of the GNU Affero General Public License along 
0026  * with this program; if not, write to the Free Software Foundation, Inc.,
0027  * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0028  *
0029  * -----------------------------------------------------------------------------
0030  * A commercial use license is available from Mavimax, Ltd
0031  * contact@mavimax.com
0032  * -----------------------------------------------------------------------------
0033  */
0034 #include <stdio.h>
0035 #include <stdlib.h>
0036 #include <string.h>
0037 #include <errno.h>
0038 #include <regex.h>
0039 #include <utlist.h>
0040 #include <unistd.h>
0041 #include <signal.h>
0042 #include <sys/wait.h>
0043 
0044 #include <ndebug.h>
0045 #include <atmi.h>
0046 #include <atmi_int.h>
0047 #include <typed_buf.h>
0048 #include <ndrstandard.h>
0049 #include <ubf.h>
0050 #include <Exfields.h>
0051 
0052 #include "tprecover.h"
0053 /*---------------------------Externs------------------------------------*/
0054 /*---------------------------Macros-------------------------------------*/
0055 /*---------------------------Enums--------------------------------------*/
0056 /*---------------------------Typedefs-----------------------------------*/
0057 /*---------------------------Globals------------------------------------*/
0058 /*---------------------------Statics------------------------------------*/
/** Count of ndrxd respawns done by this server process */
static long M_restarts = 0;

/** ndrxd check period in seconds, defaulted to 5 sec */
static long M_check = 5;

/**
 * Ping timeout, i.e. seconds to wait for the ping answer.
 * WARNING ! The process stays blocked for this time, thus the backpings
 * from ndrxd may stall too. Check the ndrxconfig.xml *ping_max* setting,
 * as ndrxd might kill the tprecover meanwhile.
 * Usually ndrxd shall be fast to respond, so a ping timeout of 20 should
 * be fine.
 */
static int M_ping_tout = 20;

/** Max ping attempts without success, once exceeded ndrxd gets killed */
static int M_ping_max = 3;

/** Bad pings counter, reset when ndrxd is respawned */
static int M_bad_pings = 0;
/*---------------------------Prototypes---------------------------------*/
int start_daemon_recover(void);
0075 
/**
 * Reap the terminated child processes (the dead ndrxd instances), so that
 * they are not left as zombies.
 * The call never blocks: it stops as soon as waitpid() reports
 * that nothing more is available for collecting.
 */
void handle_sigchld(void)
{
    pid_t reaped;

    do
    {
        /* any child, exit status is not needed, do not wait */
        reaped = waitpid((pid_t)(-1), NULL, WNOHANG);
    } while (reaped > 0);
}
0083 
0084 /**
0085  * Monitor ndrxd & recover it if needed.
0086  * 1. could monitor process existance
0087  * 2. could do periodical pings to ndrxd for status
0088  * @param p_svc
0089  */
0090 void TPRECOVER (TPSVCINFO *p_svc)
0091 {
0092     int ret=EXSUCCEED;
0093     UBFH *p_ub = (UBFH *)p_svc->data;
0094     
0095     NDRX_LOG(log_debug, "TPRECOVER got call");
0096     Bfprint(p_ub, stderr);
0097 
0098     Bchg(p_ub, EXDM_RESTARTS, 1, (char *)&M_restarts, 0L);
0099 
0100     
0101 out:
0102     tpreturn(  ret==EXSUCCEED?TPSUCCESS:TPFAIL,
0103                 0,
0104                 (char *)p_ub,
0105                 0L,
0106                 0L);
0107 }
0108 
0109 /**
0110  * Periodic poll callback.
0111  * We might want to run pings here to ndrxd. If ping fails several times
0112  * we kill the ndrxd and restart it...!
0113  * @return 
0114  */
0115 expublic int poll_timer(void)
0116 {
0117     int ret=EXSUCCEED;
0118     int seq;
0119     long tim;
0120     int ndrxd_stat;
0121     
0122     /* remove zomies */
0123     handle_sigchld();
0124     
0125     ndrxd_stat = ndrx_chk_ndrxd();
0126     
0127     if (M_bad_pings > M_ping_max && !ndrxd_stat)
0128     {
0129         NDRX_LOG(log_always, "WARNING ! bad_pings=%d ping_max=%d and "
0130                 "ndrxd not running: respawn", 
0131                 M_bad_pings, M_ping_max);
0132         
0133         if (EXSUCCEED!=start_daemon_recover())
0134         {
0135            EXFAIL_OUT(ret);
0136         }
0137         
0138         M_bad_pings = 0;
0139     }
0140     else if (!ndrxd_stat)
0141     {
0142         M_bad_pings++;
0143         NDRX_LOG(log_always, "ndrxd not present (or resources issue for process listing...)"
0144                     "increase bad_pings=%d ping_max=%d", 
0145                     M_bad_pings, M_ping_max);
0146     }
0147     else
0148     {
0149         /* perform ping of ndrxd... */
0150         
0151         if (EXSUCCEED!=ndrx_ndrxd_ping(&seq, &tim, ndrx_get_G_atmi_conf()->reply_q, 
0152                 ndrx_get_G_atmi_conf()->reply_q_str))
0153         {
0154             M_bad_pings++;
0155             
0156             NDRX_LOG(log_info, "ndrxd_ping_seq=%d bad_pings=%d: timeout or system error", 
0157                     seq, M_bad_pings);
0158         }
0159         else
0160         {
0161             NDRX_LOG(log_error, "ndrxd_ping_seq=%d time=%ld ms", seq, tim);
0162         }
0163         
0164         if (M_bad_pings > M_ping_max)
0165         {
0166             /* get ndrxd pid... */
0167             pid_t ndrxd_pid = ndrx_ndrxd_pid_get();
0168             NDRX_LOG(log_always, "WARNING ! bad_pings=%d ping_max=%d -> kill %d %d", 
0169                     M_bad_pings, M_ping_max, SIGKILL, (int)ndrxd_pid);
0170             if (EXSUCCEED!=kill(ndrxd_pid, SIGKILL))
0171             {
0172                 NDRX_LOG(log_error, "Failed to kill %d: %s", 
0173                         (int)ndrxd_pid, strerror(errno));
0174             }
0175         }
0176     }
0177     
0178 out:
0179     return ret;
0180 }
0181 
0182 /**
0183  * Start ndrxd daemon in recovery mode.
0184  */
0185 int start_daemon_recover(void)
0186 {
0187     int ret=EXSUCCEED;
0188     pid_t pid;
0189     char    key[NDRX_MAX_KEY_SIZE+3+1];
0190     /* Log filename for ndrxd */
0191     char *ndrxd_logfile = getenv(CONF_NDRX_DMNLOG);
0192     /* clone our self */
0193     pid = ndrx_fork();
0194     
0195     if( pid == 0)
0196     {
0197         FILE *f;
0198         char *cmd[] = { "ndrxd", key, "-r", (char *)0 };
0199 
0200         /* this is child - start EnduroX back-end */
0201         snprintf(key, sizeof(key), NDRX_KEY_FMT, G_atmi_env.rnd_key);
0202 
0203         /* Open log file */
0204         if (NULL==(f=NDRX_FOPEN(ndrxd_logfile, "a")))
0205         {
0206             fprintf(stderr, "Failed to open ndrxd log file: %s\n",
0207                     ndrxd_logfile);
0208         }
0209         else
0210         {
0211             /* Redirect stdout, stderr to log file */
0212             close(1);
0213             close(2);
0214             if (EXFAIL==dup(fileno(f)))
0215             {
0216                 userlog("%s: Failed to dup(1): %s", __func__, strerror(errno));
0217             }
0218 
0219             if (EXFAIL==dup(fileno(f)))
0220             {
0221                 userlog("%s: Failed to dup(2): %s", __func__, strerror(errno));
0222             }
0223         }
0224 
0225         if (EXSUCCEED != execvp ("ndrxd", cmd))
0226         {
0227             fprintf(stderr, "Failed to start server - ndrxd!\n");
0228             exit(1);
0229         }
0230     }
0231     else
0232     {
0233         M_restarts++;
0234         NDRX_LOG(log_error, "Started ndrxd PID %d", pid);
0235     }
0236 out:
0237     return ret;
0238 }
0239 
0240 /*
0241  * Do initialization
0242  */
0243 int NDRX_INTEGRA(tpsvrinit)(int argc, char **argv)
0244 {
0245     int ret=EXSUCCEED;
0246     int c;
0247     extern char *optarg;
0248     sigset_t blockMask;
0249     
0250     NDRX_LOG(log_debug, "tpsvrinit called");
0251     /* Parse command line  */
0252     while((c = getopt(argc, argv, "c:t:m:")) != -1)
0253     {
0254         NDRX_LOG(log_debug, "%c = [%s]", c, optarg);
0255         switch(c)
0256         {
0257             case 'c':
0258                 M_check = atoi(optarg);
0259                 NDRX_LOG(log_debug, "check (-c): %d", 
0260                         M_check);
0261                 break;
0262             case 't':
0263                 M_ping_tout = atoi(optarg);
0264                 break;
0265             case 'm':
0266                 M_ping_max = atoi(optarg);
0267                 break;
0268             default:
0269                 NDRX_LOG(log_error, "Unknown param %c - 0x%x", c, c);
0270         EXFAIL_OUT(ret);
0271                 break;
0272         }
0273     }
0274 
0275     sigemptyset(&blockMask);
0276     sigaddset(&blockMask, SIGCHLD);
0277     
0278     if (sigprocmask(SIG_BLOCK, &blockMask, NULL) == -1)
0279     {
0280         NDRX_LOG(log_always, "%s: sigprocmask failed: %s",
0281                 __func__, strerror(errno));
0282     }
0283     
0284     /* Register timer check.... */
0285     NDRX_LOG(log_warn, "Config: ndrxd check time: %d sec", M_check);
0286     NDRX_LOG(log_warn, "Config: ndrxd ping timeout: %d sec", M_ping_tout);
0287     NDRX_LOG(log_warn, "Config: max pings for kill ndrxd: %d", M_ping_max);
0288     
0289     if (EXSUCCEED!=tpext_addperiodcb((int)M_check, poll_timer))
0290     {
0291         NDRX_LOG(log_error, "tpext_addperiodcb failed: %s",
0292             tpstrerror(tperrno));
0293         EXFAIL_OUT(ret);
0294     }
0295 
0296     if (EXSUCCEED!=tpadvertise(NDRX_SYS_SVC_PFX TPRECOVERSVC, TPRECOVER))
0297     {
0298         NDRX_LOG(log_error, "Failed to initialize TPRECOVER!");
0299         EXFAIL_OUT(ret);
0300     }
0301     
0302     if (EXSUCCEED!=tptoutset(M_ping_tout))
0303     {
0304         NDRX_LOG(log_error, "Failed to initialize TPRECOVER!");
0305         EXFAIL_OUT(ret);
0306     }
0307 
0308 out:
0309     return ret;
0310 }
0311 
0312 void NDRX_INTEGRA(tpsvrdone)(void)
0313 {
0314     /* just for build... */
0315 }
0316 /* vim: set ts=4 sw=4 et smartindent: */