Back to home page

Enduro/X

 
 

    


/**
 * @brief Sanity checking routines.
 *   This will do the following:
 *   - scan all queues for names matching:
 *   <prefix>.srv.reply.[binary].[pid] and perform the following checks:
 *   - Check whether the process exists in the system by binary name + pid;
 *     if not, then remove the process.
 *
 * @file sanity.c
 */
0010 /* -----------------------------------------------------------------------------
0011  * Enduro/X Middleware Platform for Distributed Transaction Processing
0012  * Copyright (C) 2009-2016, ATR Baltic, Ltd. All Rights Reserved.
0013  * Copyright (C) 2017-2023, Mavimax, Ltd. All Rights Reserved.
0014  * This software is released under one of the following licenses:
0015  * AGPL (with Java and Go exceptions) or Mavimax's license for commercial use.
0016  * See LICENSE file for full text.
0017  * -----------------------------------------------------------------------------
0018  * AGPL license:
0019  *
0020  * This program is free software; you can redistribute it and/or modify it under
0021  * the terms of the GNU Affero General Public License, version 3 as published
0022  * by the Free Software Foundation;
0023  *
0024  * This program is distributed in the hope that it will be useful, but WITHOUT ANY
0025  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
0026  * PARTICULAR PURPOSE. See the GNU Affero General Public License, version 3
0027  * for more details.
0028  *
0029  * You should have received a copy of the GNU Affero General Public License along 
0030  * with this program; if not, write to the Free Software Foundation, Inc.,
0031  * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0032  *
0033  * -----------------------------------------------------------------------------
0034  * A commercial use license is available from Mavimax, Ltd
0035  * contact@mavimax.com
0036  * -----------------------------------------------------------------------------
0037  */
0038 #include <string.h>
0039 #include <stdio.h>
0040 #include <stdlib.h>
0041 #include <errno.h>
0042 #include <memory.h>
0043 #include <sys/types.h>
0044 #include <dirent.h>
0045 #include <sys/stat.h>
0046 #include <utlist.h>
0047 
0048 #include <ndrstandard.h>
0049 #include <ndrxd.h>
0050 #include <atmi_int.h>
0051 #include <nstopwatch.h>
0052 
0053 #include <ndebug.h>
0054 #include <cmd_processor.h>
0055 #include <signal.h>
0056 #include <bridge_int.h>
0057 #include <atmi_shm.h>
0058 #include "userlog.h"
0059 #include "sys_unix.h"
0060 #include <lcfint.h>
0061 #include <singlegrp.h>
0062 
0063 /*---------------------------Externs------------------------------------*/
0064 /*---------------------------Macros-------------------------------------*/
0065 /*---------------------------Enums--------------------------------------*/
0066 /*---------------------------Typedefs-----------------------------------*/
0067 /*---------------------------Globals------------------------------------*/
expublic unsigned G_sanity_cycle = 0; /**< Sanity cycle counter, incremented on every do_sanity_check() call */
exprivate ndrx_stopwatch_t M_timer; /**< Stopwatch for next sanity scan */
exprivate int M_first = EXTRUE; /**< EXTRUE until the first-time sanity init (prefixes + timer) is done */
0071 /*---------------------------Statics------------------------------------*/
0072 /*---------------------------Prototypes---------------------------------*/
0073 
0074 exprivate int check_server(char *qname);
0075 exprivate int check_client(char *qname, int is_xadmin, unsigned sanity_cycle);
0076 exprivate int check_cnvclt(char *qname);
0077 exprivate int check_cnvsrv(char *qname);
0078 exprivate int check_long_startup(void);
0079 exprivate int check_dead_processes(void);
0080 exprivate void check_memlimits(void);
0081 exprivate int check_singlegrp(void);
0082 
0083 /**
0084  * Return sanity scan stopwatch
0085  * @return ptr to ndrx_stopwatch_t or NULL (in case if no init done)
0086  */
0087 expublic ndrx_stopwatch_t * ndrx_get_santiy_stopwatch(void)
0088 {
0089 
0090     if (!M_first)
0091     {
0092         return &M_timer;
0093     }
0094 
0095     return NULL;
0096 }
0097 
0098 /**
0099  * Master process for sanity checking.
0100  * @param[in] finalchk perform final checks? Remove dread resources...
0101  * @return SUCCEED/FAIL
0102  */
0103 expublic int do_sanity_check(int finalchk)
0104 {
0105     int ret=EXSUCCEED;
0106     static char    server_prefix[NDRX_MAX_Q_SIZE+1];
0107     static int     server_prefix_len;
0108     static char    client_prefix[NDRX_MAX_Q_SIZE+1];
0109     static int     client_prefix_len;
0110     static char    xadmin_prefix[NDRX_MAX_Q_SIZE+1];
0111     static int     xadmin_prefix_len;
0112     
0113     /* conversational prefixes */
0114     static char    cnvclt_prefix[NDRX_MAX_Q_SIZE+1]; /* initiator... */
0115     static int     cnvclt_prefix_len;
0116     
0117     static char    cnvsrv_prefix[NDRX_MAX_Q_SIZE+1];
0118     static int     cnvsrv_prefix_len;
0119     
0120     
0121     int wasrun = EXFALSE;
0122     
0123     string_list_t* qlist = NULL;
0124     string_list_t* elt = NULL;
0125 
0126     G_sanity_cycle++;
0127     
0128     /* No sanity checks while app config not loaded */
0129     if (NULL==G_app_config)
0130         goto out;
0131     
0132     if (M_first)
0133     {
0134         ndrx_stopwatch_reset(&M_timer);
0135         /* Initialize q prefixes, +1 for skipping initial / */
0136         snprintf(client_prefix, sizeof(client_prefix), NDRX_CLT_QREPLY_PFX, 
0137                 G_sys_config.qprefix);
0138         client_prefix_len=strlen(client_prefix);
0139         NDRX_LOG(log_debug, "client_prefix=[%s]/%d", client_prefix, 
0140                             client_prefix_len);
0141         
0142         snprintf(xadmin_prefix, sizeof(xadmin_prefix),
0143                 NDRX_NDRXCLT_PFX, G_sys_config.qprefix);
0144         xadmin_prefix_len=strlen(xadmin_prefix);
0145         NDRX_LOG(log_debug, "xadmin_prefix=[%s]/%d", xadmin_prefix, 
0146                             xadmin_prefix_len);
0147         
0148         snprintf(server_prefix, sizeof(server_prefix), NDRX_SVR_QREPLY_PFX, 
0149                 G_sys_config.qprefix);
0150         server_prefix_len=strlen(server_prefix);
0151         NDRX_LOG(log_debug, "server_prefix=[%s]/%d", server_prefix, 
0152                             server_prefix_len);
0153     
0154         snprintf(cnvclt_prefix, sizeof(cnvclt_prefix), NDRX_CONV_INITATOR_Q_PFX, 
0155                 G_sys_config.qprefix);
0156         
0157         cnvclt_prefix_len=strlen(cnvclt_prefix);
0158         NDRX_LOG(log_debug, "cnvclt_prefix=[%s]/%d", cnvclt_prefix, 
0159                             cnvclt_prefix_len);
0160     
0161         snprintf(cnvsrv_prefix, sizeof(cnvsrv_prefix), NDRX_CONV_SRV_Q_PFX, 
0162                 G_sys_config.qprefix);
0163         cnvsrv_prefix_len=strlen(cnvsrv_prefix);
0164         NDRX_LOG(log_debug, "cnvsrv_prefix=[%s]/%d", cnvsrv_prefix, 
0165                             cnvsrv_prefix_len);
0166     
0167         M_first=EXFALSE;
0168     }
0169      
0170     if (ndrx_stopwatch_get_delta_sec(&M_timer)>=G_app_config->sanity || finalchk)
0171     {
0172         wasrun = EXTRUE;
0173         NDRX_LOG(log_debug, "Time for sanity checking...");
0174          
0175         qlist = ndrx_sys_mqueue_list_make(G_sys_config.qpath, &ret);
0176 
0177         if (EXSUCCEED!=ret)
0178         {
0179             NDRX_LOG(log_error, "posix queue listing failed!");
0180             EXFAIL_OUT(ret);
0181         }
0182 
0183         LL_FOREACH(qlist,elt)
0184         {
0185             NDRX_LOG(6, "Checking... [%s]", elt->qname);
0186             
0187             if (0==strncmp(elt->qname, client_prefix, 
0188                     client_prefix_len))
0189             {
0190                 check_client(elt->qname, EXFALSE, G_sanity_cycle);
0191             }
0192             else if (0==strncmp(elt->qname, xadmin_prefix, 
0193                     xadmin_prefix_len)) 
0194             {
0195                 check_client(elt->qname, EXTRUE, G_sanity_cycle);
0196             } 
0197             /* TODO: We might want to monitor admin queues too! */
0198             else if (0==strncmp(elt->qname, server_prefix, 
0199                     server_prefix_len)) 
0200             {
0201                 check_server(elt->qname);
0202             } /*  Bug #112 */
0203             else if (0==strncmp(elt->qname, cnvclt_prefix, 
0204                     cnvclt_prefix_len)) 
0205             {
0206                 check_cnvclt(elt->qname);
0207             } /*  Bug #112 */
0208             else if (0==strncmp(elt->qname, cnvsrv_prefix, 
0209                     cnvsrv_prefix_len)) 
0210             {
0211                 check_cnvsrv(elt->qname);
0212             }
0213         }
0214 
0215         /* Will check programs with long startup they will get killed if, 
0216          * not started in time! */
0217         /* NOTE: THIS IS FIRST PROCESS WHICH INCREMENTS COUNTERS IN PM! */
0218         check_long_startup();
0219         /* Send bridge refresh (if required) */
0220         
0221         if (!finalchk)
0222         {
0223             check_memlimits();
0224         }
0225         
0226         if (!finalchk)
0227         {
0228             brd_send_periodrefresh();
0229         }
0230         
0231         /* Time for PM checking! */
0232         if (EXSUCCEED!=check_dead_processes())
0233         {
0234             ret=EXFAIL;
0235             goto out;
0236         }
0237         
0238         /* Perform any routing related checks */
0239         ndrx_ddr_apply_sanity();
0240         
0241         /* Respawn any dead processes */
0242         if (!finalchk)
0243         {
0244             do_respawn_check();
0245         }
0246 
0247         /* check singleton groups */
0248         check_singlegrp();
0249         
0250         /* update queue statistics (if enabled) */
0251         
0252         if (!finalchk)
0253         {
0254             if (G_app_config->gather_pq_stats)
0255             {
0256                 pq_run_santiy(EXTRUE);
0257             }
0258         }
0259         
0260 #ifdef EX_USE_SYSVQ
0261         if (EXSUCCEED!=do_sanity_check_sysv(finalchk))
0262         {
0263             NDRX_LOG(log_error, "System V sanity checks failed!");
0264             userlog("System V sanity checks failed!");
0265             EXFAIL_OUT(ret);
0266         }
0267 #endif
0268     }
0269     
0270 out:
0271 
0272     if (NULL!=qlist)
0273     {
0274         ndrx_string_list_free(qlist);
0275     }
0276 
0277     /* Reset timer on run */
0278     if (wasrun)
0279     {
0280         ndrx_stopwatch_reset(&M_timer);
0281     }
0282 
0283     return ret;
0284 }
0285 
0286 /**
0287  * 
0288  * @param qname
0289  * @param process
0290  * @param processsz buffer size of \p process
0291  * @param p_pid
0292  */
0293 exprivate void parse_q(char *qname, int is_server, char *process, int processsz, 
0294             pid_t *p_pid, int *server_id, int is_xadmin)
0295 {   
0296     char buf[NDRX_MAX_Q_SIZE+1];
0297     char *p;
0298     
0299     NDRX_STRCPY_SAFE(buf, qname);
0300     
0301     /* We are client, thus needs to skip the context */
0302     if (!is_server && !is_xadmin)
0303     {
0304         p = strrchr(buf, NDRX_FMT_SEP);
0305         *p=EXEOS;
0306     }
0307     
0308     /* get over with pid */
0309     p = strrchr(buf, NDRX_FMT_SEP);
0310     *p_pid = atoi(p+1);
0311     *p=EXEOS;
0312     
0313     if (is_server)
0314     {
0315         /* Return server id, if we are server! */
0316         p=strrchr(buf, NDRX_FMT_SEP);
0317         *server_id = atoi(p+1);
0318         *p=EXEOS;
0319     }
0320     
0321     /* Fix up with process name */
0322     p=strrchr(buf, NDRX_FMT_SEP);
0323             
0324     NDRX_STRCPY_SAFE_DST(process, p+1, processsz);
0325     
0326     NDRX_LOG(6, "got process: pid: %d name: [%s]", 
0327                         *p_pid, process);
0328 }
0329 /**
0330  * Remove dead process queue from system!
0331  * @param qname
0332  * @return 
0333  */
0334 exprivate int unlink_dead_queue(char *qname)
0335 {
0336     int ret=EXSUCCEED;
0337     char    q_str[NDRX_MAX_Q_SIZE+1];
0338     char    *p;
0339     
0340     if ('/'!=qname[0])
0341     {
0342         NDRX_STRCPY_SAFE(q_str, "/");
0343         NDRX_STRCAT_S(q_str, sizeof(q_str), qname);
0344         p = q_str;
0345     }
0346     else
0347     {
0348         p = qname;
0349     }
0350     
0351     NDRX_LOG(log_warn, "Unlinking queue [%s]", p);
0352     if (EXSUCCEED!=ndrx_mq_unlink(p))
0353     {
0354         int err = errno;
0355         
0356         if (ENOENT!=err)
0357         {
0358             NDRX_LOG(log_error, "Failed to unlink dead queue [%s]: %s", 
0359                     p, strerror(err));
0360             /* Feature #237 */
0361             userlog("Failed to unlink dead queue [%s]: %s", 
0362                     p, strerror(err));
0363             ret=EXFAIL;
0364         }
0365         else
0366         {
0367             NDRX_LOG(log_debug, "Queue already does not exists [%s]: %s", 
0368                     p, strerror(err));
0369         }
0370     }
0371     
0372     return ret;
0373 }
0374 
0375 /**
0376  * Remove server queues
0377  * @param process
0378  * @param pid
0379  * @param srv_id
0380  * @param rplyq if null, then reply queue will be built from scratch.
0381  * @return 
0382  */
0383 expublic int remove_server_queues(char *process, pid_t pid, int srv_id, char *rplyq)
0384 {
0385     char    q_str[NDRX_MAX_Q_SIZE+1];
0386     int     rplyq_unlink = EXFALSE;
0387     char    *p;
0388 
0389     if (NULL==rplyq)
0390     {
0391         snprintf(q_str, sizeof(q_str), NDRX_SVR_QREPLY, 
0392                 G_sys_config.qprefix, process, srv_id, pid);
0393         
0394         p = q_str;
0395         if (!ndrx_q_exists(q_str)) 
0396         {
0397             NDRX_LOG(log_info, "Seems like reply queue [%s] does not"
0398                     " exists - nothing to do: %s", q_str, strerror(errno));
0399         }
0400         else
0401         {
0402             rplyq_unlink=EXTRUE;
0403         }
0404     }
0405     else
0406     {
0407         p = rplyq;
0408         rplyq_unlink=EXTRUE;
0409     }
0410 
0411     if (rplyq_unlink)
0412     {
0413         unlink_dead_queue(p);
0414     }
0415 
0416     snprintf(q_str, sizeof(q_str), NDRX_ADMIN_FMT, G_sys_config.qprefix, 
0417             process, srv_id, pid);
0418     /* Note - admin_q_str already contains / in front! */
0419     /*If exists admin queue, but process does not exists, then remove admin q too! */
0420 
0421     if (!ndrx_q_exists(q_str))
0422     {
0423         NDRX_LOG(log_info, "Seems like admin queue [%s] does not"
0424                 " exists - nothing to do: %s", q_str, strerror(errno));
0425     }
0426     else
0427     {
0428         unlink_dead_queue(q_str);
0429     }
0430     
0431     return EXSUCCEED;
0432 }
0433 /**
0434  * Check running servers...
0435  * If self Q is full, then is not deadly situation, as it will be handled by next check.
0436  * Also if dead q is unlinked. The process will be removed by check PM.
0437  * @return 
0438  */
0439 exprivate int check_server(char *qname)
0440 {
0441     char    process[NDRX_MAX_Q_SIZE+1];
0442     pid_t pid;
0443     int     srv_id;
0444     char *buf = NULL;
0445     size_t buf_len;
0446     srv_status_t *status = (srv_status_t *)buf;
0447     int ret=EXSUCCEED;
0448     
0449     NDRX_SYSBUF_MALLOC_OUT(buf, buf_len, ret);
0450     status = (srv_status_t *)buf;
0451     memset((char *)status, 0, sizeof(srv_status_t));
0452     
0453     parse_q(qname, EXTRUE, process, sizeof(process), &pid, &srv_id, EXFALSE);
0454     
0455     if (!ndrx_sys_is_process_running(pid, process))
0456     {      
0457         /* And finally we send to our selves notification that pid is dead
0458          * so that system takes care of it's removal! */
0459         remove_server_queues(process, pid, srv_id, qname);
0460     
0461         status->srvinfo.pid = pid;
0462         status->srvinfo.state = NDRXD_PM_DIED;
0463         status->srvinfo.srvid = srv_id;
0464         NDRX_LOG(log_debug, "Sending self notification "
0465                             "about dead process...");
0466         if (EXSUCCEED!=self_notify(status, EXFALSE))
0467         {
0468             NDRX_LOG(log_warn, "Failed to send self notification "
0469                     "- exit dead process check for a while!");
0470             ret=EXFAIL;
0471             goto out;
0472         }
0473         
0474         /* Remove any conv queues... */
0475     
0476     }
0477 out:
0478     if (NULL!=buf)
0479     {
0480         NDRX_SYSBUF_FREE(buf);
0481     }
0482     return ret;
0483 }
0484 
0485 /**
0486  * Check running clients...
0487  * ----------------
0488  * Needs optimization for threads.
0489  * We could use scandir instead of readdir.
0490  * Sort the output, and cache client checks, so that we do not test process for
0491  * existance for number of threads used...
0492  * See http://stackoverflow.com/questions/5102863/how-to-sort-files-in-some-directory-by-the-names-on-linux
0493  * ---------------- => DONE
0494  * @return 
0495  */
0496 exprivate int check_client(char *qname, int is_xadmin, unsigned sanity_cycle)
0497 {
0498     char process[NDRX_MAX_Q_SIZE+1];
0499     pid_t pid;
0500     /* Used for cache, so that we do not check multi threaded process
0501      * multiple times... */
0502     static unsigned prev_sanity_cycle;
0503     static int first = EXTRUE;
0504     static char prev_process[NDRX_MAX_Q_SIZE+1];
0505     static pid_t prev_pid;
0506     
0507     /*
0508      * EXTRUE - running OK
0509      * EXFALSE - exit OK
0510      * EXFAIL - failed 
0511      */
0512     static int prev_stat=EXTRUE;
0513     
0514     if (first)
0515     {
0516         prev_sanity_cycle = sanity_cycle-1;
0517         first=EXFALSE;
0518     }
0519     
0520     parse_q(qname, EXFALSE, process, sizeof(process), &pid, 0, is_xadmin);
0521     
0522     if (sanity_cycle == prev_sanity_cycle &&
0523             0==strcmp(process, prev_process) &&
0524             pid == prev_pid)
0525     {
0526         NDRX_LOG(6, "Multi-threaded process [%s]/%d already checked "
0527                         "at this sanity check", process, pid);
0528         
0529         if (EXTRUE > prev_stat && ndrx_q_exists(qname))
0530         {
0531             /* if prev was died - already logged to ulog */
0532             if (EXFAIL==prev_stat)
0533             {
0534                 NDRX_LOG(log_warn, "Previous same process (different "
0535                         "thread was unlink) - unlink this q [%s] too ", qname);
0536             }
0537             else
0538             {
0539                 /* main thread was term OK, but have some left overs.. thus warn */
0540                 NDRX_LOG(log_error, "Client process [%s], pid %d (other "
0541                         "thread unclean shutdown q: [%s])", 
0542                         process, pid, qname);
0543                 userlog("Client process [%s], pid %d (other "
0544                         "thread unclean shutdown q: [%s])", 
0545                         process, pid, qname);
0546                 
0547                 /* mark as failed, next thread do not return ulogs.. */
0548                 prev_stat=EXFAIL;
0549             }
0550             unlink_dead_queue(qname);
0551         }
0552         goto out;
0553     }
0554     
0555     /* Fill the prev stuff */
0556     prev_pid = pid;
0557     NDRX_STRCPY_SAFE(prev_process, process);
0558     prev_sanity_cycle = sanity_cycle;
0559     
0560     if (!ndrx_sys_is_process_running(pid, process))
0561     {
0562         /* check the queue... once again... */
0563         if (ndrx_q_exists(qname))
0564         {
0565             NDRX_LOG(log_error, "Client process [%s], pid %d died", process, pid);
0566             userlog("Client process [%s], pid %d died", process, pid);
0567             unlink_dead_queue(qname);
0568             prev_stat = EXFAIL;
0569         }
0570         else
0571         {
0572             NDRX_LOG(log_debug, "Client process [%s], pid %d terminated normally", 
0573                     process, pid);
0574             prev_stat = EXFALSE;
0575 #ifdef EX_USE_EMQ
0576             /* unlink anyway.... (on MacOS might be broken Q) */
0577             unlink_dead_queue(qname);
0578 #endif
0579         }
0580         /* Remove any conv queues... */
0581     }
0582     else
0583     {
0584         prev_stat = EXTRUE;
0585     }
0586     
0587 out:
0588     return EXSUCCEED;
0589 }
0590 
/* TODO: We might want to check queues against shared memory, but the shared
 * memory might be badly initialized. Anyway, if shared memory exists, then the
 * caller of the queue will firstly check the shared memory and only after that
 * it will call the server.
 */
0595 
0596 /**
0597  * Kill the process - #76 we will kill the reported PID (real server PID)
0598  * @param p_pm
0599  * @return 
0600  */
0601 exprivate int send_kill(pm_node_t *p_pm, int sig, int delta)
0602 {
0603     NDRX_LOG(log_error, "Killing PID: %d (ppid: %d)/%s/%d with signal -%d", 
0604             p_pm->svpid, p_pm->pid, p_pm->binary_name, p_pm->srvid, sig);
0605     userlog("Killing PID: %d (ppid: %d)/%s/%d with signal -%d", 
0606             p_pm->svpid, p_pm->pid, p_pm->binary_name, p_pm->srvid, sig);
0607     if (EXSUCCEED!=kill(p_pm->svpid, sig))
0608     {
0609         NDRX_LOG(log_error, "Failed to kill PID %d (ppid: %d) with error: %s",
0610                 p_pm->svpid, p_pm->pid, strerror(errno));
0611     }
0612     
0613     return EXSUCCEED;
0614 }
0615 
0616 /**
0617  * This function will deal with programs which perofrms long starup!
0618  * I.e. if they do not start in time, they will be killed!
0619  * @return 
0620  */
0621 exprivate int check_long_startup(void)
0622 {
0623     int ret=EXSUCCEED;
0624     pm_node_t *p_pm;
0625     int delta;
0626     int cksum_reload_sent = EXFALSE; /* for now single binary only at one cycle */
0627 
0628     int nrgrps = ndrx_G_libnstd_cfg.pgmax;
0629     int sg_groups[nrgrps];
0630 
0631     ndrx_sg_get_lock_snapshoot(sg_groups, &nrgrps, 0);
0632     
0633     DL_FOREACH(G_process_model, p_pm)
0634     {
0635         /* PM Counter increment! */
0636         p_pm->rspstwatch++;
0637         /* Increment ping stopwatch (if was issued) */
0638         if (SANITY_CNT_IDLE!=p_pm->pingstwatch)
0639         {
0640             p_pm->pingstwatch++;
0641         }
0642         
0643         if (p_pm->conf->pingtime)
0644         {
0645             p_pm->pingtimer++;
0646         }
0647         p_pm->last_sig++;
0648         p_pm->state_changed++;
0649         
0650         /* send ping to server, if require */
0651         if (!p_pm->killreq && NDRXD_PM_RUNNING_OK==p_pm->state &&
0652                 p_pm->conf->pingtime && p_pm->pingtimer > p_pm->conf->pingtime)
0653         {
0654             /* Reset ping timer */
0655             p_pm->pingtimer = SANITY_CNT_START;
0656             
0657             /* start to watch the ping response time: */
0658             if (SANITY_CNT_IDLE==p_pm->pingstwatch)
0659             {
0660                 p_pm->pingstwatch = SANITY_CNT_START;
0661             }
0662             
0663             /* Send ping command to server */
0664             srv_send_ping (p_pm);
0665         }
0666         
0667         /* If still starting */
0668         if (p_pm->autokill)
0669         {
0670             NDRX_LOG(6, "proc: %s/%d ping stopwatch: %ld, rsp: %ld sty ping timer: %ld sty", 
0671                     p_pm->binary_name, p_pm->srvid,
0672                     p_pm->pingstwatch,
0673                     p_pm->rspstwatch,
0674                     p_pm->pingtimer);
0675 
0676             /* if running state & lost the lock -> SIGKILL */
0677             if (p_pm->conf->procgrp_no > 0
0678                 && PM_RUNNING(p_pm->state) 
0679                 && ndrx_ndrxconf_procgroups_is_singleton(G_app_config->procgroups, 
0680                     p_pm->conf->procgrp_no)
0681                 && !sg_groups[p_pm->conf->procgrp_no-1])
0682             {
0683                 NDRX_LOG(log_error, "proc: %s/%d procgrp no %d lost the lock -> SIGKILL", 
0684                     p_pm->binary_name, p_pm->srvid, p_pm->conf->procgrp_no);
0685                 userlog("proc: %s/%d procgrp no %d lost the lock -> SIGKILL", 
0686                     p_pm->binary_name, p_pm->srvid, p_pm->conf->procgrp_no);
0687                 p_pm->last_sig = SANITY_CNT_START;
0688 
0689                 /* Kill immediately */
0690                 send_kill(p_pm, SIGKILL, 0);
0691             }
0692 
0693             if (!p_pm->killreq)
0694             {
0695                 if (NDRXD_PM_STARTING==p_pm->state &&
0696                     (delta=p_pm->rspstwatch) > p_pm->conf->start_max)
0697                 {
0698                     NDRX_LOG(log_error, "Startup too long - requesting "
0699                             "kill pid=%d/bin=%s/srvid=%d",
0700                              p_pm->pid, p_pm->binary_name, p_pm->srvid);
0701                     /* Support #276 */
0702                     userlog("Startup too long - requesting "
0703                             "kill pid=%d/bin=%s/srvid=%d",
0704                              p_pm->pid, p_pm->binary_name, p_pm->srvid);
0705                     p_pm->killreq=EXTRUE;
0706                 }
0707                 else if (NDRXD_PM_RUNNING_OK==p_pm->state && p_pm->conf->pingtime &&
0708                     (delta=p_pm->pingstwatch) > p_pm->conf->ping_max)
0709                 {
0710                     NDRX_LOG(log_error, "Ping response not in time - "
0711                                         "requesting kill (ping_time=%d delta=%d "
0712                                         "ping_max=%d) pid=%d/%s/srvid=%d",
0713                     p_pm->conf->pingtime, delta, p_pm->conf->ping_max,
0714                                         p_pm->pid, p_pm->binary_name, p_pm->srvid);
0715                     /* Support #276 */
0716                     userlog("Ping response not in time - "
0717                                         "requesting kill (ping_time=%d delta=%d "
0718                                         "ping_max=%d) pid=%d/bin=%s/srvid=%d",
0719                     p_pm->conf->pingtime, delta, p_pm->conf->ping_max,
0720                                         p_pm->pid, p_pm->binary_name, p_pm->srvid);
0721                     p_pm->killreq=EXTRUE;
0722                 }
0723                 else if (NDRXD_PM_STOPPING==p_pm->state &&
0724                     (delta = p_pm->rspstwatch) > p_pm->conf->end_max)
0725                 {
0726                     NDRX_LOG(log_error, "Server did not exit in time "
0727                                                             "- requesting kill");
0728                     p_pm->killreq=EXTRUE;
0729                 }
0730             }
0731             
0732             if (p_pm->killreq)
0733             {
0734                 if (0==p_pm->num_term_sigs)
0735                 {
0736                     /* Send signal INT => -2 */
0737                     send_kill(p_pm, p_pm->conf->killseq[0], 0);
0738                     /* Reset the signal time counter */
0739                     p_pm->last_sig = SANITY_CNT_START;
0740                     p_pm->num_term_sigs++;
0741                 }
0742                 else if (p_pm->num_term_sigs > 0 
0743                         && p_pm->last_sig > p_pm->conf->killtime)
0744                 {
0745                     if (1==p_pm->num_term_sigs)
0746                     {
0747                         /* Send signal TERM => -15 */
0748                         send_kill(p_pm, p_pm->conf->killseq[1], 0);
0749                     }
0750                     else
0751                     {
0752                         /* Send signal KILL => -9 */
0753                         send_kill(p_pm, p_pm->conf->killseq[2], 0);
0754                     }
0755                     
0756                     /* Send proper signal  */
0757                     if (p_pm->num_term_sigs<2)
0758                         p_pm->num_term_sigs++;
0759                     p_pm->last_sig = SANITY_CNT_START;
0760                 }
0761             }
0762         } /* If process still starting! */
0763         
0764         /* check the restart if needed by checksum */
0765         /* TODO: We need some hash list here so we caulcate checsums only one binary
0766          * not the all instances. And only issue one update per checksum change.
0767          */
0768         if (p_pm->conf->reloadonchange && EXEOS!=p_pm->binary_path[0] 
0769                 && !cksum_reload_sent
0770                 && ndrx_file_exists(p_pm->binary_path))
0771         {
0772             if (roc_check_binary(p_pm->binary_path, G_sanity_cycle))
0773             {
0774                 NDRX_LOG(log_warn, "Cksums differ reload...");
0775                 /* Send reload command */
0776                 if (EXSUCCEED!=self_sreload(p_pm))
0777                 {
0778                     NDRX_LOG(log_warn, "Failed to send self notification "
0779                             "about changed process - ignore!");
0780                 }
0781                 cksum_reload_sent=EXTRUE;
0782             }
0783         }
0784         
0785     }/* DL_FOREACH */
0786     
0787 out:
0788     return ret;
0789 }
0790 
0791 /**
0792  * Check any dead processes, which we think are running ok, but actually 
0793  * the PID does not exists!
0794  * If self q is full, it will be removed by next try.
0795  * @return 
0796  */
0797 exprivate int check_dead_processes(void)
0798 {
0799     int ret=EXSUCCEED;
0800     pm_node_t *p_pm;
0801     char *buf = NULL;
0802     size_t buf_len;
0803     srv_status_t *status;
0804     
0805     NDRX_SYSBUF_MALLOC_OUT(buf, buf_len, ret);
0806     status = (srv_status_t *)buf;
0807     
0808     DL_FOREACH(G_process_model, p_pm)
0809     {
0810         /* If still starting */
0811         if (p_pm->state>=NDRXD_PM_MIN_RUNNING &&
0812             p_pm->state<=NDRXD_PM_MAX_RUNNING &&
0813                 p_pm->state_changed > G_app_config->checkpm)
0814         {
0815             if (!ndrx_sys_is_process_running(p_pm->svpid, p_pm->binary_name_real))
0816             {
0817                 NDRX_LOG(log_warn, "Pid %d/%s/%s in state %d is actually dead",
0818                         p_pm->pid, p_pm->binary_name, p_pm->binary_name_real, 
0819                         p_pm->state);
0820                 /*Send self notification*/
0821                 
0822                 memset(buf, 0, sizeof(srv_status_t));
0823                 
0824                 status->srvinfo.pid = p_pm->pid;
0825                 status->srvinfo.state = NDRXD_PM_DIED;
0826                 status->srvinfo.srvid = p_pm->srvid;
0827                 
0828                 NDRX_LOG(log_debug, "Sending self notification "
0829                                     "about dead process...");
0830                 
0831                 if (EXSUCCEED!=self_notify(status, EXFALSE))
0832                 {
0833                     NDRX_LOG(log_warn, "Failed to send self notification "
0834                             "- exit dead process check for a while!");
0835                     ret=EXFAIL;
0836                     goto out;
0837                 }
0838             }
0839         } /* If process still starting! */
0840     }/* DL_FOREACH */
0841     
0842 out:
0843     
0844     if (NULL!=buf)
0845     {
0846         NDRX_SYSBUF_FREE(buf);
0847     }
0848 
0849     return ret;   
0850 }
0851 
0852 /**
0853  * Check process memory limits
0854  */
0855 exprivate void check_memlimits(void)
0856 {
0857     pm_node_t *p_pm;
0858     ndrx_proc_info_t inf;
0859     
0860     DL_FOREACH(G_process_model, p_pm)
0861     {
0862         /* If still starting */
0863         if (p_pm->state>=NDRXD_PM_MIN_RUNNING &&
0864             p_pm->state<=NDRXD_PM_MAX_RUNNING &&
0865                 (EXFAIL!=p_pm->conf->rssmax || EXFAIL!=p_pm->conf->vszmax))
0866         {
0867             /* perform tests..., read current settings */
0868             if (EXSUCCEED==ndrx_proc_get_infos(p_pm->pid, &inf))
0869             {
0870                 int reached = EXFALSE;
0871                 char memtype[4];
0872                 long lim_val;
0873                 long lim_max;
0874                 
0875                 if (p_pm->conf->rssmax!=EXFAIL &&
0876                         inf.rss * NDRX_STOR_KBYTE > p_pm->conf->rssmax)
0877                 {
0878                     reached = EXTRUE;
0879                     lim_val = inf.rss * NDRX_STOR_KBYTE;
0880                     lim_max = p_pm->conf->rssmax;
0881                     NDRX_STRCPY_SAFE(memtype, "RSS");
0882                 }
0883                 else if (p_pm->conf->vszmax!=EXFAIL &&
0884                         inf.vsz * NDRX_STOR_KBYTE > p_pm->conf->vszmax)
0885                 {
0886                     reached = EXTRUE;
0887                     lim_val = inf.vsz * NDRX_STOR_KBYTE;
0888                     lim_max = p_pm->conf->vszmax;
0889                     NDRX_STRCPY_SAFE(memtype, "VSZ");
0890                 }
0891                 
0892                 if (reached)
0893                 {
0894                     char limitbuf[256];
0895                     char valuebuf[256];
0896                     
0897                     ndrx_storage_encode(lim_max, limitbuf, sizeof(limitbuf));
0898                     ndrx_storage_encode(lim_val, valuebuf, sizeof(valuebuf));
0899                     
0900                     NDRX_LOG(log_error, "Server pid = %d, srvid = %d, name [%s] "
0901                             "%s memory limit reached: "
0902                             "configured max: %s in system found: %s - restarting...",
0903                             (int)p_pm->pid, p_pm->srvid, p_pm->binary_name,
0904                             memtype, limitbuf, valuebuf);
0905                     
0906                     userlog("Server pid = %d, srvid = %d, name [%s] "
0907                             "%s memory limit reached: "
0908                             "configured max: %s in system found: %s - restarting...",
0909                             (int)p_pm->pid, p_pm->srvid, p_pm->binary_name,
0910                             memtype, limitbuf, valuebuf);
0911                     
0912                     if (EXSUCCEED!=self_sreload(p_pm))
0913                     {
0914                         NDRX_LOG(log_warn, "Failed to send self notification "
0915                                 "about changed process - ignore!");
0916                     }
0917                 }
0918             }
0919             else
0920             {
0921                 /* ignore error as not critical for system running
0922                  * the process might be just exited 
0923                  */
0924                 NDRX_LOG(log_warn, "Server pid = %d, srvid = %d, name [%s]: "
0925                             "failed to read memory usage - ignore",
0926                             (int)p_pm->pid, p_pm->srvid, p_pm->binary_name);
0927             }
0928             
0929         } /* If process still starting! */
0930     }/* DL_FOREACH */
0931     
0932 }
0933 
0934 /**
0935  * This checks the shared memory of services with linked lists for each proces in PM
0936  * @return 
0937  */
0938 exprivate int check_svc_shm(void)
0939 {
0940     return EXSUCCEED;
0941 }
0942 
0943 /*
0944 
 The picture with conversational queues can be the following:
0946 Msg queued Q name
0947 ---------- ---------------------------------------------------------------------
0948 0          /dom2,sys,bg,ndrxd
0949 0          /dom2,cnv,s,srv,atmisv35,19,32175,0,2,1,srv,atmisv35,10,32157,0,2
0950 0          /dom2,cnv,s,srv,atmisv35,18,32173,0,2,1,srv,atmisv35,19,32175,0,2
0951 0          /dom2,cnv,s,srv,atmisv35,17,32171,0,2,1,srv,atmisv35,18,32173,0,2
0952 0          /dom2,cnv,s,srv,atmisv35,16,32169,0,2,1,srv,atmisv35,17,32171,0,2
0953 0          /dom2,cnv,s,srv,atmisv35,15,32167,0,2,1,srv,atmisv35,16,32169,0,2
0954 0          /dom2,cnv,s,srv,atmisv35,14,32165,0,2,1,srv,atmisv35,15,32167,0,2
0955 0          /dom2,cnv,s,srv,atmisv35,13,32163,0,2,1,srv,atmisv35,14,32165,0,2
0956 0          /dom2,cnv,s,srv,atmisv35,12,32161,0,2,1,srv,atmisv35,13,32163,0,2
0957 0          /dom2,cnv,s,srv,atmisv35,11,32159,0,2,1,srv,atmisv35,12,32161,0,2
0958 0          /dom2,cnv,s,clt,atmiclt35,32218,2,1,1,srv,atmisv35,11,32159,0,2
0959 1          /dom2,cnv,c,srv,atmisv35,19,32175,0,2,1
0960 1          /dom2,cnv,c,srv,atmisv35,18,32173,0,2,1
0961 1          /dom2,cnv,c,srv,atmisv35,17,32171,0,2,1
0962 1          /dom2,cnv,c,srv,atmisv35,16,32169,0,2,1
0963 1          /dom2,cnv,c,srv,atmisv35,15,32167,0,2,1
0964 1          /dom2,cnv,c,srv,atmisv35,14,32165,0,2,1
0965 1          /dom2,cnv,c,srv,atmisv35,13,32163,0,2,1
0966 1          /dom2,cnv,c,srv,atmisv35,12,32161,0,2,1
0967 1          /dom2,cnv,c,srv,atmisv35,11,32159,0,2,1
0968 0          /dom2,cnv,c,srv,atmisv35,10,32157,0,2,1
0969 0          /dom1,sys,bg,xadmin,32241
0970 0          /dom1,sys,bg,ndrxd
0971 0          /dom1,cnv,s,clt,atmiclt35,32218,3,1,1,srv,atmisv35,12,32137,0,1
0972 0          /dom1,cnv,s,clt,atmiclt35,32218,1,1,1,srv,atmisv35,11,32135,0,1
0973 0          /dom1,cnv,c,clt,atmiclt35,32218,5,1,1
0974 0          /dom1,cnv,c,clt,atmiclt35,32218,4,1,1
0975 0          /dom1,cnv,c,clt,atmiclt35,32218,3,1,1
0976 0          /dom1,cnv,c,clt,atmiclt35,32218,2,1,1
0977 0          /dom1,cnv,c,clt,atmiclt35,32218,1,1,1
0978 Test exiting with: 
0979  
0980  */
0981 
0982 /**
0983  * Check the conversational initiator. We will kill the queue if any of the processes
0984  * in our cluster node are dead.
0985  * @return 
0986  */
0987 exprivate int check_cnvclt(char *qname)
0988 {
0989     int ret = EXSUCCEED;
0990     TPMYID myid;
0991     
0992     if (EXSUCCEED==ndrx_cvnq_parse_client(qname, &myid))
0993     {
0994         if (EXFALSE==ndrx_myid_is_alive(&myid))
0995         {
0996             ndrx_myid_dump(log_debug, &myid, "process is dead, remove the queue");
0997             
0998             unlink_dead_queue(qname);
0999         }
1000     }
1001     
1002 out:
1003     return ret;
1004 }
1005 
1006 /**
1007  * Check conversation server accepted Q
1008  * @param qname queue name
1009  * @return SUCCEED
1010  */
1011 exprivate int check_cnvsrv(char *qname)
1012 {
1013     int ret = EXSUCCEED;
1014     TPMYID myid1, myid2;
1015     
1016     /* check start with srv, or ctl, then detect the length of the halve
1017      * and parse other part.
1018      * We are interested in other part, if it is dead, then kill the Q.
1019      */
1020     if (EXSUCCEED==ndrx_cvnq_parse_server(qname, &myid1, &myid2))
1021     {
1022         if (EXFALSE==ndrx_myid_is_alive(&myid2))
1023         {
1024             ndrx_myid_dump(log_debug, &myid2, "process is dead, remove the queue");
1025             unlink_dead_queue(qname);
1026         }
1027     }
1028    
1029 out:
1030     return ret;
1031 }
1032 
1033 /**
1034  * Perform final sanity checks - ndrxd is exiting
1035  * @return EXSUCCEED/EXFAIL
1036  */
1037 expublic int ndrxd_sanity_finally(void)
1038 {
1039     int ret = EXSUCCEED;
1040     
1041     /*
1042 #ifdef EX_USE_SYSVQ
1043     
1044     if (EXSUCCEED!=ndrxd_sysv_finally())
1045     {
1046         ret = EXFAIL;
1047     }
1048     
1049 #endif
1050      */
1051     
1052     ret = do_sanity_check(EXTRUE);
1053     
1054 out:
1055     return ret;
1056 }
1057 
/**
 * Compare two integers, callback for qsort().
 * The result is derived from comparisons instead of subtraction: "a - b"
 * overflows (undefined behaviour) when the operands are far apart,
 * e.g. INT_MAX and a negative value, and may yield the wrong sign.
 * @param a pointer to the first int
 * @param b pointer to the second int
 * @return -1 if *a < *b, 0 if values are equal, 1 if *a > *b
 */
exprivate int cmp_int(const void *a, const void *b)
{
    int lhs = *(const int *)a;
    int rhs = *(const int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
1065 
1066 /**
1067  * Perform sanity checks for singleton groups shared memory
1068  * loop over all groups, check if group is locked, that processes
1069  * are running. If not running, the group shall be unlocked.
1070  * ensure that actul lock provider (from p_pm) matches the group
1071  * if does not matches, group shall be unlocked.
1072  */
1073 exprivate int check_singlegrp(void)
1074 {
1075     int i;
1076     int ret = EXSUCCEED;
1077     pm_node_t *p_pm_srvid;
1078     ndrx_sg_shm_t *p_shm, local;
1079     int grp2srvid[ndrx_G_libnstd_cfg.pgmax];
1080 
1081     NDRX_LOG(log_debug, "Into check_singlegrp()");
1082     memset(grp2srvid, 0, ndrx_G_libnstd_cfg.pgmax*sizeof(int));
1083 
1084     for (i=0; i<ndrx_G_libnstd_cfg.pgmax; i++)
1085     {
1086         p_shm = ndrx_sg_get(i+1);
1087 
1088         if (NULL==p_shm)
1089         {
1090             NDRX_LOG(log_error, "Null shared memory for singleton "
1091                 "groups (grpno: %d)", i);
1092             EXFAIL_OUT(ret);
1093         }
1094 
1095         /* verify the servers... */
1096         ndrx_sg_load(&local, p_shm);
1097 
1098         /* add given server id for duplicate checks */
1099         grp2srvid[i] = local.lockprov_srvid;
1100 
1101         /* check if the group if group is locked,
1102          * continue with next if not
1103          */
1104         if (EXTRUE!=ndrx_sg_is_locked_int(i+1, p_shm, NULL, 0))
1105         {
1106             continue;
1107         }
1108 
1109         if (!(local.lockprov_srvid>=0 && local.lockprov_srvid < ndrx_get_G_atmi_env()->max_servers))
1110         {
1111             NDRX_LOG(log_error, "Invalid server id %hd for singleton process group %d -> unlocking",
1112                     local.lockprov_srvid, i+1);
1113             userlog("Invalid server id %hd for singleton process group %d -> unlocking",
1114                     local.lockprov_srvid, i+1);
1115             ndrx_sg_unlock(p_shm, NDRX_SG_RSN_CORRUPT);
1116             continue;
1117         }
1118 
1119         p_pm_srvid = G_process_model_hash[local.lockprov_srvid];
1120 
1121         /* check the state of the server... */
1122         if (!PM_RUNNING(p_pm_srvid->state))
1123         {
1124             NDRX_LOG(log_error, "Server %d/%s/%d is not running -> "
1125                 "unlocking singleton process group %d",
1126                 p_pm_srvid->pid, p_pm_srvid->binary_name, p_pm_srvid->srvid, i+1);
1127             userlog("Server %d/%s/%d is not running -> "
1128                 "unlocking singleton process group %d",
1129                 p_pm_srvid->pid, p_pm_srvid->binary_name, p_pm_srvid->srvid, i+1);
1130             ndrx_sg_unlock(p_shm, NDRX_SG_RSN_NOPID);
1131             continue;
1132         }
1133 
1134         /* check the PIDs, it must match real server pid
1135          * in case also check real pid, as if not yet reported
1136          * then svpid might be not set.
1137          * Or other case might be that ndrxd have respawned the exsinglesv
1138          * which is not yet locked.
1139          */
1140         if (p_pm_srvid->svpid!=local.lockprov_pid
1141                 &&  p_pm_srvid->pid!=local.lockprov_pid)
1142         {
1143             NDRX_LOG(log_error, "Server %d/%d/%s/%d pid mistmatch with group's lockprov_pid %d -> "
1144                 "unlocking singleton process group %d",
1145                 (int)p_pm_srvid->pid, (int)p_pm_srvid->svpid, p_pm_srvid->binary_name,
1146                 p_pm_srvid->srvid, (int)local.lockprov_pid, i+1);
1147 
1148             userlog("Server %d/%d/%s/%d pid mistmatch with group's lockprov_pid %d -> "
1149                 "unlocking singleton process group %d",
1150                 (int)p_pm_srvid->pid, (int)p_pm_srvid->svpid, p_pm_srvid->binary_name,
1151                 p_pm_srvid->srvid, (int)local.lockprov_pid, i+1);
1152             ndrx_sg_unlock(p_shm, NDRX_SG_RSN_NOPID);
1153             continue;
1154         }
1155     }
1156 
1157     /* sort grp2srvid and check are there any duplicates  */
1158     qsort(grp2srvid, ndrx_G_libnstd_cfg.pgmax, sizeof(int), cmp_int);
1159 
1160     for (i=0; i<ndrx_G_libnstd_cfg.pgmax-1; i++)
1161     {
1162         if (grp2srvid[i]>0 && grp2srvid[i]==grp2srvid[i+1])
1163         {
1164             NDRX_LOG(log_error, "Duplicate server id %d for "
1165                     "singleton process group %d -> unlocking",
1166                     grp2srvid[i], i+1);
1167             userlog("Duplicate server id %d for "
1168                     "singleton process group %d -> unlocking",
1169                     grp2srvid[i], i+1);
1170             ndrx_sg_unlock(p_shm, NDRX_SG_RSN_CORRUPT);
1171         }
1172     }
1173 
1174 out:
1175     return ret;
1176 }
1177 
1178 /* vim: set ts=4 sw=4 et smartindent: */