0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038 #include <string.h>
0039 #include <stdio.h>
0040 #include <stdlib.h>
0041 #include <errno.h>
0042 #include <memory.h>
0043 #include <sys/types.h>
0044 #include <dirent.h>
0045 #include <sys/stat.h>
0046 #include <utlist.h>
0047
0048 #include <ndrstandard.h>
0049 #include <ndrxd.h>
0050 #include <atmi_int.h>
0051 #include <nstopwatch.h>
0052
0053 #include <ndebug.h>
0054 #include <cmd_processor.h>
0055 #include <signal.h>
0056 #include <bridge_int.h>
0057 #include <atmi_shm.h>
0058 #include "userlog.h"
0059 #include "sys_unix.h"
0060 #include <lcfint.h>
0061 #include <singlegrp.h>
0062
0063
0064
0065
0066
0067
/** Sanity cycle counter; incremented on every do_sanity_check() call */
expublic unsigned G_sanity_cycle = 0;
/** Stopwatch restarted after each completed sanity run */
exprivate ndrx_stopwatch_t M_timer;
/** EXTRUE until do_sanity_check() has performed its one-time setup */
exprivate int M_first = EXTRUE;



/* Forward declarations of the individual checks implemented in this file */
exprivate int check_server(char *qname);
exprivate int check_client(char *qname, int is_xadmin, unsigned sanity_cycle);
exprivate int check_cnvclt(char *qname);
exprivate int check_cnvsrv(char *qname);
exprivate int check_long_startup(void);
exprivate int check_dead_processes(void);
exprivate void check_memlimits(void);
exprivate int check_singlegrp(void);
0082
0083
0084
0085
0086
0087 expublic ndrx_stopwatch_t * ndrx_get_santiy_stopwatch(void)
0088 {
0089
0090 if (!M_first)
0091 {
0092 return &M_timer;
0093 }
0094
0095 return NULL;
0096 }
0097
0098
0099
0100
0101
0102
/**
 * Periodic sanity check entry point.
 * Every call bumps G_sanity_cycle. The actual checks are executed only when
 * at least G_app_config->sanity seconds have elapsed on M_timer, or when
 * \p finalchk is set.
 * @param finalchk when non-zero the checks run regardless of the timer, and
 *  the steps guarded by !finalchk (memory limits, bridge periodic refresh,
 *  respawn check, pq statistics) are skipped
 * @return EXSUCCEED (also when no application config is loaded); a failure
 *  code when the queue listing, the dead process check or (EX_USE_SYSVQ
 *  builds) the System V checks fail
 */
expublic int do_sanity_check(int finalchk)
{
    int ret=EXSUCCEED;
    /* Queue name prefixes; built once on the first call and kept in statics */
    static char server_prefix[NDRX_MAX_Q_SIZE+1];
    static int server_prefix_len;
    static char client_prefix[NDRX_MAX_Q_SIZE+1];
    static int client_prefix_len;
    static char xadmin_prefix[NDRX_MAX_Q_SIZE+1];
    static int xadmin_prefix_len;

    /* conversational initiator queues */
    static char cnvclt_prefix[NDRX_MAX_Q_SIZE+1];
    static int cnvclt_prefix_len;

    /* conversational server queues */
    static char cnvsrv_prefix[NDRX_MAX_Q_SIZE+1];
    static int cnvsrv_prefix_len;

    /* set when the checks were actually executed in this call */
    int wasrun = EXFALSE;

    string_list_t* qlist = NULL;
    string_list_t* elt = NULL;

    /* the cycle is counted even if nothing gets checked below */
    G_sanity_cycle++;

    /* No application config - nothing to check, report success */
    if (NULL==G_app_config)
        goto out;

    if (M_first)
    {
        /* one-time setup: start the timer and format the queue prefixes
         * from the configured queue prefix
         */
        ndrx_stopwatch_reset(&M_timer);

        snprintf(client_prefix, sizeof(client_prefix), NDRX_CLT_QREPLY_PFX,
                G_sys_config.qprefix);
        client_prefix_len=strlen(client_prefix);
        NDRX_LOG(log_debug, "client_prefix=[%s]/%d", client_prefix,
                client_prefix_len);

        snprintf(xadmin_prefix, sizeof(xadmin_prefix),
                NDRX_NDRXCLT_PFX, G_sys_config.qprefix);
        xadmin_prefix_len=strlen(xadmin_prefix);
        NDRX_LOG(log_debug, "xadmin_prefix=[%s]/%d", xadmin_prefix,
                xadmin_prefix_len);

        snprintf(server_prefix, sizeof(server_prefix), NDRX_SVR_QREPLY_PFX,
                G_sys_config.qprefix);
        server_prefix_len=strlen(server_prefix);
        NDRX_LOG(log_debug, "server_prefix=[%s]/%d", server_prefix,
                server_prefix_len);

        snprintf(cnvclt_prefix, sizeof(cnvclt_prefix), NDRX_CONV_INITATOR_Q_PFX,
                G_sys_config.qprefix);

        cnvclt_prefix_len=strlen(cnvclt_prefix);
        NDRX_LOG(log_debug, "cnvclt_prefix=[%s]/%d", cnvclt_prefix,
                cnvclt_prefix_len);

        snprintf(cnvsrv_prefix, sizeof(cnvsrv_prefix), NDRX_CONV_SRV_Q_PFX,
                G_sys_config.qprefix);
        cnvsrv_prefix_len=strlen(cnvsrv_prefix);
        NDRX_LOG(log_debug, "cnvsrv_prefix=[%s]/%d", cnvsrv_prefix,
                cnvsrv_prefix_len);

        M_first=EXFALSE;
    }

    /* Run when the configured period has elapsed or on the final check */
    if (ndrx_stopwatch_get_delta_sec(&M_timer)>=G_app_config->sanity || finalchk)
    {
        wasrun = EXTRUE;
        NDRX_LOG(log_debug, "Time for sanity checking...");

        qlist = ndrx_sys_mqueue_list_make(G_sys_config.qpath, &ret);

        if (EXSUCCEED!=ret)
        {
            NDRX_LOG(log_error, "posix queue listing failed!");
            EXFAIL_OUT(ret);
        }

        /* Dispatch every listed queue by its name prefix; queues matching
         * none of the prefixes are left alone. Return codes of the
         * per-queue checks are not evaluated here.
         */
        LL_FOREACH(qlist,elt)
        {
            NDRX_LOG(6, "Checking... [%s]", elt->qname);

            if (0==strncmp(elt->qname, client_prefix,
                    client_prefix_len))
            {
                check_client(elt->qname, EXFALSE, G_sanity_cycle);
            }
            else if (0==strncmp(elt->qname, xadmin_prefix,
                    xadmin_prefix_len))
            {
                check_client(elt->qname, EXTRUE, G_sanity_cycle);
            }
            /* server reply queues */
            else if (0==strncmp(elt->qname, server_prefix,
                    server_prefix_len))
            {
                check_server(elt->qname);
            }
            else if (0==strncmp(elt->qname, cnvclt_prefix,
                    cnvclt_prefix_len))
            {
                check_cnvclt(elt->qname);
            }
            else if (0==strncmp(elt->qname, cnvsrv_prefix,
                    cnvsrv_prefix_len))
            {
                check_cnvsrv(elt->qname);
            }
        }

        /* Process model checks: counters, pings and kill escalation */
        check_long_startup();

        /* memory limits are not enforced on the final check */
        if (!finalchk)
        {
            check_memlimits();
        }

        /* no bridge periodic refresh on the final check */
        if (!finalchk)
        {
            brd_send_periodrefresh();
        }

        /* the dead process check is the only step here whose failure
         * aborts the rest of the run
         */
        if (EXSUCCEED!=check_dead_processes())
        {
            ret=EXFAIL;
            goto out;
        }


        ndrx_ddr_apply_sanity();

        /* no respawning on the final check */
        if (!finalchk)
        {
            do_respawn_check();
        }

        /* validate singleton process group locks */
        check_singlegrp();


        /* optional queue statistics, only for regular runs */
        if (!finalchk)
        {
            if (G_app_config->gather_pq_stats)
            {
                pq_run_santiy(EXTRUE);
            }
        }

#ifdef EX_USE_SYSVQ
        if (EXSUCCEED!=do_sanity_check_sysv(finalchk))
        {
            NDRX_LOG(log_error, "System V sanity checks failed!");
            userlog("System V sanity checks failed!");
            EXFAIL_OUT(ret);
        }
#endif
    }

out:

    if (NULL!=qlist)
    {
        ndrx_string_list_free(qlist);
    }

    /* Restart the period only if the checks were attempted, so that the
     * next run is measured from the end of this one
     */
    if (wasrun)
    {
        ndrx_stopwatch_reset(&M_timer);
    }

    return ret;
}
0285
0286
0287
0288
0289
0290
0291
0292
0293 exprivate void parse_q(char *qname, int is_server, char *process, int processsz,
0294 pid_t *p_pid, int *server_id, int is_xadmin)
0295 {
0296 char buf[NDRX_MAX_Q_SIZE+1];
0297 char *p;
0298
0299 NDRX_STRCPY_SAFE(buf, qname);
0300
0301
0302 if (!is_server && !is_xadmin)
0303 {
0304 p = strrchr(buf, NDRX_FMT_SEP);
0305 *p=EXEOS;
0306 }
0307
0308
0309 p = strrchr(buf, NDRX_FMT_SEP);
0310 *p_pid = atoi(p+1);
0311 *p=EXEOS;
0312
0313 if (is_server)
0314 {
0315
0316 p=strrchr(buf, NDRX_FMT_SEP);
0317 *server_id = atoi(p+1);
0318 *p=EXEOS;
0319 }
0320
0321
0322 p=strrchr(buf, NDRX_FMT_SEP);
0323
0324 NDRX_STRCPY_SAFE_DST(process, p+1, processsz);
0325
0326 NDRX_LOG(6, "got process: pid: %d name: [%s]",
0327 *p_pid, process);
0328 }
0329
0330
0331
0332
0333
0334 exprivate int unlink_dead_queue(char *qname)
0335 {
0336 int ret=EXSUCCEED;
0337 char q_str[NDRX_MAX_Q_SIZE+1];
0338 char *p;
0339
0340 if ('/'!=qname[0])
0341 {
0342 NDRX_STRCPY_SAFE(q_str, "/");
0343 NDRX_STRCAT_S(q_str, sizeof(q_str), qname);
0344 p = q_str;
0345 }
0346 else
0347 {
0348 p = qname;
0349 }
0350
0351 NDRX_LOG(log_warn, "Unlinking queue [%s]", p);
0352 if (EXSUCCEED!=ndrx_mq_unlink(p))
0353 {
0354 int err = errno;
0355
0356 if (ENOENT!=err)
0357 {
0358 NDRX_LOG(log_error, "Failed to unlink dead queue [%s]: %s",
0359 p, strerror(err));
0360
0361 userlog("Failed to unlink dead queue [%s]: %s",
0362 p, strerror(err));
0363 ret=EXFAIL;
0364 }
0365 else
0366 {
0367 NDRX_LOG(log_debug, "Queue already does not exists [%s]: %s",
0368 p, strerror(err));
0369 }
0370 }
0371
0372 return ret;
0373 }
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383 expublic int remove_server_queues(char *process, pid_t pid, int srv_id, char *rplyq)
0384 {
0385 char q_str[NDRX_MAX_Q_SIZE+1];
0386 int rplyq_unlink = EXFALSE;
0387 char *p;
0388
0389 if (NULL==rplyq)
0390 {
0391 snprintf(q_str, sizeof(q_str), NDRX_SVR_QREPLY,
0392 G_sys_config.qprefix, process, srv_id, pid);
0393
0394 p = q_str;
0395 if (!ndrx_q_exists(q_str))
0396 {
0397 NDRX_LOG(log_info, "Seems like reply queue [%s] does not"
0398 " exists - nothing to do: %s", q_str, strerror(errno));
0399 }
0400 else
0401 {
0402 rplyq_unlink=EXTRUE;
0403 }
0404 }
0405 else
0406 {
0407 p = rplyq;
0408 rplyq_unlink=EXTRUE;
0409 }
0410
0411 if (rplyq_unlink)
0412 {
0413 unlink_dead_queue(p);
0414 }
0415
0416 snprintf(q_str, sizeof(q_str), NDRX_ADMIN_FMT, G_sys_config.qprefix,
0417 process, srv_id, pid);
0418
0419
0420
0421 if (!ndrx_q_exists(q_str))
0422 {
0423 NDRX_LOG(log_info, "Seems like admin queue [%s] does not"
0424 " exists - nothing to do: %s", q_str, strerror(errno));
0425 }
0426 else
0427 {
0428 unlink_dead_queue(q_str);
0429 }
0430
0431 return EXSUCCEED;
0432 }
0433
0434
0435
0436
0437
0438
0439 exprivate int check_server(char *qname)
0440 {
0441 char process[NDRX_MAX_Q_SIZE+1];
0442 pid_t pid;
0443 int srv_id;
0444 char *buf = NULL;
0445 size_t buf_len;
0446 srv_status_t *status = (srv_status_t *)buf;
0447 int ret=EXSUCCEED;
0448
0449 NDRX_SYSBUF_MALLOC_OUT(buf, buf_len, ret);
0450 status = (srv_status_t *)buf;
0451 memset((char *)status, 0, sizeof(srv_status_t));
0452
0453 parse_q(qname, EXTRUE, process, sizeof(process), &pid, &srv_id, EXFALSE);
0454
0455 if (!ndrx_sys_is_process_running(pid, process))
0456 {
0457
0458
0459 remove_server_queues(process, pid, srv_id, qname);
0460
0461 status->srvinfo.pid = pid;
0462 status->srvinfo.state = NDRXD_PM_DIED;
0463 status->srvinfo.srvid = srv_id;
0464 NDRX_LOG(log_debug, "Sending self notification "
0465 "about dead process...");
0466 if (EXSUCCEED!=self_notify(status, EXFALSE))
0467 {
0468 NDRX_LOG(log_warn, "Failed to send self notification "
0469 "- exit dead process check for a while!");
0470 ret=EXFAIL;
0471 goto out;
0472 }
0473
0474
0475
0476 }
0477 out:
0478 if (NULL!=buf)
0479 {
0480 NDRX_SYSBUF_FREE(buf);
0481 }
0482 return ret;
0483 }
0484
0485
0486
0487
0488
0489
0490
0491
0492
0493
0494
0495
0496 exprivate int check_client(char *qname, int is_xadmin, unsigned sanity_cycle)
0497 {
0498 char process[NDRX_MAX_Q_SIZE+1];
0499 pid_t pid;
0500
0501
0502 static unsigned prev_sanity_cycle;
0503 static int first = EXTRUE;
0504 static char prev_process[NDRX_MAX_Q_SIZE+1];
0505 static pid_t prev_pid;
0506
0507
0508
0509
0510
0511
0512 static int prev_stat=EXTRUE;
0513
0514 if (first)
0515 {
0516 prev_sanity_cycle = sanity_cycle-1;
0517 first=EXFALSE;
0518 }
0519
0520 parse_q(qname, EXFALSE, process, sizeof(process), &pid, 0, is_xadmin);
0521
0522 if (sanity_cycle == prev_sanity_cycle &&
0523 0==strcmp(process, prev_process) &&
0524 pid == prev_pid)
0525 {
0526 NDRX_LOG(6, "Multi-threaded process [%s]/%d already checked "
0527 "at this sanity check", process, pid);
0528
0529 if (EXTRUE > prev_stat && ndrx_q_exists(qname))
0530 {
0531
0532 if (EXFAIL==prev_stat)
0533 {
0534 NDRX_LOG(log_warn, "Previous same process (different "
0535 "thread was unlink) - unlink this q [%s] too ", qname);
0536 }
0537 else
0538 {
0539
0540 NDRX_LOG(log_error, "Client process [%s], pid %d (other "
0541 "thread unclean shutdown q: [%s])",
0542 process, pid, qname);
0543 userlog("Client process [%s], pid %d (other "
0544 "thread unclean shutdown q: [%s])",
0545 process, pid, qname);
0546
0547
0548 prev_stat=EXFAIL;
0549 }
0550 unlink_dead_queue(qname);
0551 }
0552 goto out;
0553 }
0554
0555
0556 prev_pid = pid;
0557 NDRX_STRCPY_SAFE(prev_process, process);
0558 prev_sanity_cycle = sanity_cycle;
0559
0560 if (!ndrx_sys_is_process_running(pid, process))
0561 {
0562
0563 if (ndrx_q_exists(qname))
0564 {
0565 NDRX_LOG(log_error, "Client process [%s], pid %d died", process, pid);
0566 userlog("Client process [%s], pid %d died", process, pid);
0567 unlink_dead_queue(qname);
0568 prev_stat = EXFAIL;
0569 }
0570 else
0571 {
0572 NDRX_LOG(log_debug, "Client process [%s], pid %d terminated normally",
0573 process, pid);
0574 prev_stat = EXFALSE;
0575 #ifdef EX_USE_EMQ
0576
0577 unlink_dead_queue(qname);
0578 #endif
0579 }
0580
0581 }
0582 else
0583 {
0584 prev_stat = EXTRUE;
0585 }
0586
0587 out:
0588 return EXSUCCEED;
0589 }
0590
0591
0592
0593
0594
0595
0596
0597
0598
0599
0600
0601 exprivate int send_kill(pm_node_t *p_pm, int sig, int delta)
0602 {
0603 NDRX_LOG(log_error, "Killing PID: %d (ppid: %d)/%s/%d with signal -%d",
0604 p_pm->svpid, p_pm->pid, p_pm->binary_name, p_pm->srvid, sig);
0605 userlog("Killing PID: %d (ppid: %d)/%s/%d with signal -%d",
0606 p_pm->svpid, p_pm->pid, p_pm->binary_name, p_pm->srvid, sig);
0607 if (EXSUCCEED!=kill(p_pm->svpid, sig))
0608 {
0609 NDRX_LOG(log_error, "Failed to kill PID %d (ppid: %d) with error: %s",
0610 p_pm->svpid, p_pm->pid, strerror(errno));
0611 }
0612
0613 return EXSUCCEED;
0614 }
0615
0616
0617
0618
0619
0620
/**
 * Walk the process model once per sanity run and:
 *  - advance the per-process counters (counted in sanity runs),
 *  - send pings that are due,
 *  - for autokill processes: kill members of singleton groups whose lock is
 *    not held, request a kill on too long startup / missing ping response /
 *    too long shutdown, and escalate through conf->killseq[],
 *  - trigger a reload when a reloadonchange binary has changed.
 * @return always EXSUCCEED (ret is never changed in this function)
 */
exprivate int check_long_startup(void)
{
    int ret=EXSUCCEED;
    pm_node_t *p_pm;
    int delta;
    /* limits binary-change reloads to one per pass */
    int cksum_reload_sent = EXFALSE;

    /* snapshot of the singleton group lock flags, indexed by group no - 1 */
    int nrgrps = ndrx_G_libnstd_cfg.pgmax;
    int sg_groups[nrgrps];

    ndrx_sg_get_lock_snapshoot(sg_groups, &nrgrps, 0);

    DL_FOREACH(G_process_model, p_pm)
    {
        /* advance counters; each unit is one executed sanity run */
        p_pm->rspstwatch++;

        /* the ping response stopwatch runs only while a ping is pending */
        if (SANITY_CNT_IDLE!=p_pm->pingstwatch)
        {
            p_pm->pingstwatch++;
        }

        if (p_pm->conf->pingtime)
        {
            p_pm->pingtimer++;
        }
        p_pm->last_sig++;
        p_pm->state_changed++;

        /* Ping is due: running server, pinging configured, no kill pending */
        if (!p_pm->killreq && NDRXD_PM_RUNNING_OK==p_pm->state &&
                p_pm->conf->pingtime && p_pm->pingtimer > p_pm->conf->pingtime)
        {
            /* restart the interval between pings */
            p_pm->pingtimer = SANITY_CNT_START;

            /* start measuring the response time, unless already running */
            if (SANITY_CNT_IDLE==p_pm->pingstwatch)
            {
                p_pm->pingstwatch = SANITY_CNT_START;
            }


            srv_send_ping (p_pm);
        }

        /* everything below applies to autokill processes only */
        if (p_pm->autokill)
        {
            NDRX_LOG(6, "proc: %s/%d ping stopwatch: %ld, rsp: %ld sty ping timer: %ld sty",
                    p_pm->binary_name, p_pm->srvid,
                    p_pm->pingstwatch,
                    p_pm->rspstwatch,
                    p_pm->pingtimer);

            /* Running member of a singleton group whose lock is not held
             * according to the snapshot: kill it right away
             */
            if (p_pm->conf->procgrp_no > 0
                    && PM_RUNNING(p_pm->state)
                    && ndrx_ndrxconf_procgroups_is_singleton(G_app_config->procgroups,
                        p_pm->conf->procgrp_no)
                    && !sg_groups[p_pm->conf->procgrp_no-1])
            {
                NDRX_LOG(log_error, "proc: %s/%d procgrp no %d lost the lock -> SIGKILL",
                        p_pm->binary_name, p_pm->srvid, p_pm->conf->procgrp_no);
                userlog("proc: %s/%d procgrp no %d lost the lock -> SIGKILL",
                        p_pm->binary_name, p_pm->srvid, p_pm->conf->procgrp_no);
                p_pm->last_sig = SANITY_CNT_START;

                /* sent directly, outside of the killreq escalation below */
                send_kill(p_pm, SIGKILL, 0);
            }

            /* Decide whether a kill has to be requested */
            if (!p_pm->killreq)
            {
                if (NDRXD_PM_STARTING==p_pm->state &&
                        (delta=p_pm->rspstwatch) > p_pm->conf->start_max)
                {
                    NDRX_LOG(log_error, "Startup too long - requesting "
                            "kill pid=%d/bin=%s/srvid=%d",
                            p_pm->pid, p_pm->binary_name, p_pm->srvid);

                    userlog("Startup too long - requesting "
                            "kill pid=%d/bin=%s/srvid=%d",
                            p_pm->pid, p_pm->binary_name, p_pm->srvid);
                    p_pm->killreq=EXTRUE;
                }
                else if (NDRXD_PM_RUNNING_OK==p_pm->state && p_pm->conf->pingtime &&
                        (delta=p_pm->pingstwatch) > p_pm->conf->ping_max)
                {
                    NDRX_LOG(log_error, "Ping response not in time - "
                            "requesting kill (ping_time=%d delta=%d "
                            "ping_max=%d) pid=%d/%s/srvid=%d",
                            p_pm->conf->pingtime, delta, p_pm->conf->ping_max,
                            p_pm->pid, p_pm->binary_name, p_pm->srvid);

                    userlog("Ping response not in time - "
                            "requesting kill (ping_time=%d delta=%d "
                            "ping_max=%d) pid=%d/bin=%s/srvid=%d",
                            p_pm->conf->pingtime, delta, p_pm->conf->ping_max,
                            p_pm->pid, p_pm->binary_name, p_pm->srvid);
                    p_pm->killreq=EXTRUE;
                }
                else if (NDRXD_PM_STOPPING==p_pm->state &&
                        (delta = p_pm->rspstwatch) > p_pm->conf->end_max)
                {
                    NDRX_LOG(log_error, "Server did not exit in time "
                            "- requesting kill");
                    p_pm->killreq=EXTRUE;
                }
            }

            /* Kill escalation: killseq[0] at once, killseq[1] after
             * conf->killtime runs without exit, then killseq[2] repeatedly
             */
            if (p_pm->killreq)
            {
                if (0==p_pm->num_term_sigs)
                {
                    /* first signal of the sequence */
                    send_kill(p_pm, p_pm->conf->killseq[0], 0);

                    p_pm->last_sig = SANITY_CNT_START;
                    p_pm->num_term_sigs++;
                }
                else if (p_pm->num_term_sigs > 0
                        && p_pm->last_sig > p_pm->conf->killtime)
                {
                    if (1==p_pm->num_term_sigs)
                    {
                        /* second signal of the sequence */
                        send_kill(p_pm, p_pm->conf->killseq[1], 0);
                    }
                    else
                    {
                        /* last signal of the sequence, repeated from now on */
                        send_kill(p_pm, p_pm->conf->killseq[2], 0);
                    }

                    /* the counter saturates at 2 so killseq[2] keeps being used */
                    if (p_pm->num_term_sigs<2)
                        p_pm->num_term_sigs++;
                    p_pm->last_sig = SANITY_CNT_START;
                }
            }
        }

        /* Reload on binary change: only for entries with a known binary
         * path that exists on disk, and at most one reload per pass
         */
        if (p_pm->conf->reloadonchange && EXEOS!=p_pm->binary_path[0]
                && !cksum_reload_sent
                && ndrx_file_exists(p_pm->binary_path))
        {
            if (roc_check_binary(p_pm->binary_path, G_sanity_cycle))
            {
                NDRX_LOG(log_warn, "Cksums differ reload...");

                if (EXSUCCEED!=self_sreload(p_pm))
                {
                    NDRX_LOG(log_warn, "Failed to send self notification "
                            "about changed process - ignore!");
                }
                cksum_reload_sent=EXTRUE;
            }
        }

    }

out:
    return ret;
}
0790
0791
0792
0793
0794
0795
0796
0797 exprivate int check_dead_processes(void)
0798 {
0799 int ret=EXSUCCEED;
0800 pm_node_t *p_pm;
0801 char *buf = NULL;
0802 size_t buf_len;
0803 srv_status_t *status;
0804
0805 NDRX_SYSBUF_MALLOC_OUT(buf, buf_len, ret);
0806 status = (srv_status_t *)buf;
0807
0808 DL_FOREACH(G_process_model, p_pm)
0809 {
0810
0811 if (p_pm->state>=NDRXD_PM_MIN_RUNNING &&
0812 p_pm->state<=NDRXD_PM_MAX_RUNNING &&
0813 p_pm->state_changed > G_app_config->checkpm)
0814 {
0815 if (!ndrx_sys_is_process_running(p_pm->svpid, p_pm->binary_name_real))
0816 {
0817 NDRX_LOG(log_warn, "Pid %d/%s/%s in state %d is actually dead",
0818 p_pm->pid, p_pm->binary_name, p_pm->binary_name_real,
0819 p_pm->state);
0820
0821
0822 memset(buf, 0, sizeof(srv_status_t));
0823
0824 status->srvinfo.pid = p_pm->pid;
0825 status->srvinfo.state = NDRXD_PM_DIED;
0826 status->srvinfo.srvid = p_pm->srvid;
0827
0828 NDRX_LOG(log_debug, "Sending self notification "
0829 "about dead process...");
0830
0831 if (EXSUCCEED!=self_notify(status, EXFALSE))
0832 {
0833 NDRX_LOG(log_warn, "Failed to send self notification "
0834 "- exit dead process check for a while!");
0835 ret=EXFAIL;
0836 goto out;
0837 }
0838 }
0839 }
0840 }
0841
0842 out:
0843
0844 if (NULL!=buf)
0845 {
0846 NDRX_SYSBUF_FREE(buf);
0847 }
0848
0849 return ret;
0850 }
0851
0852
0853
0854
0855 exprivate void check_memlimits(void)
0856 {
0857 pm_node_t *p_pm;
0858 ndrx_proc_info_t inf;
0859
0860 DL_FOREACH(G_process_model, p_pm)
0861 {
0862
0863 if (p_pm->state>=NDRXD_PM_MIN_RUNNING &&
0864 p_pm->state<=NDRXD_PM_MAX_RUNNING &&
0865 (EXFAIL!=p_pm->conf->rssmax || EXFAIL!=p_pm->conf->vszmax))
0866 {
0867
0868 if (EXSUCCEED==ndrx_proc_get_infos(p_pm->pid, &inf))
0869 {
0870 int reached = EXFALSE;
0871 char memtype[4];
0872 long lim_val;
0873 long lim_max;
0874
0875 if (p_pm->conf->rssmax!=EXFAIL &&
0876 inf.rss * NDRX_STOR_KBYTE > p_pm->conf->rssmax)
0877 {
0878 reached = EXTRUE;
0879 lim_val = inf.rss * NDRX_STOR_KBYTE;
0880 lim_max = p_pm->conf->rssmax;
0881 NDRX_STRCPY_SAFE(memtype, "RSS");
0882 }
0883 else if (p_pm->conf->vszmax!=EXFAIL &&
0884 inf.vsz * NDRX_STOR_KBYTE > p_pm->conf->vszmax)
0885 {
0886 reached = EXTRUE;
0887 lim_val = inf.vsz * NDRX_STOR_KBYTE;
0888 lim_max = p_pm->conf->vszmax;
0889 NDRX_STRCPY_SAFE(memtype, "VSZ");
0890 }
0891
0892 if (reached)
0893 {
0894 char limitbuf[256];
0895 char valuebuf[256];
0896
0897 ndrx_storage_encode(lim_max, limitbuf, sizeof(limitbuf));
0898 ndrx_storage_encode(lim_val, valuebuf, sizeof(valuebuf));
0899
0900 NDRX_LOG(log_error, "Server pid = %d, srvid = %d, name [%s] "
0901 "%s memory limit reached: "
0902 "configured max: %s in system found: %s - restarting...",
0903 (int)p_pm->pid, p_pm->srvid, p_pm->binary_name,
0904 memtype, limitbuf, valuebuf);
0905
0906 userlog("Server pid = %d, srvid = %d, name [%s] "
0907 "%s memory limit reached: "
0908 "configured max: %s in system found: %s - restarting...",
0909 (int)p_pm->pid, p_pm->srvid, p_pm->binary_name,
0910 memtype, limitbuf, valuebuf);
0911
0912 if (EXSUCCEED!=self_sreload(p_pm))
0913 {
0914 NDRX_LOG(log_warn, "Failed to send self notification "
0915 "about changed process - ignore!");
0916 }
0917 }
0918 }
0919 else
0920 {
0921
0922
0923
0924 NDRX_LOG(log_warn, "Server pid = %d, srvid = %d, name [%s]: "
0925 "failed to read memory usage - ignore",
0926 (int)p_pm->pid, p_pm->srvid, p_pm->binary_name);
0927 }
0928
0929 }
0930 }
0931
0932 }
0933
0934
0935
0936
0937
0938 exprivate int check_svc_shm(void)
0939 {
0940 return EXSUCCEED;
0941 }
0942
0943
0944
0945
0946
0947
0948
0949
0950
0951
0952
0953
0954
0955
0956
0957
0958
0959
0960
0961
0962
0963
0964
0965
0966
0967
0968
0969
0970
0971
0972
0973
0974
0975
0976
0977
0978
0979
0980
0981
0982
0983
0984
0985
0986
0987 exprivate int check_cnvclt(char *qname)
0988 {
0989 int ret = EXSUCCEED;
0990 TPMYID myid;
0991
0992 if (EXSUCCEED==ndrx_cvnq_parse_client(qname, &myid))
0993 {
0994 if (EXFALSE==ndrx_myid_is_alive(&myid))
0995 {
0996 ndrx_myid_dump(log_debug, &myid, "process is dead, remove the queue");
0997
0998 unlink_dead_queue(qname);
0999 }
1000 }
1001
1002 out:
1003 return ret;
1004 }
1005
1006
1007
1008
1009
1010
1011 exprivate int check_cnvsrv(char *qname)
1012 {
1013 int ret = EXSUCCEED;
1014 TPMYID myid1, myid2;
1015
1016
1017
1018
1019
1020 if (EXSUCCEED==ndrx_cvnq_parse_server(qname, &myid1, &myid2))
1021 {
1022 if (EXFALSE==ndrx_myid_is_alive(&myid2))
1023 {
1024 ndrx_myid_dump(log_debug, &myid2, "process is dead, remove the queue");
1025 unlink_dead_queue(qname);
1026 }
1027 }
1028
1029 out:
1030 return ret;
1031 }
1032
1033
1034
1035
1036
1037 expublic int ndrxd_sanity_finally(void)
1038 {
1039 int ret = EXSUCCEED;
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052 ret = do_sanity_check(EXTRUE);
1053
1054 out:
1055 return ret;
1056 }
1057
1058
1059
1060
/**
 * qsort() comparator for int values, ascending order.
 * The result is built from two comparisons instead of a subtraction, so it
 * cannot overflow for any pair of int values.
 * @param a pointer to the first int
 * @param b pointer to the second int
 * @return negative, zero or positive if *a is less than, equal to or
 *  greater than *b
 */
exprivate int cmp_int(const void *a, const void *b)
{
    int lhs = *(const int *)a;
    int rhs = *(const int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
1065
1066
1067
1068
1069
1070
1071
1072
1073 exprivate int check_singlegrp(void)
1074 {
1075 int i;
1076 int ret = EXSUCCEED;
1077 pm_node_t *p_pm_srvid;
1078 ndrx_sg_shm_t *p_shm, local;
1079 int grp2srvid[ndrx_G_libnstd_cfg.pgmax];
1080
1081 NDRX_LOG(log_debug, "Into check_singlegrp()");
1082 memset(grp2srvid, 0, ndrx_G_libnstd_cfg.pgmax*sizeof(int));
1083
1084 for (i=0; i<ndrx_G_libnstd_cfg.pgmax; i++)
1085 {
1086 p_shm = ndrx_sg_get(i+1);
1087
1088 if (NULL==p_shm)
1089 {
1090 NDRX_LOG(log_error, "Null shared memory for singleton "
1091 "groups (grpno: %d)", i);
1092 EXFAIL_OUT(ret);
1093 }
1094
1095
1096 ndrx_sg_load(&local, p_shm);
1097
1098
1099 grp2srvid[i] = local.lockprov_srvid;
1100
1101
1102
1103
1104 if (EXTRUE!=ndrx_sg_is_locked_int(i+1, p_shm, NULL, 0))
1105 {
1106 continue;
1107 }
1108
1109 if (!(local.lockprov_srvid>=0 && local.lockprov_srvid < ndrx_get_G_atmi_env()->max_servers))
1110 {
1111 NDRX_LOG(log_error, "Invalid server id %hd for singleton process group %d -> unlocking",
1112 local.lockprov_srvid, i+1);
1113 userlog("Invalid server id %hd for singleton process group %d -> unlocking",
1114 local.lockprov_srvid, i+1);
1115 ndrx_sg_unlock(p_shm, NDRX_SG_RSN_CORRUPT);
1116 continue;
1117 }
1118
1119 p_pm_srvid = G_process_model_hash[local.lockprov_srvid];
1120
1121
1122 if (!PM_RUNNING(p_pm_srvid->state))
1123 {
1124 NDRX_LOG(log_error, "Server %d/%s/%d is not running -> "
1125 "unlocking singleton process group %d",
1126 p_pm_srvid->pid, p_pm_srvid->binary_name, p_pm_srvid->srvid, i+1);
1127 userlog("Server %d/%s/%d is not running -> "
1128 "unlocking singleton process group %d",
1129 p_pm_srvid->pid, p_pm_srvid->binary_name, p_pm_srvid->srvid, i+1);
1130 ndrx_sg_unlock(p_shm, NDRX_SG_RSN_NOPID);
1131 continue;
1132 }
1133
1134
1135
1136
1137
1138
1139
1140 if (p_pm_srvid->svpid!=local.lockprov_pid
1141 && p_pm_srvid->pid!=local.lockprov_pid)
1142 {
1143 NDRX_LOG(log_error, "Server %d/%d/%s/%d pid mistmatch with group's lockprov_pid %d -> "
1144 "unlocking singleton process group %d",
1145 (int)p_pm_srvid->pid, (int)p_pm_srvid->svpid, p_pm_srvid->binary_name,
1146 p_pm_srvid->srvid, (int)local.lockprov_pid, i+1);
1147
1148 userlog("Server %d/%d/%s/%d pid mistmatch with group's lockprov_pid %d -> "
1149 "unlocking singleton process group %d",
1150 (int)p_pm_srvid->pid, (int)p_pm_srvid->svpid, p_pm_srvid->binary_name,
1151 p_pm_srvid->srvid, (int)local.lockprov_pid, i+1);
1152 ndrx_sg_unlock(p_shm, NDRX_SG_RSN_NOPID);
1153 continue;
1154 }
1155 }
1156
1157
1158 qsort(grp2srvid, ndrx_G_libnstd_cfg.pgmax, sizeof(int), cmp_int);
1159
1160 for (i=0; i<ndrx_G_libnstd_cfg.pgmax-1; i++)
1161 {
1162 if (grp2srvid[i]>0 && grp2srvid[i]==grp2srvid[i+1])
1163 {
1164 NDRX_LOG(log_error, "Duplicate server id %d for "
1165 "singleton process group %d -> unlocking",
1166 grp2srvid[i], i+1);
1167 userlog("Duplicate server id %d for "
1168 "singleton process group %d -> unlocking",
1169 grp2srvid[i], i+1);
1170 ndrx_sg_unlock(p_shm, NDRX_SG_RSN_CORRUPT);
1171 }
1172 }
1173
1174 out:
1175 return ret;
1176 }
1177
1178