00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <config.h>
00021
00022 #include <stdio.h>
00023 #include <errno.h>
00024 #include <signal.h>
00025 #include <stdarg.h>
00026 #include <stdlib.h>
00027 #include <setjmp.h>
00028 #include <sys/socket.h>
00029
00030 #ifdef HAVE_STRING_H
00031 #include <string.h>
00032 #endif
00033
00034 #ifdef HAVE_UNISTD_H
00035 #include <unistd.h>
00036 #endif
00037
00038 #include "monitor.h"
00039 #include "alert.h"
00040 #include "net.h"
00041 #include "monit_process.h"
00042
00043
00044 static sigjmp_buf timeout;
00045
00046
00047 static void do_validate(Process_T);
00048 static void do_restart(Process_T, char *, ...) ;
00049 static void do_start(Process_T, char *, ...) ;
00050 static void do_stop(Process_T, char *, ...);
00051 static int do_not_validate(Process_T);
00052 static int check_connection(Process_T, Port_T, char *);
00053 static int check_resources(Process_T, Resource_T, char *);
00054 static int check_process_state(Process_T, char *);
00055 static int check_skip(Process_T);
00056 static int check_timeout(Process_T);
00057 static int check_checksum(Process_T);
00058 static int checksum_helper(Process_T, char *, char *);
00059 static int check_timestamp(Process_T, Timestamp_T, char *);
00060 static void connection_timeout(int);
00061 static void reset_resource_counter(Process_T);
00062 static void vlog(char * report, int n, Process_T p, char *m,...);
00063 static int compare_value(int, int, int);
00064
00084
00085
00086
00090 void validate() {
00091
00092 Process_T p;
00093
00094 if(! update_loadavg()) {
00095
00096 log("Update of loadavg has failed!\n");
00097
00098 }
00099
00100 for(p= processlist; p; p= p->next) {
00101 if(p->visited)
00102 continue;
00103 do_validate(p);
00104 }
00105
00106 reset_depend();
00107
00108 }
00109
00110
00111
00112
00113
00122 static void do_validate(Process_T p) {
00123
00124 Port_T pp;
00125 Resource_T pr;
00126 Timestamp_T tl;
00127 pid_t pid= -1;
00128 sigset_t ns,os;
00129 char report[STRLEN];
00130
00131 ASSERT(p);
00132
00133
00134 LOCK(p->mutex)
00135
00136
00137 if(do_not_validate(p)) {
00138
00139 pthread_mutex_unlock(&p->mutex);
00140 return;
00141 }
00142
00143
00144 sigemptyset(&ns);
00145 sigaddset(&ns, SIGTERM);
00146 pthread_sigmask(SIG_BLOCK, &ns, &os);
00147
00148
00149 if(!(pid= is_process_running(p))) {
00150
00151
00152 memset(p->procinfo, 0, sizeof *(p->procinfo));
00153
00154 do_start(p, "Reason: Process is not running.");
00155 goto reinstall;
00156
00157 } else {
00158
00159 if(Run.debug) {
00160
00161 log("'%s' is running with pid %d\n", p->name, (int)pid);
00162
00163 }
00164
00165 }
00166
00167 if(Run.doprocess) {
00168
00169 if(update_process_data(p, pid)) {
00170
00171 if(! check_process_state(p, report)) {
00172
00173 smtp_alert_resource(p, "Reason: %s\n", report);
00174
00175 } else {
00176
00177 if(Run.debug) {
00178
00179 log("'%s' check_process_state() passed.\n", p->name);
00180
00181 }
00182
00183 }
00184
00185 for(pr= p->resourcelist; pr; pr= pr->next) {
00186
00187 if(!check_resources(p, pr, report)) {
00188
00189 switch(pr->action) {
00190 case ACTION_ALERT:
00191 if(p->def_timeout) p->nstart++;
00192 smtp_alert_resource(p, "Reason: %s\n", report);
00193
00194 pr->cycle=0;
00195 break;
00196
00197 case ACTION_STOP:
00198 do_stop(p, "Reason: %s\n", report);
00199 reset_resource_counter(p);
00200 goto reinstall;
00201
00202 case ACTION_RESTART:
00203 do_restart(p, "Reason: %s\n", report);
00204 reset_resource_counter(p);
00205 goto reinstall;
00206
00207 default:
00208 log("'%s' Unknow resource failure action.\n", p->name);
00209 break;
00210 }
00211
00212 }
00213
00214 }
00215
00216 } else {
00217
00218 log("'%s' failed to get process data\n", p->name);
00219
00220 }
00221
00222 }
00223
00224 for(tl= p->timestamplist; tl; tl= tl->next) {
00225
00226 if(!check_timestamp(p, tl, report)) {
00227
00228 switch(tl->action) {
00229 case ACTION_ALERT:
00230 if(p->def_timeout) p->nstart++;
00231 smtp_alert_timestamp(p, "Reason: %s\n", report);
00232 break;
00233
00234 case ACTION_STOP:
00235 do_stop(p, "Reason: %s\n", report);
00236 goto reinstall;
00237
00238 case ACTION_RESTART:
00239 do_restart(p, "Reason: %s\n", report);
00240 goto reinstall;
00241
00242 default:
00243 log("'%s' Unknow timestamp failure action.\n", p->name);
00244 break;
00245
00246 }
00247
00248 }
00249
00250 }
00251
00252
00253 for(pp= p->portlist; pp; pp= pp->next) {
00254
00255 if(!check_connection(p, pp, report)) {
00256
00257 do_restart(p, "Reason: %s\n", report);
00258 goto reinstall;
00259
00260 }
00261
00262 }
00263
00264 reinstall:
00265
00266
00267 pthread_sigmask(SIG_SETMASK, &os, NULL);
00268
00269
00270 END_LOCK;
00271
00272 }
00273
00274
00279 static void do_restart(Process_T p, char *m, ...) {
00280
00281 va_list ap;
00282 char *tmp = NULL;
00283
00284 ASSERT(p);
00285
00286 if(!p->do_validate)
00287 return;
00288
00289 if(p->def_timeout)
00290 p->nstart++;
00291
00292 va_start(ap, m);
00293 if(m)
00294 tmp=format(m, ap);
00295 va_end(ap);
00296
00297 if(p->mode!=MODE_PASSIVE && p->start && p->stop) {
00298
00299 log("Trying to restart '%s'\n", p->name);
00300 check_process(p->name, "restart");
00301 smtp_alert_restart(p, "%s", tmp);
00302
00303 } else {
00304
00305 smtp_alert_failed(p, "%s", tmp);
00306
00307 }
00308
00309 free(tmp);
00310
00311 }
00312
00313
00318 static void do_start(Process_T p, char *m, ...) {
00319
00320 va_list ap;
00321 char *tmp = NULL;
00322
00323 ASSERT(p);
00324
00325 if(!p->do_validate)
00326 return;
00327
00328 if(p->def_timeout)
00329 p->nstart++;
00330
00331 va_start(ap, m);
00332 if(m)
00333 tmp=format(m, ap);
00334 va_end(ap);
00335
00336 if(p->mode!= MODE_PASSIVE && p->start) {
00337
00338 check_process(p->name, "start");
00339 smtp_alert_restart(p, "%s", tmp);
00340
00341 } else {
00342
00343 smtp_alert_failed(p, "%s", tmp);
00344
00345 }
00346
00347 free(tmp);
00348
00349 }
00350
00351
00356 static void do_stop(Process_T p, char *m, ...) {
00357
00358 va_list ap;
00359 char *tmp = NULL;
00360
00361 ASSERT(p);
00362
00363 if(!p->do_validate)
00364 return;
00365
00366 va_start(ap, m);
00367 if(m)
00368 tmp=format(m, ap);
00369 va_end(ap);
00370
00371 LOCK(Run.mutex)
00372 p->do_validate= FALSE;
00373 END_LOCK;
00374
00375 if(p->mode!= MODE_PASSIVE && p->stop) {
00376
00377 smtp_alert_stop(p, "%s", tmp);
00378 check_process(p->name, "stop");
00379
00380 } else {
00381
00382 if(p->def_timeout)
00383 p->nstart++;
00384
00385 smtp_alert_failed(p, "%s", tmp);
00386
00387 }
00388
00389 free(tmp);
00390
00391 }
00392
00393
00398 static int do_not_validate(Process_T p) {
00399
00400 ASSERT(p);
00401
00402 return(!p->do_validate ||
00403 check_skip(p) ||
00404 check_timeout(p) ||
00405 check_checksum(p));
00406
00407 }
00408
00409
00414 static int check_connection(Process_T p, Port_T pp, char *report) {
00415
00416 volatile int rv= TRUE;
00417
00418 ASSERT(p && pp);
00419
00420
00421 if(sigsetjmp(timeout, TRUE)) {
00422
00423 snprintf(report, STRLEN,
00424 "timed out when testing %s [%s]",
00425 pp->address, pp->protocol->name);
00426
00427 log("'%s' %s\n", p->name, report);
00428
00429 rv= FALSE;
00430 goto error;
00431
00432 }
00433
00434
00435 set_alarm_handler(connection_timeout);
00436 alarm(CHECK_TIMEOUT);
00437
00438
00439 if((pp->socket= create_generic_socket(pp)) < 0) {
00440
00441 snprintf(report, STRLEN,
00442 "does not accept connection at %s.",
00443 pp->address);
00444
00445 log("'%s' %s\n", p->name, report);
00446
00447 rv= FALSE;
00448 goto error;
00449
00450 } else {
00451
00452 if(Run.debug) {
00453
00454 log("'%s' succeeded connecting to %s\n",
00455 p->name, pp->address);
00456
00457 }
00458
00459 }
00460
00461
00462 if(!check_connection_io(pp)) {
00463
00464 snprintf(report, STRLEN,
00465 "socket at %s is not ready for i|o -- %s",
00466 pp->address, STRERROR);
00467
00468 log("'%s' %s\n", p->name, report);
00469
00470 rv= FALSE;
00471 goto error;
00472
00473 }
00474
00475 if(pp->ssl != NULL) {
00476
00477 if(!embed_ssl_socket(pp->ssl, pp->socket)) {
00478
00479 snprintf(report, STRLEN,
00480 "failed establish SSL communication on socket at %s",
00481 pp->address);
00482
00483 log("'%s' %s\n", p->name, report);
00484 rv= FALSE;
00485 goto error;
00486
00487 }
00488
00489 if(pp->certmd5 != NULL) {
00490
00491 if(! check_ssl_md5sum(pp->ssl, pp->certmd5)) {
00492
00493 snprintf(report, STRLEN,
00494 "md5sums of SSL certificates do not match at %s",
00495 pp->address);
00496
00497 log("'%s' %s\n", p->name, report);
00498 rv= FALSE;
00499 goto error;
00500
00501 }
00502
00503 }
00504
00505 }
00506
00507
00508 if(! pp->protocol->check(pp)) {
00509
00510 snprintf(report, STRLEN,
00511 "test with protocol [%s] failed at %s.",
00512 pp->protocol->name, pp->address);
00513
00514 log("'%s' %s\n", p->name, report);
00515
00516 rv= FALSE;
00517 goto error;
00518
00519 } else {
00520
00521 if(Run.debug) {
00522
00523 log("'%s' succeeded testing protocol [%s] at %s\n",
00524 p->name, pp->protocol->name, pp->address);
00525
00526 }
00527 }
00528
00529 error:
00530
00531 alarm(0);
00532
00533 if(pp->ssl != NULL) {
00534
00535 cleanup_ssl_socket(pp->ssl);
00536
00537 }
00538
00539 close_socket(pp->socket);
00540
00541 return rv;
00542
00543 }
00544
00545
00550 static int check_process_state(Process_T p, char *report) {
00551
00552 ProcInfo_T pi;
00553
00554 ASSERT(p);
00555
00556 pi= p->procinfo;
00557
00558 if(pi->status_flag & PROCESS_ZOMBIE) {
00559
00560 snprintf(report, STRLEN,
00561 "process with pid %d is a zombie\n", pi->pid);
00562
00563 log("'%s' %s\n", p->name, report);
00564
00565
00566
00567 LOCK(Run.mutex)
00568 p->do_validate= FALSE;
00569 END_LOCK;
00570
00571 return FALSE;
00572
00573 } else {
00574
00575 if(Run.debug) {
00576
00577 log("'%s' zombie check passed [status_flag=%04x]\n",
00578 p->name, pi->status_flag);
00579
00580 }
00581
00582 }
00583
00584 return TRUE;
00585
00586 }
00587
00588
00592 static void reset_resource_counter(Process_T p) {
00593
00594 Resource_T pr;
00595
00596 ASSERT(p);
00597
00598 for(pr= p->resourcelist; pr; pr= pr->next) {
00599
00600 pr->cycle=0;
00601
00602 }
00603
00604 }
00605
00606
00611 static int check_resources(Process_T p, Resource_T pr, char *report) {
00612
00613 ProcInfo_T pi;
00614 int okay= TRUE;
00615
00616 ASSERT(p);
00617 ASSERT(pr);
00618
00619 pi= p->procinfo;
00620
00621 switch(pr->resource_id) {
00622 case RESOURCE_ID_CPU_PERCENT:
00623 if(compare_value(pr->operator, pi->cpu_percent, pr->limit)) {
00624
00625 vlog(report, STRLEN, p,
00626 "cpu usage of %.1f%% matches resource limit [cpu usage%s%.1f%%]",
00627 pi->cpu_percent/10.0, operatorshortnames[pr->operator],
00628 pr->limit/10.0);
00629
00630 okay= FALSE;
00631
00632 } else {
00633
00634 if(Run.debug) {
00635
00636 log("'%s' cpu usage check passed [current cpu usage=%.1f%%]\n",
00637 p->name, pi->cpu_percent/10.0);
00638
00639 }
00640 }
00641 break;
00642
00643 case RESOURCE_ID_MEM_PERCENT:
00644 if(compare_value(pr->operator, pi->mem_percent, pr->limit)) {
00645
00646 vlog(report, STRLEN, p,
00647 "mem usage of %.1f%% matches resource limit [mem usage%s%.1f%%]",
00648 pi->mem_percent/10.0, operatorshortnames[pr->operator],
00649 pr->limit/10.0);
00650
00651 okay= FALSE;
00652
00653 } else {
00654
00655 if(Run.debug) {
00656
00657 log("'%s' mem usage check passed [current mem usage=%.1f%%]\n",
00658 p->name, pi->mem_percent/10.0);
00659
00660 }
00661 }
00662 break;
00663
00664 case RESOURCE_ID_MEM_KBYTE:
00665
00666 if(compare_value(pr->operator, pi->mem_kbyte, pr->limit)) {
00667
00668 vlog(report, STRLEN, p,
00669 "mem amount of %ldkB matches resource limit [mem amount%s%ldkB]",
00670 pi->mem_kbyte, operatorshortnames[pr->operator],
00671 pr->limit);
00672
00673 okay= FALSE;
00674
00675 } else {
00676
00677 if(Run.debug) {
00678
00679 log("'%s' mem amount check passed [current mem amount=%ldkB]\n",
00680 p->name, pi->mem_kbyte);
00681
00682 }
00683 }
00684 break;
00685
00686 case RESOURCE_ID_LOAD1:
00687
00688 if(compare_value(pr->operator, (int)(Run.loadavg[0]*10.0), pr->limit)) {
00689
00690 vlog(report, STRLEN, p,
00691 "loadavg(1min) of %.1f matches resource limit "
00692 "[loadavg(1min)%s%.1f]",
00693 Run.loadavg[0], operatorshortnames[pr->operator],
00694 pr->limit/10.0);
00695
00696 okay= FALSE;
00697
00698 } else {
00699
00700 if(Run.debug) {
00701
00702 log("'%s' loadavg(1min) check passed [current loadavg(1min)=%.1f]\n",
00703 p->name, Run.loadavg[0]);
00704
00705 }
00706 }
00707 break;
00708
00709 case RESOURCE_ID_LOAD5:
00710
00711 if(compare_value(pr->operator, (int)(Run.loadavg[1]*10.0), pr->limit)) {
00712
00713 vlog(report, STRLEN, p,
00714 "loadavg(5min) of %.1f matches resource limit "
00715 "[loadavg(5min)%s%.1f]",
00716 Run.loadavg[1], operatorshortnames[pr->operator],
00717 pr->limit/10.0);
00718
00719 okay= FALSE;
00720
00721 } else {
00722
00723 if(Run.debug) {
00724
00725 log("'%s' loadavg(5min) check passed [current loadavg(5min)=%.1f]\n",
00726 p->name, Run.loadavg[1]);
00727
00728 }
00729 }
00730 break;
00731
00732 case RESOURCE_ID_LOAD15:
00733
00734 if(compare_value(pr->operator, (int)(Run.loadavg[2]*10.0), pr->limit)) {
00735
00736 vlog(report, STRLEN, p,
00737 "loadavg(15min) of %.1f matches resource limit "
00738 "[loadavg(15min)%s%.1f]",
00739 Run.loadavg[2], operatorshortnames[pr->operator],
00740 pr->limit/10.0);
00741
00742 okay= FALSE;
00743
00744 } else {
00745
00746 if(Run.debug) {
00747
00748 log("'%s' loadavg(15min) check passed "
00749 "[current loadavg(15min)=%.1f]\n",
00750 p->name, Run.loadavg[2]);
00751
00752 }
00753 }
00754 break;
00755
00756 default:
00757
00758 log("'%s' error: unknow resource ID: [%d]\n", p->name, pr->resource_id);
00759
00760 }
00761
00762 if(okay && pr->cycle > 0) {
00763
00764 pr->cycle--;
00765
00766 } else if(! okay) {
00767
00768 pr->cycle++;
00769
00770 }
00771
00772 if(pr->cycle >= pr->max_cycle) {
00773
00774 return FALSE;
00775
00776 }
00777
00778 return TRUE;
00779
00780 }
00781
00782
00786 static int check_timeout(Process_T p) {
00787
00788 ASSERT(p);
00789
00790 if(!p->def_timeout) {
00791
00792 return FALSE;
00793
00794 }
00795
00796
00797
00798
00799 if(p->nstart > 0) {
00800
00801 p->ncycle++;
00802
00803 }
00804
00805
00806
00807
00808 if(p->nstart >= p->to_start && p->ncycle <= p->to_cycle) {
00809
00810
00811
00812
00813
00814
00815 LOCK(Run.mutex)
00816 p->do_validate= FALSE;
00817 END_LOCK;
00818
00819
00820
00821
00822 log("**Alert** process '%s' timed out and will not be checked anymore.\n",
00823 p->name);
00824
00825 smtp_alert_timeout(p, NULL);
00826
00827 return TRUE;
00828
00829 }
00830
00831
00832
00833
00834
00835 if(p->ncycle > p->to_cycle) {
00836
00837 p->ncycle= 0;
00838 p->nstart= 0;
00839
00840 }
00841
00842 return FALSE;
00843
00844 }
00845
00846
00851 static int check_skip(Process_T p) {
00852
00853 ASSERT(p);
00854
00855 if(!p->def_every) {
00856
00857 return FALSE;
00858
00859 }
00860
00861 if(++p->nevery < p->every) {
00862
00863 return TRUE;
00864
00865 }
00866
00867 p->nevery= 0;
00868
00869 return FALSE;
00870
00871 }
00872
00873
00881 static int check_checksum(Process_T p) {
00882
00883 Checksum_T c;
00884
00885 ASSERT(p);
00886
00887 if(!p->def_checksum) {
00888
00889 return FALSE;
00890
00891 }
00892
00893 for(c= p->checksumlist; c; c= c->next) {
00894
00895 if(! checksum_helper(p, c->file, c->md5)) {
00896
00897 return TRUE;
00898
00899 }
00900
00901 }
00902
00903 if(Run.debug) {
00904
00905 log("'%s' have valid checksums\n", p->name);
00906
00907 }
00908
00909 return FALSE;
00910
00911 }
00912
00913
00920 static int checksum_helper(Process_T p, char *program, char *sum) {
00921
00922 int rv= TRUE;
00923
00924 ASSERT(p);
00925
00926 if(program && sum) {
00927
00928 if(!check_md5(program, sum)) {
00929
00930 log("'%s' **Alert** checksum error for %s\n", p->name, program);
00931
00932 smtp_alert_checksum(p, NULL);
00933
00934 LOCK(Run.mutex)
00935 p->do_validate= FALSE;
00936 p->has_checksum_error= TRUE;
00937 END_LOCK;
00938
00939 rv= FALSE;
00940
00941 }
00942
00943 }
00944
00945 return rv;
00946
00947 }
00948
00949
00953 static int check_timestamp(Process_T p, Timestamp_T t, char *report) {
00954
00955 time_t now;
00956 time_t timestamp;
00957
00958 ASSERT(p);
00959 ASSERT(t);
00960
00961 if((int)time(&now) == -1) {
00962 vlog(report, STRLEN, p, "can't get actual time");
00963 return FALSE;
00964 }
00965
00966 if(!(timestamp= get_timestamp(t->pathname, S_IFDIR|S_IFREG))) {
00967 vlog(report, STRLEN, p, "can't get timestamp for %s", t->pathname);
00968 return FALSE;
00969 }
00970
00971 if(compare_value(t->operator, (int)(now - timestamp), t->time)) {
00972 vlog(report, STRLEN, p, "timestamp test failed for %s", t->pathname);
00973 return FALSE;
00974 }
00975
00976 if(Run.debug)
00977 log("'%s' timestamp test passed for %s\n", p->name, t->pathname);
00978
00979 return TRUE;
00980
00981 }
00982
00983
00987 static void connection_timeout(int sig) {
00988
00989 siglongjmp(timeout, TRUE);
00990
00991 }
00992
00993
00997 static void vlog(char * report, int n, Process_T p, char *m,...) {
00998
00999 va_list ap;
01000 char *tmp = NULL;
01001
01002 va_start(ap, m);
01003
01004 if(m) {
01005
01006 tmp=format(m,ap);
01007
01008 }
01009
01010 va_end(ap);
01011
01012 strncpy(report, tmp, n);
01013 log("'%s' %s\n", p->name, report);
01014
01015 free(tmp);
01016 }
01017
01018
01023 static int compare_value(int operator, int left, int right) {
01024
01025 switch(operator) {
01026
01027 case OPERATOR_GREATER:
01028
01029 if(left > right)
01030 return TRUE;
01031 break;
01032
01033 case OPERATOR_LESS:
01034
01035 if(left < right)
01036 return TRUE;
01037 break;
01038
01039 case OPERATOR_EQUAL:
01040
01041 if(left == right)
01042 return TRUE;
01043 break;
01044
01045 case OPERATOR_NOTEQUAL:
01046
01047 if(left != right)
01048 return TRUE;
01049 break;
01050
01051 default:
01052 error("Unknow comparison operator\n");
01053 return FALSE;
01054
01055 }
01056
01057 return FALSE;
01058
01059 }
01060