Commit | Line | Data |
---|---|---|
805e021f CE |
1 | /* |
2 | * Copyright 2000, International Business Machines Corporation and others. | |
3 | * All Rights Reserved. | |
4 | * | |
5 | * This software has been released under the terms of the IBM Public | |
6 | * License. For details, see the LICENSE file in the top-level source | |
7 | * directory or online at http://www.openafs.org/dl/license10.html | |
8 | */ | |
9 | ||
10 | #include <afsconfig.h> | |
11 | #include <afs/param.h> | |
12 | ||
13 | #include <afs/procmgmt.h> | |
14 | #include <roken.h> | |
15 | ||
16 | #include <stddef.h> | |
17 | ||
18 | #include <lwp.h> | |
19 | #include <rx/rx.h> | |
20 | #include <afs/audit.h> | |
21 | #include <afs/afsutil.h> | |
22 | #include <afs/fileutil.h> | |
23 | #include <opr/queue.h> | |
24 | ||
25 | #include "bnode.h" | |
26 | #include "bnode_internal.h" | |
27 | #include "bosprototypes.h" | |
28 | ||
29 | #ifndef WCOREDUMP | |
30 | #define WCOREDUMP(x) ((x) & 0200) | |
31 | #endif | |
32 | ||
33 | #define BNODE_LWP_STACKSIZE (16 * 1024) | |
34 | #define BNODE_ERROR_COUNT_MAX 16 /* maximum number of retries */ | |
35 | #define BNODE_ERROR_DELAY_MAX 60 /* maximum retry delay (seconds) */ | |
36 | ||
37 | static PROCESS bproc_pid; /* pid of waker-upper */ | |
38 | static struct opr_queue allBnodes; /**< List of all bnodes */ | |
39 | static struct opr_queue allProcs; /**< List of all processes for which we're waiting */ | |
40 | static struct opr_queue allTypes; /**< List of all registered type handlers */ | |
41 | ||
42 | static struct bnode_stats { | |
43 | int weirdPids; | |
44 | } bnode_stats; | |
45 | ||
46 | extern const char *DoCore; | |
47 | extern const char *DoPidFiles; | |
48 | #ifndef AFS_NT40_ENV | |
49 | extern char **environ; /* env structure */ | |
50 | #endif | |
51 | ||
52 | int hdl_notifier(struct bnode_proc *tp); | |
53 | ||
54 | /* Remember the name of the process, if any, that failed last */ | |
55 | static void | |
56 | RememberProcName(struct bnode_proc *ap) | |
57 | { | |
58 | struct bnode *tbnodep; | |
59 | ||
60 | tbnodep = ap->bnode; | |
61 | if (tbnodep->lastErrorName) { | |
62 | free(tbnodep->lastErrorName); | |
63 | tbnodep->lastErrorName = NULL; | |
64 | } | |
65 | if (ap->coreName) | |
66 | tbnodep->lastErrorName = strdup(ap->coreName); | |
67 | } | |
68 | ||
69 | /* utility for use by BOP_HASCORE functions to determine where a core file might | |
70 | * be stored. | |
71 | */ | |
72 | int | |
73 | bnode_CoreName(struct bnode *abnode, char *acoreName, char *abuffer) | |
74 | { | |
75 | if (DoCore) { | |
76 | strcpy(abuffer, DoCore); | |
77 | strcat(abuffer, "/"); | |
78 | strcat(abuffer, AFSDIR_CORE_FILE); | |
79 | } else | |
80 | strcpy(abuffer, AFSDIR_SERVER_CORELOG_FILEPATH); | |
81 | if (acoreName) { | |
82 | strcat(abuffer, acoreName); | |
83 | strcat(abuffer, "."); | |
84 | } | |
85 | strcat(abuffer, abnode->name); | |
86 | return 0; | |
87 | } | |
88 | ||
89 | /* save core file, if any */ | |
90 | static void | |
91 | SaveCore(struct bnode *abnode, struct bnode_proc | |
92 | *aproc) | |
93 | { | |
94 | char tbuffer[256]; | |
95 | struct stat tstat; | |
96 | afs_int32 code = 0; | |
97 | char *corefile = NULL; | |
98 | #ifdef BOZO_SAVE_CORES | |
99 | struct timeval Start; | |
100 | struct tm *TimeFields; | |
101 | char FileName[256]; | |
102 | #endif | |
103 | ||
104 | /* Linux always appends the PID to core dumps from threaded processes, so | |
105 | * we have to scan the directory to find core files under another name. */ | |
106 | if (DoCore) { | |
107 | strcpy(tbuffer, DoCore); | |
108 | strcat(tbuffer, "/"); | |
109 | strcat(tbuffer, AFSDIR_CORE_FILE); | |
110 | } else | |
111 | code = stat(AFSDIR_SERVER_CORELOG_FILEPATH, &tstat); | |
112 | if (code) { | |
113 | DIR *logdir; | |
114 | struct dirent *file; | |
115 | unsigned long pid; | |
116 | const char *coredir = AFSDIR_LOGS_DIR; | |
117 | ||
118 | if (DoCore) | |
119 | coredir = DoCore; | |
120 | ||
121 | logdir = opendir(coredir); | |
122 | if (logdir == NULL) | |
123 | return; | |
124 | while ((file = readdir(logdir)) != NULL) { | |
125 | if (strncmp(file->d_name, "core.", 5) != 0) | |
126 | continue; | |
127 | pid = atol(file->d_name + 5); | |
128 | if (pid == aproc->pid) { | |
129 | int r; | |
130 | ||
131 | r = asprintf(&corefile, "%s/%s", coredir, file->d_name); | |
132 | if (r < 0 || corefile == NULL) { | |
133 | closedir(logdir); | |
134 | return; | |
135 | } | |
136 | code = 0; | |
137 | break; | |
138 | } | |
139 | } | |
140 | closedir(logdir); | |
141 | } else { | |
142 | corefile = strdup(tbuffer); | |
143 | } | |
144 | if (code) | |
145 | return; | |
146 | ||
147 | bnode_CoreName(abnode, aproc->coreName, tbuffer); | |
148 | #ifdef BOZO_SAVE_CORES | |
149 | FT_GetTimeOfDay(&Start, 0); | |
150 | TimeFields = localtime(&Start.tv_sec); | |
151 | sprintf(FileName, "%s.%d%02d%02d%02d%02d%02d", tbuffer, | |
152 | TimeFields->tm_year + 1900, TimeFields->tm_mon + 1, TimeFields->tm_mday, | |
153 | TimeFields->tm_hour, TimeFields->tm_min, TimeFields->tm_sec); | |
154 | strcpy(tbuffer, FileName); | |
155 | #endif | |
156 | rk_rename(corefile, tbuffer); | |
157 | free(corefile); | |
158 | } | |
159 | ||
160 | int | |
161 | bnode_GetString(struct bnode *abnode, char *abuffer, | |
162 | afs_int32 alen) | |
163 | { | |
164 | return BOP_GETSTRING(abnode, abuffer, alen); | |
165 | } | |
166 | ||
167 | int | |
168 | bnode_GetParm(struct bnode *abnode, afs_int32 aindex, | |
169 | char *abuffer, afs_int32 alen) | |
170 | { | |
171 | return BOP_GETPARM(abnode, aindex, abuffer, alen); | |
172 | } | |
173 | ||
174 | int | |
175 | bnode_GetStat(struct bnode *abnode, afs_int32 * astatus) | |
176 | { | |
177 | return BOP_GETSTAT(abnode, astatus); | |
178 | } | |
179 | ||
/*
 * Forward to the bnode's BOP_RESTARTP operation and return its result
 * unchanged.
 */
int
bnode_RestartP(struct bnode *abnode)
{
    int rc;

    rc = BOP_RESTARTP(abnode);
    return rc;
}
185 | ||
186 | static int | |
187 | bnode_Check(struct bnode *abnode) | |
188 | { | |
189 | if (abnode->flags & BNODE_WAIT) { | |
190 | abnode->flags &= ~BNODE_WAIT; | |
191 | LWP_NoYieldSignal(abnode); | |
192 | } | |
193 | return 0; | |
194 | } | |
195 | ||
/*
 * Tell whether an instance has a core file, as reported by the bnode's
 * BOP_HASCORE operation.
 */
int
bnode_HasCore(struct bnode *abnode)
{
    int rc;

    rc = BOP_HASCORE(abnode);
    return rc;
}
202 | ||
203 | /* wait for all bnodes to stabilize */ | |
204 | int | |
205 | bnode_WaitAll(void) | |
206 | { | |
207 | struct opr_queue *cursor; | |
208 | afs_int32 code; | |
209 | afs_int32 stat; | |
210 | ||
211 | retry: | |
212 | for (opr_queue_Scan(&allBnodes, cursor)) { | |
213 | struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q); | |
214 | ||
215 | bnode_Hold(tb); | |
216 | code = BOP_GETSTAT(tb, &stat); | |
217 | if (code) { | |
218 | bnode_Release(tb); | |
219 | return code; | |
220 | } | |
221 | if (stat != tb->goal) { | |
222 | tb->flags |= BNODE_WAIT; | |
223 | LWP_WaitProcess(tb); | |
224 | bnode_Release(tb); | |
225 | goto retry; | |
226 | } | |
227 | bnode_Release(tb); | |
228 | } | |
229 | return 0; | |
230 | } | |
231 | ||
232 | /* wait until bnode status is correct */ | |
233 | int | |
234 | bnode_WaitStatus(struct bnode *abnode, int astatus) | |
235 | { | |
236 | afs_int32 code; | |
237 | afs_int32 stat; | |
238 | ||
239 | bnode_Hold(abnode); | |
240 | while (1) { | |
241 | /* get the status */ | |
242 | code = BOP_GETSTAT(abnode, &stat); | |
243 | if (code) | |
244 | return code; | |
245 | ||
246 | /* otherwise, check if we're done */ | |
247 | if (stat == astatus) { | |
248 | bnode_Release(abnode); | |
249 | return 0; /* done */ | |
250 | } | |
251 | if (astatus != abnode->goal) { | |
252 | bnode_Release(abnode); | |
253 | return -1; /* no longer our goal, don't keep waiting */ | |
254 | } | |
255 | /* otherwise, block */ | |
256 | abnode->flags |= BNODE_WAIT; | |
257 | LWP_WaitProcess(abnode); | |
258 | } | |
259 | } | |
260 | ||
261 | int | |
262 | bnode_ResetErrorCount(struct bnode *abnode) | |
263 | { | |
264 | abnode->errorStopCount = 0; | |
265 | abnode->errorStopDelay = 0; | |
266 | return 0; | |
267 | } | |
268 | ||
269 | int | |
270 | bnode_SetStat(struct bnode *abnode, int agoal) | |
271 | { | |
272 | abnode->goal = agoal; | |
273 | bnode_Check(abnode); | |
274 | BOP_SETSTAT(abnode, agoal); | |
275 | abnode->flags &= ~BNODE_ERRORSTOP; | |
276 | return 0; | |
277 | } | |
278 | ||
279 | int | |
280 | bnode_SetGoal(struct bnode *abnode, int agoal) | |
281 | { | |
282 | abnode->goal = agoal; | |
283 | bnode_Check(abnode); | |
284 | return 0; | |
285 | } | |
286 | ||
287 | int | |
288 | bnode_SetFileGoal(struct bnode *abnode, int agoal) | |
289 | { | |
290 | if (abnode->fileGoal == agoal) | |
291 | return 0; /* already done */ | |
292 | abnode->fileGoal = agoal; | |
293 | WriteBozoFile(0); | |
294 | return 0; | |
295 | } | |
296 | ||
297 | /* apply a function to all bnodes in the system */ | |
298 | int | |
299 | bnode_ApplyInstance(int (*aproc) (struct bnode *tb, void *), void *arock) | |
300 | { | |
301 | struct opr_queue *cursor, *store; | |
302 | afs_int32 code; | |
303 | ||
304 | for (opr_queue_ScanSafe(&allBnodes, cursor, store)) { | |
305 | struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q); | |
306 | code = (*aproc) (tb, arock); | |
307 | if (code) | |
308 | return code; | |
309 | } | |
310 | return 0; | |
311 | } | |
312 | ||
313 | struct bnode * | |
314 | bnode_FindInstance(char *aname) | |
315 | { | |
316 | struct opr_queue *cursor; | |
317 | ||
318 | for (opr_queue_Scan(&allBnodes, cursor)) { | |
319 | struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q); | |
320 | ||
321 | if (!strcmp(tb->name, aname)) | |
322 | return tb; | |
323 | } | |
324 | return NULL; | |
325 | } | |
326 | ||
327 | static struct bnode_type * | |
328 | FindType(char *aname) | |
329 | { | |
330 | struct opr_queue *cursor; | |
331 | ||
332 | for (opr_queue_Scan(&allTypes, cursor)) { | |
333 | struct bnode_type *tt = opr_queue_Entry(cursor, struct bnode_type, q); | |
334 | ||
335 | if (!strcmp(tt->name, aname)) | |
336 | return tt; | |
337 | } | |
338 | return NULL; | |
339 | } | |
340 | ||
341 | int | |
342 | bnode_Register(char *atype, struct bnode_ops *aprocs, int anparms) | |
343 | { | |
344 | struct opr_queue *cursor; | |
345 | struct bnode_type *tt = NULL; | |
346 | ||
347 | for (opr_queue_Scan(&allTypes, cursor), tt = NULL) { | |
348 | tt = opr_queue_Entry(cursor, struct bnode_type, q); | |
349 | if (!strcmp(tt->name, atype)) | |
350 | break; | |
351 | } | |
352 | if (!tt) { | |
353 | tt = calloc(1, sizeof(struct bnode_type)); | |
354 | opr_queue_Init(&tt->q); | |
355 | opr_queue_Prepend(&allTypes, &tt->q); | |
356 | tt->name = atype; | |
357 | } | |
358 | tt->ops = aprocs; | |
359 | return 0; | |
360 | } | |
361 | ||
362 | afs_int32 | |
363 | bnode_Create(char *atype, char *ainstance, struct bnode ** abp, char *ap1, | |
364 | char *ap2, char *ap3, char *ap4, char *ap5, char *notifier, | |
365 | int fileGoal, int rewritefile) | |
366 | { | |
367 | struct bnode_type *type; | |
368 | struct bnode *tb; | |
369 | char *notifierpath = NULL; | |
370 | struct stat tstat; | |
371 | ||
372 | if (bnode_FindInstance(ainstance)) | |
373 | return BZEXISTS; | |
374 | type = FindType(atype); | |
375 | if (!type) | |
376 | return BZBADTYPE; | |
377 | ||
378 | if (notifier && strcmp(notifier, NONOTIFIER)) { | |
379 | /* construct local path from canonical (wire-format) path */ | |
380 | if (ConstructLocalBinPath(notifier, ¬ifierpath)) { | |
381 | bozo_Log("BNODE-Create: Notifier program path invalid '%s'\n", | |
382 | notifier); | |
383 | return BZNOCREATE; | |
384 | } | |
385 | ||
386 | if (stat(notifierpath, &tstat)) { | |
387 | bozo_Log("BNODE-Create: Notifier program '%s' not found\n", | |
388 | notifierpath); | |
389 | free(notifierpath); | |
390 | return BZNOCREATE; | |
391 | } | |
392 | } | |
393 | tb = (*type->ops->create) (ainstance, ap1, ap2, ap3, ap4, ap5); | |
394 | if (!tb) { | |
395 | free(notifierpath); | |
396 | return BZNOCREATE; | |
397 | } | |
398 | tb->notifier = notifierpath; | |
399 | *abp = tb; | |
400 | tb->type = type; | |
401 | ||
402 | /* The fs_create above calls bnode_InitBnode() which always sets the | |
403 | ** fileGoal to BSTAT_NORMAL .... overwrite it with whatever is passed into | |
404 | ** this function as a parameter... */ | |
405 | tb->fileGoal = fileGoal; | |
406 | ||
407 | bnode_SetStat(tb, tb->goal); /* nudge it once */ | |
408 | ||
409 | if (rewritefile != 0) | |
410 | WriteBozoFile(0); | |
411 | ||
412 | return 0; | |
413 | } | |
414 | ||
415 | int | |
416 | bnode_DeleteName(char *ainstance) | |
417 | { | |
418 | struct bnode *tb; | |
419 | ||
420 | tb = bnode_FindInstance(ainstance); | |
421 | if (!tb) | |
422 | return BZNOENT; | |
423 | ||
424 | return bnode_Delete(tb); | |
425 | } | |
426 | ||
427 | int | |
428 | bnode_Hold(struct bnode *abnode) | |
429 | { | |
430 | abnode->refCount++; | |
431 | return 0; | |
432 | } | |
433 | ||
434 | int | |
435 | bnode_Release(struct bnode *abnode) | |
436 | { | |
437 | abnode->refCount--; | |
438 | if (abnode->refCount == 0 && abnode->flags & BNODE_DELETE) { | |
439 | abnode->flags &= ~BNODE_DELETE; /* we're going for it */ | |
440 | bnode_Delete(abnode); | |
441 | } | |
442 | return 0; | |
443 | } | |
444 | ||
445 | int | |
446 | bnode_Delete(struct bnode *abnode) | |
447 | { | |
448 | afs_int32 code; | |
449 | afs_int32 temp; | |
450 | ||
451 | if (abnode->refCount != 0) { | |
452 | abnode->flags |= BNODE_DELETE; | |
453 | return 0; | |
454 | } | |
455 | ||
456 | /* make sure the bnode is idle before zapping */ | |
457 | bnode_Hold(abnode); | |
458 | code = BOP_GETSTAT(abnode, &temp); | |
459 | bnode_Release(abnode); | |
460 | if (code) | |
461 | return code; | |
462 | if (temp != BSTAT_SHUTDOWN) | |
463 | return BZBUSY; | |
464 | ||
465 | /* all clear to zap */ | |
466 | opr_queue_Remove(&abnode->q); | |
467 | free(abnode->name); /* do this first, since bnode fields may be bad after BOP_DELETE */ | |
468 | code = BOP_DELETE(abnode); /* don't play games like holding over this one */ | |
469 | WriteBozoFile(0); | |
470 | return code; | |
471 | } | |
472 | ||
473 | /* function to tell if there's a timeout coming up */ | |
474 | int | |
475 | bnode_PendingTimeout(struct bnode *abnode) | |
476 | { | |
477 | return (abnode->flags & BNODE_NEEDTIMEOUT); | |
478 | } | |
479 | ||
480 | /* function called to set / clear periodic bnode wakeup times */ | |
481 | int | |
482 | bnode_SetTimeout(struct bnode *abnode, afs_int32 atimeout) | |
483 | { | |
484 | if (atimeout != 0) { | |
485 | abnode->nextTimeout = FT_ApproxTime() + atimeout; | |
486 | abnode->flags |= BNODE_NEEDTIMEOUT; | |
487 | abnode->period = atimeout; | |
488 | IOMGR_Cancel(bproc_pid); | |
489 | } else { | |
490 | abnode->flags &= ~BNODE_NEEDTIMEOUT; | |
491 | } | |
492 | return 0; | |
493 | } | |
494 | ||
495 | /* used by new bnode creation code to format bnode header */ | |
496 | int | |
497 | bnode_InitBnode(struct bnode *abnode, struct bnode_ops *abnodeops, | |
498 | char *aname) | |
499 | { | |
500 | /* format the bnode properly */ | |
501 | memset(abnode, 0, sizeof(struct bnode)); | |
502 | opr_queue_Init(&abnode->q); | |
503 | abnode->ops = abnodeops; | |
504 | abnode->name = strdup(aname); | |
505 | if (!abnode->name) | |
506 | return ENOMEM; | |
507 | abnode->flags = BNODE_ACTIVE; | |
508 | abnode->fileGoal = BSTAT_NORMAL; | |
509 | abnode->goal = BSTAT_SHUTDOWN; | |
510 | ||
511 | /* put the bnode at the end of the list so we write bnode file in same order */ | |
512 | opr_queue_Append(&allBnodes, &abnode->q); | |
513 | ||
514 | return 0; | |
515 | } | |
516 | ||
517 | /* bnode lwp executes this code repeatedly */ | |
518 | static void * | |
519 | bproc(void *unused) | |
520 | { | |
521 | afs_int32 code; | |
522 | struct bnode *tb; | |
523 | afs_int32 temp; | |
524 | struct opr_queue *cursor, *store; | |
525 | struct bnode_proc *tp; | |
526 | int options; /* must not be register */ | |
527 | struct timeval tv; | |
528 | int setAny; | |
529 | int status; | |
530 | ||
531 | while (1) { | |
532 | /* first figure out how long to sleep for */ | |
533 | temp = 0x7fffffff; /* afs_int32 time; maxint doesn't work in select */ | |
534 | setAny = 0; | |
535 | for (opr_queue_Scan(&allBnodes, cursor)) { | |
536 | tb = opr_queue_Entry(cursor, struct bnode, q); | |
537 | if (tb->flags & BNODE_NEEDTIMEOUT) { | |
538 | if (tb->nextTimeout < temp) { | |
539 | setAny = 1; | |
540 | temp = tb->nextTimeout; | |
541 | } | |
542 | } | |
543 | } | |
544 | /* now temp has the time at which we should wakeup next */ | |
545 | ||
546 | /* sleep */ | |
547 | if (setAny) | |
548 | temp -= FT_ApproxTime(); /* how many seconds until next event */ | |
549 | else | |
550 | temp = 999999; | |
551 | if (temp > 0) { | |
552 | tv.tv_sec = temp; | |
553 | tv.tv_usec = 0; | |
554 | code = IOMGR_Select(0, 0, 0, 0, &tv); | |
555 | } else | |
556 | code = 0; /* fake timeout code */ | |
557 | ||
558 | /* figure out why we woke up; child exit or timeouts */ | |
559 | FT_GetTimeOfDay(&tv, 0); /* must do the real gettimeofday once and a while */ | |
560 | temp = tv.tv_sec; | |
561 | ||
562 | /* check all bnodes to see which ones need timeout events */ | |
563 | for (opr_queue_ScanSafe(&allBnodes, cursor, store)) { | |
564 | tb = opr_queue_Entry(cursor, struct bnode, q); | |
565 | if ((tb->flags & BNODE_NEEDTIMEOUT) && temp > tb->nextTimeout) { | |
566 | bnode_Hold(tb); | |
567 | BOP_TIMEOUT(tb); | |
568 | bnode_Check(tb); | |
569 | if (tb->flags & BNODE_NEEDTIMEOUT) { /* check again, BOP_TIMEOUT could change */ | |
570 | tb->nextTimeout = FT_ApproxTime() + tb->period; | |
571 | } | |
572 | bnode_Release(tb); /* delete may occur here */ | |
573 | } | |
574 | } | |
575 | ||
576 | if (code < 0) { | |
577 | /* signalled, probably by incoming signal */ | |
578 | while (1) { | |
579 | options = WNOHANG; | |
580 | code = waitpid((pid_t) - 1, &status, options); | |
581 | if (code == 0 || code == -1) | |
582 | break; /* all done */ | |
583 | /* otherwise code has a process id, which we now search for */ | |
584 | for (tp = NULL, opr_queue_Scan(&allProcs, cursor), tp = NULL) { | |
585 | tp = opr_queue_Entry(cursor, struct bnode_proc, q); | |
586 | ||
587 | if (tp->pid == code) | |
588 | break; | |
589 | } | |
590 | if (tp) { | |
591 | /* found the pid */ | |
592 | tb = tp->bnode; | |
593 | bnode_Hold(tb); | |
594 | ||
595 | /* count restarts in last 30 seconds */ | |
596 | if (temp > tb->rsTime + 30) { | |
597 | /* it's been 30 seconds we've been counting */ | |
598 | tb->rsTime = temp; | |
599 | tb->rsCount = 0; | |
600 | } | |
601 | ||
602 | ||
603 | if (WIFSIGNALED(status) == 0) { | |
604 | /* exited, not signalled */ | |
605 | tp->lastExit = WEXITSTATUS(status); | |
606 | tp->lastSignal = 0; | |
607 | if (tp->lastExit) { | |
608 | tb->errorCode = tp->lastExit; | |
609 | tb->lastErrorExit = FT_ApproxTime(); | |
610 | RememberProcName(tp); | |
611 | tb->errorSignal = 0; | |
612 | } | |
613 | if (tp->coreName) | |
614 | bozo_Log("%s:%s exited with code %d\n", tb->name, | |
615 | tp->coreName, tp->lastExit); | |
616 | else | |
617 | bozo_Log("%s exited with code %d\n", tb->name, | |
618 | tp->lastExit); | |
619 | } else { | |
620 | /* Signal occurred, perhaps spurious due to shutdown request. | |
621 | * If due to a shutdown request, don't overwrite last error | |
622 | * information. | |
623 | */ | |
624 | tp->lastSignal = WTERMSIG(status); | |
625 | tp->lastExit = 0; | |
626 | if (tp->lastSignal != SIGQUIT | |
627 | && tp->lastSignal != SIGTERM | |
628 | && tp->lastSignal != SIGKILL) { | |
629 | tb->errorSignal = tp->lastSignal; | |
630 | tb->lastErrorExit = FT_ApproxTime(); | |
631 | RememberProcName(tp); | |
632 | } | |
633 | if (tp->coreName) | |
634 | bozo_Log("%s:%s exited on signal %d%s\n", | |
635 | tb->name, tp->coreName, tp->lastSignal, | |
636 | WCOREDUMP(status) ? " (core dumped)" : | |
637 | ""); | |
638 | else | |
639 | bozo_Log("%s exited on signal %d%s\n", tb->name, | |
640 | tp->lastSignal, | |
641 | WCOREDUMP(status) ? " (core dumped)" : | |
642 | ""); | |
643 | SaveCore(tb, tp); | |
644 | } | |
645 | tb->lastAnyExit = FT_ApproxTime(); | |
646 | ||
647 | if (tb->notifier) { | |
648 | bozo_Log("BNODE: Notifier %s will be called\n", | |
649 | tb->notifier); | |
650 | hdl_notifier(tp); | |
651 | } | |
652 | ||
653 | if (tb->goal && tb->rsCount++ > 10) { | |
654 | /* 10 in 30 seconds */ | |
655 | if (tb->errorStopCount >= BNODE_ERROR_COUNT_MAX) { | |
656 | tb->errorStopDelay = 0; /* max reached, give up. */ | |
657 | } else { | |
658 | tb->errorStopCount++; | |
659 | if (!tb->errorStopDelay) { | |
660 | tb->errorStopDelay = 1; /* wait a second, then retry */ | |
661 | } else { | |
662 | tb->errorStopDelay *= 2; /* ramp up the retry delays */ | |
663 | } | |
664 | if (tb->errorStopDelay > BNODE_ERROR_DELAY_MAX) { | |
665 | tb->errorStopDelay = BNODE_ERROR_DELAY_MAX; /* cap the delay */ | |
666 | } | |
667 | } | |
668 | tb->flags |= BNODE_ERRORSTOP; | |
669 | bnode_SetGoal(tb, BSTAT_SHUTDOWN); | |
670 | bozo_Log | |
671 | ("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n", | |
672 | tb->name); | |
673 | } | |
674 | BOP_PROCEXIT(tb, tp); | |
675 | bnode_Check(tb); | |
676 | bnode_Release(tb); /* bnode delete can happen here */ | |
677 | opr_queue_Remove(&tp->q); | |
678 | free(tp); | |
679 | } else | |
680 | bnode_stats.weirdPids++; | |
681 | } | |
682 | } | |
683 | } | |
684 | AFS_UNREACHED(return(NULL)); | |
685 | } | |
686 | ||
687 | static afs_int32 | |
688 | SendNotifierData(int fd, struct bnode_proc *tp) | |
689 | { | |
690 | struct bnode *tb = tp->bnode; | |
691 | char buffer[1000], *bufp = buffer, *buf1; | |
692 | int len; | |
693 | ||
694 | /* | |
695 | * First sent out the bnode_proc struct | |
696 | */ | |
697 | (void)sprintf(bufp, "BEGIN bnode_proc\n"); | |
698 | bufp += strlen(bufp); | |
699 | (void)sprintf(bufp, "comLine: %s\n", tp->comLine); | |
700 | bufp += strlen(bufp); | |
701 | if (!(buf1 = tp->coreName)) | |
702 | buf1 = "(null)"; | |
703 | (void)sprintf(bufp, "coreName: %s\n", buf1); | |
704 | bufp += strlen(bufp); | |
705 | (void)sprintf(bufp, "pid: %ld\n", afs_printable_int32_ld(tp->pid)); | |
706 | bufp += strlen(bufp); | |
707 | (void)sprintf(bufp, "lastExit: %ld\n", afs_printable_int32_ld(tp->lastExit)); | |
708 | bufp += strlen(bufp); | |
709 | #ifdef notdef | |
710 | (void)sprintf(bufp, "lastSignal: %ld\n", afs_printable_int32_ld(tp->lastSignal)); | |
711 | bufp += strlen(bufp); | |
712 | #endif | |
713 | (void)sprintf(bufp, "flags: %ld\n", afs_printable_int32_ld(tp->flags)); | |
714 | bufp += strlen(bufp); | |
715 | (void)sprintf(bufp, "END bnode_proc\n"); | |
716 | bufp += strlen(bufp); | |
717 | len = (int)(bufp - buffer); | |
718 | if (write(fd, buffer, len) < 0) { | |
719 | return -1; | |
720 | } | |
721 | ||
722 | /* | |
723 | * Now sent out the bnode struct | |
724 | */ | |
725 | bufp = buffer; | |
726 | (void)sprintf(bufp, "BEGIN bnode\n"); | |
727 | bufp += strlen(bufp); | |
728 | (void)sprintf(bufp, "name: %s\n", tb->name); | |
729 | bufp += strlen(bufp); | |
730 | (void)sprintf(bufp, "rsTime: %ld\n", afs_printable_int32_ld(tb->rsTime)); | |
731 | bufp += strlen(bufp); | |
732 | (void)sprintf(bufp, "rsCount: %ld\n", afs_printable_int32_ld(tb->rsCount)); | |
733 | bufp += strlen(bufp); | |
734 | (void)sprintf(bufp, "procStartTime: %ld\n", afs_printable_int32_ld(tb->procStartTime)); | |
735 | bufp += strlen(bufp); | |
736 | (void)sprintf(bufp, "procStarts: %ld\n", afs_printable_int32_ld(tb->procStarts)); | |
737 | bufp += strlen(bufp); | |
738 | (void)sprintf(bufp, "lastAnyExit: %ld\n", afs_printable_int32_ld(tb->lastAnyExit)); | |
739 | bufp += strlen(bufp); | |
740 | (void)sprintf(bufp, "lastErrorExit: %ld\n", afs_printable_int32_ld(tb->lastErrorExit)); | |
741 | bufp += strlen(bufp); | |
742 | (void)sprintf(bufp, "errorCode: %ld\n", afs_printable_int32_ld(tb->errorCode)); | |
743 | bufp += strlen(bufp); | |
744 | (void)sprintf(bufp, "errorSignal: %ld\n", afs_printable_int32_ld(tb->errorSignal)); | |
745 | bufp += strlen(bufp); | |
746 | /* | |
747 | (void) sprintf(bufp, "lastErrorName: %s\n", tb->lastErrorName); | |
748 | bufp += strlen(bufp); | |
749 | */ | |
750 | (void)sprintf(bufp, "goal: %d\n", tb->goal); | |
751 | bufp += strlen(bufp); | |
752 | (void)sprintf(bufp, "END bnode\n"); | |
753 | bufp += strlen(bufp); | |
754 | len = (int)(bufp - buffer); | |
755 | if (write(fd, buffer, len) < 0) { | |
756 | return -1; | |
757 | } | |
758 | return 0; | |
759 | } | |
760 | ||
761 | int | |
762 | hdl_notifier(struct bnode_proc *tp) | |
763 | { | |
764 | #ifndef AFS_NT40_ENV /* NT notifier callout not yet implemented */ | |
765 | int pid; | |
766 | struct stat tstat; | |
767 | ||
768 | if (stat(tp->bnode->notifier, &tstat)) { | |
769 | bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n", | |
770 | tp->bnode->notifier); | |
771 | return (1); | |
772 | } | |
773 | if ((pid = fork()) == 0) { | |
774 | FILE *fout; | |
775 | struct bnode *tb = tp->bnode; | |
776 | ||
777 | #if defined(AFS_HPUX_ENV) || defined(AFS_SUN5_ENV) || defined(AFS_SGI51_ENV) | |
778 | setsid(); | |
779 | #elif defined(AFS_DARWIN90_ENV) | |
780 | setpgid(0, 0); | |
781 | #elif defined(AFS_LINUX20_ENV) || defined(AFS_AIX_ENV) | |
782 | setpgrp(); | |
783 | #else | |
784 | setpgrp(0, 0); | |
785 | #endif | |
786 | fout = popen(tb->notifier, "w"); | |
787 | if (fout == NULL) { | |
788 | bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n", | |
789 | tb->notifier); | |
790 | perror(tb->notifier); | |
791 | exit(1); | |
792 | } | |
793 | SendNotifierData(fileno(fout), tp); | |
794 | pclose(fout); | |
795 | exit(0); | |
796 | } else if (pid < 0) { | |
797 | bozo_Log("Failed to fork creating process to handle notifier '%s'\n", | |
798 | tp->bnode->notifier); | |
799 | return -1; | |
800 | } | |
801 | #endif /* AFS_NT40_ENV */ | |
802 | return (0); | |
803 | } | |
804 | ||
805 | /* Called by IOMGR at low priority on IOMGR's stack shortly after a SIGCHLD | |
806 | * occurs. Wakes up bproc do redo things */ | |
807 | void * | |
808 | bnode_SoftInt(void *param) | |
809 | { | |
810 | /* int asignal = (int) param; */ | |
811 | ||
812 | IOMGR_Cancel(bproc_pid); | |
813 | return NULL; | |
814 | } | |
815 | ||
816 | /* Called at signal interrupt level; queues function to be called | |
817 | * when IOMGR runs again. | |
818 | */ | |
819 | void | |
820 | bnode_Int(int asignal) | |
821 | { | |
822 | if (asignal == SIGQUIT || asignal == SIGTERM) { | |
823 | IOMGR_SoftSig(bozo_ShutdownAndExit, (void *)(intptr_t)asignal); | |
824 | } else { | |
825 | IOMGR_SoftSig(bnode_SoftInt, (void *)(intptr_t)asignal); | |
826 | } | |
827 | } | |
828 | ||
829 | ||
830 | /* intialize the whole system */ | |
831 | int | |
832 | bnode_Init(void) | |
833 | { | |
834 | PROCESS junk; | |
835 | afs_int32 code; | |
836 | struct sigaction newaction; | |
837 | static int initDone = 0; | |
838 | ||
839 | if (initDone) | |
840 | return 0; | |
841 | initDone = 1; | |
842 | opr_queue_Init(&allTypes); | |
843 | opr_queue_Init(&allProcs); | |
844 | opr_queue_Init(&allBnodes); | |
845 | memset(&bnode_stats, 0, sizeof(bnode_stats)); | |
846 | LWP_InitializeProcessSupport(1, &junk); /* just in case */ | |
847 | IOMGR_Initialize(); | |
848 | code = LWP_CreateProcess(bproc, BNODE_LWP_STACKSIZE, | |
849 | /* priority */ 1, (void *) /* parm */ 0, | |
850 | "bnode-manager", &bproc_pid); | |
851 | if (code) | |
852 | return code; | |
853 | memset(&newaction, 0, sizeof(newaction)); | |
854 | newaction.sa_handler = bnode_Int; | |
855 | code = sigaction(SIGCHLD, &newaction, NULL); | |
856 | if (code) | |
857 | return errno; | |
858 | code = sigaction(SIGQUIT, &newaction, NULL); | |
859 | if (code) | |
860 | return errno; | |
861 | code = sigaction(SIGTERM, &newaction, NULL); | |
862 | if (code) | |
863 | return errno; | |
864 | return code; | |
865 | } | |
866 | ||
867 | /* free token list returned by parseLine */ | |
868 | int | |
869 | bnode_FreeTokens(struct bnode_token *alist) | |
870 | { | |
871 | struct bnode_token *nlist; | |
872 | for (; alist; alist = nlist) { | |
873 | nlist = alist->next; | |
874 | free(alist->key); | |
875 | free(alist); | |
876 | } | |
877 | return 0; | |
878 | } | |
879 | ||
/*
 * Token-separator test used by bnode_ParseLine.  Returns 1 for NUL,
 * space, tab and newline; 0 for every other value.
 */
static int
space(int x)
{
    switch (x) {
    case 0:
    case ' ':
    case '\t':
    case '\n':
	return 1;
    default:
	return 0;
    }
}
888 | ||
889 | int | |
890 | bnode_ParseLine(char *aline, struct bnode_token **alist) | |
891 | { | |
892 | char tbuffer[256]; | |
893 | char *tptr = NULL; | |
894 | int inToken; | |
895 | struct bnode_token *first, *last; | |
896 | struct bnode_token *ttok; | |
897 | int tc; | |
898 | ||
899 | inToken = 0; /* not copying token chars at start */ | |
900 | first = (struct bnode_token *)0; | |
901 | last = (struct bnode_token *)0; | |
902 | while (1) { | |
903 | tc = *aline++; | |
904 | if (tc == 0 || space(tc)) { /* terminating null gets us in here, too */ | |
905 | if (inToken) { | |
906 | inToken = 0; /* end of this token */ | |
907 | *tptr++ = 0; | |
908 | ttok = malloc(sizeof(struct bnode_token)); | |
909 | ttok->next = (struct bnode_token *)0; | |
910 | ttok->key = strdup(tbuffer); | |
911 | if (last) { | |
912 | last->next = ttok; | |
913 | last = ttok; | |
914 | } else | |
915 | last = ttok; | |
916 | if (!first) | |
917 | first = ttok; | |
918 | } | |
919 | } else { | |
920 | /* an alpha character */ | |
921 | if (!inToken) { | |
922 | tptr = tbuffer; | |
923 | inToken = 1; | |
924 | } | |
925 | if (tptr - tbuffer >= sizeof(tbuffer)) | |
926 | return -1; /* token too long */ | |
927 | *tptr++ = tc; | |
928 | } | |
929 | if (tc == 0) { | |
930 | /* last token flushed 'cause space(0) --> true */ | |
931 | if (last) | |
932 | last->next = (struct bnode_token *)0; | |
933 | *alist = first; | |
934 | return 0; | |
935 | } | |
936 | } | |
937 | } | |
938 | ||
939 | #define MAXVARGS 128 | |
940 | int | |
941 | bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName, | |
942 | struct bnode_proc **aproc) | |
943 | { | |
944 | struct bnode_token *tlist, *tt; | |
945 | afs_int32 code; | |
946 | struct bnode_proc *tp; | |
947 | pid_t cpid; | |
948 | char *argv[MAXVARGS]; | |
949 | int i; | |
950 | ||
951 | code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */ | |
952 | if (code) | |
953 | return code; | |
954 | tp = calloc(1, sizeof(struct bnode_proc)); | |
955 | opr_queue_Init(&tp->q); | |
956 | tp->bnode = abnode; | |
957 | tp->comLine = aexecString; | |
958 | tp->coreName = coreName; /* may be null */ | |
959 | abnode->procStartTime = FT_ApproxTime(); | |
960 | abnode->procStarts++; | |
961 | ||
962 | /* convert linked list of tokens into argv structure */ | |
963 | for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) { | |
964 | argv[i] = tt->key; | |
965 | } | |
966 | argv[i] = NULL; /* null-terminated */ | |
967 | ||
968 | cpid = spawnprocve(argv[0], argv, environ, -1); | |
969 | osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END); | |
970 | ||
971 | if (cpid == (pid_t) - 1) { | |
972 | bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name); | |
973 | bnode_FreeTokens(tlist); | |
974 | free(tp); | |
975 | return errno; | |
976 | } | |
977 | bozo_Log("%s started pid %ld: %s\n", abnode->name, cpid, aexecString); | |
978 | ||
979 | bnode_FreeTokens(tlist); | |
980 | opr_queue_Prepend(&allProcs, &tp->q); | |
981 | *aproc = tp; | |
982 | tp->pid = cpid; | |
983 | tp->flags = BPROC_STARTED; | |
984 | tp->flags &= ~BPROC_EXITED; | |
985 | BOP_PROCSTARTED(abnode, tp); | |
986 | bnode_Check(abnode); | |
987 | return 0; | |
988 | } | |
989 | ||
990 | int | |
991 | bnode_StopProc(struct bnode_proc *aproc, int asignal) | |
992 | { | |
993 | int code; | |
994 | if (!(aproc->flags & BPROC_STARTED) || (aproc->flags & BPROC_EXITED)) | |
995 | return BZNOTACTIVE; | |
996 | ||
997 | osi_audit(BOSStopProcEvent, 0, AUD_STR, (aproc ? aproc->comLine : NULL), | |
998 | AUD_END); | |
999 | ||
1000 | code = kill(aproc->pid, asignal); | |
1001 | bnode_Check(aproc->bnode); | |
1002 | return code; | |
1003 | } |