2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #ifdef AFS_PTHREAD_ENV
17 # include <opr/lock.h>
19 # include <opr/lockstub.h>
23 #include <afs/afsutil.h>
25 #define UBIK_INTERNALS
31 * The goal is to provide reliable operation among N servers, such that any
32 * server can crash with the remaining servers continuing operation within a
33 * short period of time. While a \b short outage is acceptable, this time
34 * should be order of 3 minutes or less.
36 * Theory of operation:
38 * Note: #SMALLTIME and #BIGTIME are essentially the same time value, separated
39 * only by the clock skew, #MAXSKEW. In general, if you are making guarantees
40 * for someone else, promise them no more than #SMALLTIME seconds of whatever
41 * invariant you provide. If you are waiting to be sure some invariant is now
42 * \b false, wait at least #BIGTIME seconds to be sure that #SMALLTIME seconds
43 * has passed at the other site.
45 * Now, back to the design:
46 * One site in the collection is a special site, designated the \b sync site.
47 * The sync site sends periodic messages, which can be thought of as
48 * keep-alive messages. When a non-sync site hears from the sync site, it
49 * knows that it is getting updates for the next #SMALLTIME seconds from that
52 * If a server does not hear from the sync site in #SMALLTIME seconds, it
53 * determines that it no longer is getting updates, and thus refuses to give
54 * out potentially out-of-date data. If a sync site can not muster a majority
55 * of servers to agree that it is the sync site, then there is a possibility
56 * that a network partition has occurred, allowing another server to claim to
57 * be the sync site. Thus, any time that the sync site has not heard from a
58 * majority of the servers in the last #SMALLTIME seconds, it voluntarily
59 * relinquishes its role as sync site.
61 * While attempting to nominate a new sync site, certain rules apply. First,
62 * a server can not reply "ok" (return 1 from ServBeacon) to two different
63 * hosts in less than #BIGTIME seconds; this allows a server that has heard
64 * affirmative replies from a majority of the servers to know that no other
65 * server in the network has heard enough affirmative replies in the last
66 * #BIGTIME seconds to become sync site, too. The variables #ubik_lastYesTime
67 * and #lastYesHost are used by all servers to keep track of which host they
68 * have last replied affirmatively to, when queried by a potential new sync
71 * Once a sync site has become a sync site, it periodically sends beacon
72 * messages with a parameter of 1, indicating that it already has determined
73 * it is supposed to be the sync site. The servers treat such a message as a
74 * guarantee that no other site will become sync site for the next #SMALLTIME
75 * seconds. In the interim, these servers can answer a query concerning which
76 * site is the sync site without any communication with any server. The
77 * variables #lastBeaconArrival and #lastBeaconHost are used by all servers to
78 * keep track of which sync site has last contacted them.
80 * One complication occurs while nominating a new sync site: each site may be
81 * trying to nominate a different site (based on the value of #lastYesHost),
82 * yet we must nominate the smallest host (under some order), to prevent this
83 * process from looping. The process could loop by having each server give
84 * one vote to another server, but with no server getting a majority of the
85 * votes. To avoid this, we try to withhold our votes for the server with the
86 * lowest internet address (an easy-to-generate order). To this effect, we
87 * keep track (in #lowestTime and #lowestHost) of the lowest server trying to
88 * become a sync site. We wait for this server unless there is already a sync
89 * site (indicated by ServBeacon's parameter being 1).
92 afs_int32 ubik_debugFlag
= 0; /*!< print out debugging messages? */
94 struct vote_data vote_globals
;
98 * \brief Decide if we should try to become sync site.
100 * The basic rule is that we
101 * don't run if there is a valid sync site and it ain't us (we have to run if
102 * it is us, in order to keep our votes). If there is no sync site, then we
103 * want to run if we're the lowest numbered host running, otherwise we defer to
104 * the lowest host. However, if the lowest host hasn't been heard from for a
105 * while, then we start running again, in case he crashed.
107 * \return true if we should run, and false otherwise.
110 uvote_ShouldIRun(void)
113 int code
= 1; /* default to yes */
116 return 0; /* if we cannot be the sync-site, do not ask for votes */
120 now
= FT_ApproxTime();
121 if (BIGTIME
+ vote_globals
.ubik_lastYesTime
< now
)
123 if (vote_globals
.lastYesState
&& vote_globals
.lastYesHost
!= ubik_host
[0]) {
124 code
= 0; /* other guy is sync site, leave him alone */
127 if (ntohl((afs_uint32
)vote_globals
.lastYesHost
) < ntohl((afs_uint32
)ubik_host
[0])) {
128 code
= 0; /* if someone is valid and better than us, don't run */
138 * \brief Return the current synchronization site, if any.
140 * Simple approach: if the
141 * last guy we voted yes for claims to be the sync site, then we we're happy to
142 * use that guy for a sync site until the time his mandate expires. If the guy
143 * does not claim to be sync site, then, of course, there's none.
145 * In addition, if we lost the sync, we set #urecovery_syncSite to an invalid
146 * value, indicating that we no longer know which version of the dbase is the
147 * one we should have. We'll get a new one when we next hear from the sync
150 * \return 0 or currently valid sync site. It can return our own
151 * address, if we're the sync site.
154 uvote_GetSyncSite(void)
160 if (!vote_globals
.lastYesState
)
163 now
= FT_ApproxTime();
164 if (SMALLTIME
+ vote_globals
.lastYesClaim
< now
)
165 code
= 0; /* last guy timed out */
167 code
= vote_globals
.lastYesHost
;
174 * \brief called by the sync site to handle vote beacons; if aconn is null, this is a
177 * \returns 0 or time when the vote was sent. It returns 0 if we are
178 * not voting for this sync site, or the time we actually voted yes, if
182 SVOTE_Beacon(struct rx_call
* rxcall
, afs_int32 astate
,
183 afs_int32 astart
, struct ubik_version
* avers
,
184 struct ubik_tid
* atid
)
189 struct rx_connection
*aconn
;
191 struct ubik_server
*ts
;
195 if (rxcall
) { /* caller's host */
196 aconn
= rx_ConnectionOf(rxcall
);
197 rxp
= rx_PeerOf(aconn
);
198 otherHost
= rx_HostOf(rxp
);
200 /* get the primary interface address for this host. */
201 /* This is the identifier that ubik uses. */
202 otherHost
= ubikGetPrimaryInterfaceAddr(otherHost
);
204 ViceLog(5, ("Received beacon from unknown host %s\n",
205 afs_inet_ntoa_r(rx_HostOf(rxp
), hoststr
)));
206 return 0; /* I don't know about you: vote no */
208 for (ts
= ubik_servers
; ts
; ts
= ts
->next
) {
209 if (ts
->addr
[0] == otherHost
)
213 ViceLog(5, ("Unknown host %x has sent a beacon\n", otherHost
));
214 if (ts
&& ts
->isClone
)
217 otherHost
= ubik_host
[0]; /* this host */
221 ViceLog(5, ("Received beacon type %d from host %s\n", astate
,
222 afs_inet_ntoa_r(otherHost
, hoststr
)));
224 /* compute the lowest server we've heard from. We'll try to only vote for
225 * this dude if we don't already have a synchronization site. Also, don't
226 * let a very old lowestHost confusing things forever. We pick a new
227 * lowestHost after BIGTIME seconds to limit the damage if this host
228 * actually crashes. Finally, we also count in this computation: don't
229 * pick someone else if we're even better!
231 * Note that the test below must be <=, not <, so that we keep refreshing
232 * lowestTime. Otherwise it will look like we haven't heard from
233 * lowestHost in a while and another host could slip in. */
236 /* First compute the lowest host we've heard from, whether we want them
237 * for a sync site or not. If we haven't heard from a site in BIGTIME
238 * seconds, we ignore its presence in lowestHost: it may have crashed.
239 * Note that we don't ever let anyone appear in our lowestHost if we're
240 * lower than them, 'cause we know we're up. */
241 /* But do not consider clones for lowesHost since they never may become
244 now
= FT_ApproxTime(); /* close to current time */
246 && (ntohl((afs_uint32
)otherHost
) <= ntohl((afs_uint32
)vote_globals
.lowestHost
)
247 || vote_globals
.lowestTime
+ BIGTIME
< now
)) {
248 vote_globals
.lowestTime
= now
;
249 vote_globals
.lowestHost
= otherHost
;
251 /* why do we need this next check? Consider the case where each of two
252 * servers decides the other is lowestHost. Each stops sending beacons
253 * 'cause the other is there. Not obvious that this process terminates:
254 * i.e. each guy could restart procedure and again think other side is
255 * lowest. Need to prove: if one guy in the system is lowest and knows
256 * he's lowest, these loops don't occur. because if someone knows he's
257 * lowest, he will send out beacons telling others to vote for him. */
259 && (ntohl((afs_uint32
) ubik_host
[0]) <= ntohl((afs_uint32
)vote_globals
.lowestHost
)
260 || vote_globals
.lowestTime
+ BIGTIME
< now
)) {
261 vote_globals
.lowestTime
= now
;
262 vote_globals
.lowestHost
= ubik_host
[0];
265 /* tell if we've heard from a sync site recently (even if we're not voting
266 * for this dude yet). After a while, time the guy out. */
267 if (astate
) { /* this guy is a sync site */
268 vote_globals
.syncHost
= otherHost
;
269 vote_globals
.syncTime
= now
;
270 } else if (vote_globals
.syncTime
+ BIGTIME
< now
) {
271 if (vote_globals
.syncHost
) {
272 ViceLog(5, ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
273 afs_inet_ntoa_r(vote_globals
.syncHost
, hoststr
)));
275 vote_globals
.syncHost
= 0;
278 /* decide how to vote */
279 vote
= 0; /* start off voting no */
281 /* if we this guy isn't a sync site, we don't really have to vote for him.
282 * We get to apply some heuristics to try to avoid weird oscillation sates
283 * in the voting procedure. */
285 /* in here only if this guy doesn't claim to be a sync site */
287 /* lowestHost is also trying for our votes, then just say no. */
288 if (ntohl(vote_globals
.lowestHost
) != ntohl(otherHost
)) {
292 /* someone else *is* a sync site, just say no */
293 if (vote_globals
.syncHost
&& vote_globals
.syncHost
!= otherHost
)
295 } else if (vote_globals
.lastYesHost
== 0xffffffff && otherHost
== ubik_host
[0]) {
296 /* fast startup if this is the only non-clone */
298 for (ts
= ubik_servers
; ts
; ts
= ts
->next
) {
299 if (ts
->addr
[0] == otherHost
)
305 vote_globals
.lastYesHost
= otherHost
;
310 goto done_zero
; /* clone never can become sync site */
312 /* Don't promise sync site support to more than one host every BIGTIME
313 * seconds. This is the heart of our invariants in this system. */
314 if (vote_globals
.ubik_lastYesTime
+ BIGTIME
< now
|| otherHost
== vote_globals
.lastYesHost
) {
315 if ((vote_globals
.ubik_lastYesTime
+ BIGTIME
< now
) || (otherHost
!= vote_globals
.lastYesHost
)
316 || (vote_globals
.lastYesState
!= astate
)) {
317 /* A new vote or a change in the vote or changed quorum */
318 ViceLog(5, ("Ubik: vote 'yes' for %s %s\n",
319 afs_inet_ntoa_r(otherHost
, hoststr
),
320 (astate
? "(in quorum)" : "(NOT in quorum)")));
323 vote
= now
; /* vote yes */
324 vote_globals
.ubik_lastYesTime
= now
; /* remember when we voted yes */
325 vote_globals
.lastYesClaim
= astart
; /* remember for computing when sync site expires */
326 vote_globals
.lastYesHost
= otherHost
; /* and who for */
327 vote_globals
.lastYesState
= astate
; /* remember if site is a sync site */
328 vote_globals
.ubik_dbVersion
= *avers
; /* resync value */
329 vote_globals
.ubik_dbTid
= *atid
; /* transaction id, if any, of active trans */
332 urecovery_CheckTid(atid
, 0); /* check if current write trans needs aborted */
344 * \brief Handle per-server debug command, where 0 is the first server.
346 * Basic network debugging hooks.
349 SVOTE_SDebug(struct rx_call
* rxcall
, afs_int32 awhich
,
350 struct ubik_sdebug
* aparm
)
352 afs_int32 code
, isClone
;
353 code
= SVOTE_XSDebug(rxcall
, awhich
, aparm
, &isClone
);
358 SVOTE_XSDebug(struct rx_call
* rxcall
, afs_int32 awhich
,
359 struct ubik_sdebug
* aparm
, afs_int32
* isclone
)
361 struct ubik_server
*ts
;
363 for (ts
= ubik_servers
; ts
; ts
= ts
->next
) {
366 aparm
->addr
= ntohl(ts
->addr
[0]); /* primary interface */
367 for (i
= 0; i
< UBIK_MAX_INTERFACE_ADDR
- 1; i
++)
368 aparm
->altAddr
[i
] = ntohl(ts
->addr
[i
+ 1]);
369 aparm
->lastVoteTime
= ts
->lastVoteTime
;
370 aparm
->lastBeaconSent
= ts
->lastBeaconSent
;
371 memcpy(&aparm
->remoteVersion
, &ts
->version
,
372 sizeof(struct ubik_version
));
373 aparm
->lastVote
= ts
->lastVote
;
375 aparm
->beaconSinceDown
= ts
->beaconSinceDown
;
376 aparm
->currentDB
= ts
->currentDB
;
377 *isclone
= ts
->isClone
;
385 SVOTE_XDebug(struct rx_call
* rxcall
, struct ubik_debug
* aparm
,
390 code
= SVOTE_Debug(rxcall
, aparm
);
396 * \brief Handle basic network debug command. This is the global state dumper.
399 SVOTE_Debug(struct rx_call
* rxcall
, struct ubik_debug
* aparm
)
402 /* fill in the basic debug structure. Note the the RPC protocol transfers,
403 * integers in host order. */
405 memset(aparm
, 0, sizeof(*aparm
));
406 aparm
->now
= FT_ApproxTime();
407 aparm
->lastYesTime
= vote_globals
.ubik_lastYesTime
;
408 aparm
->lastYesHost
= ntohl(vote_globals
.lastYesHost
);
409 aparm
->lastYesState
= vote_globals
.lastYesState
;
410 aparm
->lastYesClaim
= vote_globals
.lastYesClaim
;
411 aparm
->lowestHost
= ntohl(vote_globals
.lowestHost
);
412 aparm
->lowestTime
= vote_globals
.lowestTime
;
413 aparm
->syncHost
= ntohl(vote_globals
.syncHost
);
414 aparm
->syncTime
= vote_globals
.syncTime
;
415 memcpy(&aparm
->syncVersion
, &vote_globals
.ubik_dbVersion
, sizeof(struct ubik_version
));
416 memcpy(&aparm
->syncTid
, &vote_globals
.ubik_dbTid
, sizeof(struct ubik_tid
));
418 /* fill in all interface addresses of myself in hostbyte order */
419 for (i
= 0; i
< UBIK_MAX_INTERFACE_ADDR
; i
++)
420 aparm
->interfaceAddr
[i
] = ntohl(ubik_host
[i
]);
422 aparm
->amSyncSite
= beacon_globals
.ubik_amSyncSite
;
423 ubeacon_Debug(aparm
);
429 /* Get the recovery state. The label of the database may not have
430 * been written yet but set the flag so udebug behavior remains.
433 aparm
->recoveryState
= urecovery_state
;
434 if ((urecovery_state
& UBIK_RECSYNCSITE
)
435 && (urecovery_state
& UBIK_RECFOUNDDB
)
436 && (urecovery_state
& UBIK_RECHAVEDB
)) {
437 aparm
->recoveryState
|= UBIK_RECLABELDB
;
439 aparm
->activeWrite
= (ubik_dbase
->flags
& DBWRITING
);
440 aparm
->tidCounter
= ubik_dbase
->tidCounter
;
442 if (ubik_currentTrans
) {
443 aparm
->currentTrans
= 1;
444 aparm
->writeTrans
= 1;
446 aparm
->currentTrans
= 0;
449 aparm
->epochTime
= version_globals
.ubik_epochTime
;
455 SVOTE_SDebugOld(struct rx_call
* rxcall
, afs_int32 awhich
,
456 struct ubik_sdebug_old
* aparm
)
458 struct ubik_server
*ts
;
460 for (ts
= ubik_servers
; ts
; ts
= ts
->next
) {
463 aparm
->addr
= ntohl(ts
->addr
[0]); /* primary interface */
464 aparm
->lastVoteTime
= ts
->lastVoteTime
;
465 aparm
->lastBeaconSent
= ts
->lastBeaconSent
;
466 memcpy(&aparm
->remoteVersion
, &ts
->version
,
467 sizeof(struct ubik_version
));
468 aparm
->lastVote
= ts
->lastVote
;
470 aparm
->beaconSinceDown
= ts
->beaconSinceDown
;
471 aparm
->currentDB
= ts
->currentDB
;
480 * \brief Handle basic network debug command. This is the global state dumper.
483 SVOTE_DebugOld(struct rx_call
* rxcall
,
484 struct ubik_debug_old
* aparm
)
487 /* fill in the basic debug structure. Note the the RPC protocol transfers,
488 * integers in host order. */
490 aparm
->now
= FT_ApproxTime();
491 aparm
->lastYesTime
= vote_globals
.ubik_lastYesTime
;
492 aparm
->lastYesHost
= ntohl(vote_globals
.lastYesHost
);
493 aparm
->lastYesState
= vote_globals
.lastYesState
;
494 aparm
->lastYesClaim
= vote_globals
.lastYesClaim
;
495 aparm
->lowestHost
= ntohl(vote_globals
.lowestHost
);
496 aparm
->lowestTime
= vote_globals
.lowestTime
;
497 aparm
->syncHost
= ntohl(vote_globals
.syncHost
);
498 aparm
->syncTime
= vote_globals
.syncTime
;
499 memcpy(&aparm
->syncVersion
, &vote_globals
.ubik_dbVersion
, sizeof(struct ubik_version
));
500 memcpy(&aparm
->syncTid
, &vote_globals
.ubik_dbTid
, sizeof(struct ubik_tid
));
502 aparm
->amSyncSite
= beacon_globals
.ubik_amSyncSite
;
503 ubeacon_Debug((ubik_debug
*)aparm
);
505 udisk_Debug((ubik_debug
*)aparm
);
507 ulock_Debug((ubik_debug
*)aparm
);
509 /* Get the recovery state. The label of the database may not have
510 * been written yet but set the flag so udebug behavior remains.
513 aparm
->recoveryState
= urecovery_state
;
514 if ((urecovery_state
& UBIK_RECSYNCSITE
)
515 && (urecovery_state
& UBIK_RECFOUNDDB
)
516 && (urecovery_state
& UBIK_RECHAVEDB
)) {
517 aparm
->recoveryState
|= UBIK_RECLABELDB
;
519 aparm
->activeWrite
= (ubik_dbase
->flags
& DBWRITING
);
520 aparm
->tidCounter
= ubik_dbase
->tidCounter
;
522 if (ubik_currentTrans
) {
523 aparm
->currentTrans
= 1;
524 aparm
->writeTrans
= 1;
526 aparm
->currentTrans
= 0;
529 aparm
->epochTime
= version_globals
.ubik_epochTime
;
536 * \brief Get the sync site; called by remote servers to find where they should go.
539 SVOTE_GetSyncSite(struct rx_call
* rxcall
,
544 temp
= uvote_GetSyncSite();
545 *ahost
= ntohl(temp
);
550 * \brief Called once/run to init the vote module
556 /* pretend we just voted for someone else, since we just restarted */
557 vote_globals
.ubik_lastYesTime
= FT_ApproxTime();
559 /* Initialize globals */
560 vote_globals
.lastYesHost
= 0xffffffff;
561 vote_globals
.lastYesClaim
= 0;
562 vote_globals
.lastYesState
= 0;
563 vote_globals
.lowestTime
= 0;
564 vote_globals
.lowestHost
= 0xffffffff;
565 vote_globals
.syncTime
= 0;
566 vote_globals
.syncHost
= 0;
573 uvote_set_dbVersion(struct ubik_version version
) {
575 vote_globals
.ubik_dbVersion
= version
;
579 /* Compare given version to current DB version. Return true if equal. */
581 uvote_eq_dbVersion(struct ubik_version version
) {
585 if (vote_globals
.ubik_dbVersion
.epoch
== version
.epoch
&& vote_globals
.ubik_dbVersion
.counter
== version
.counter
) {
593 * \brief Check if there is a sync site and whether we have a given db version
595 * \return 1 if there is a valid sync site, and the given db version matches the sync site's
599 uvote_HaveSyncAndVersion(struct ubik_version version
)
605 now
= FT_ApproxTime();
606 if (!vote_globals
.lastYesState
|| (SMALLTIME
+ vote_globals
.lastYesClaim
< now
) ||
607 vote_globals
.ubik_dbVersion
.epoch
!= version
.epoch
||
608 vote_globals
.ubik_dbVersion
.counter
!= version
.counter
)