| 1 | /* |
| 2 | * Copyright 2000, International Business Machines Corporation and others. |
| 3 | * All Rights Reserved. |
| 4 | * |
| 5 | * This software has been released under the terms of the IBM Public |
| 6 | * License. For details, see the LICENSE file in the top-level source |
| 7 | * directory or online at http://www.openafs.org/dl/license10.html |
| 8 | */ |
| 9 | |
| 10 | /* |
| 11 | * This package is used to actively manage the expiration of callbacks, |
| 12 | * so that the rest of the cache manager doesn't need to compute |
| 13 | * whether a callback has expired or not, but can tell with one simple |
| 14 | * check, that is, whether the CStatd bit is on or off. |
| 15 | * |
| 16 | * The base of the hash table moves periodically (every 128 seconds) |
| 17 | * QueueCallback rarely touches the first 3 slots in the hash table |
| 18 | * (only when called from CheckCallbacks) since MinTimeOut in |
| 19 | * viced/callback.c is currently 7 minutes. |
| 20 | * Therefore, CheckCallbacks should be able to run concurrently with |
| 21 | * QueueCallback, given the proper locking, of course. |
| 22 | * |
| 23 | * Note: |
| 24 | * 1. CheckCallbacks and BumpBase never run simultaneously. This is because |
| 25 | * they are only called from afs_Daemon. Therefore, base and basetime will |
| 26 | * always be consistent during CheckCallbacks. |
| 27 | * 2. cbHashT [base] rarely (if ever) gets stuff queued in it. The only way |
| 28 | * that could happen is CheckCallbacks might fencepost and move something in |
| 29 | * place, or BumpBase might push some stuff up. |
| 30 | * 3. Hash chains aren't particularly sorted. |
| 31 | * 4. The file server keeps its callback state around for 3 minutes |
| 32 | * longer than it promises the cache manager in order to account for |
| 33 | * clock skew, network delay, and other bogeymen. |
| 34 | * |
| 35 | * For now I just use one large lock, which is fine on a uniprocessor, |
| 36 | * since it's not held during any RPCs or low-priority I/O operations. |
| 37 | * To make this code MP-fast, you need no more locks than processors, |
| 38 | * but probably more than one. In measurements on MP-safe implementations, |
| 39 | * I have never seen any contention over the xcbhash lock. |
| 40 | * |
| 41 | * Incompatible operations: |
| 42 | * Enqueue and "dequeue of first vcache" in same slot |
| 43 | * dequeue and "dequeue of preceding vcache" in same slot |
| 44 | * dequeue and "dequeue of successive vcache" in same slot |
| 45 | * BumpBase pushing a list and enqueue in the new base slot |
| 46 | * Two enqueues in same slot |
| 47 | * more... |
| 48 | * |
| 49 | * Certain invariants exist: |
| 50 | * 1 Callback expiration times granted by a file server will never |
| 51 | * decrease for a particular vnode UNLESS a CallBack RPC is invoked |
| 52 | * by the server in the interim. |
| 53 | * 2 A vcache will always expire no sooner than the slot in which it is |
| 54 | * currently enqueued. Callback times granted by the server may |
| 55 | * increase, in which case the vcache will be updated in-place. As a |
| 56 | * result, it may expire later than the slot in which it is enqueued. |
 * Not to worry, the CheckCallbacks code will move it if necessary.
| 58 | * This approach means that busy vnodes won't be continually moved |
| 59 | * around within the expiry queue: they are only moved when they |
| 60 | * finally advance to the lead bucket. |
| 61 | * 3 Anything which has a callback on it must be in the expiry |
| 62 | * queue. In AFS 3.3, that means everything but symlinks (which |
| 63 | * are immutable), including contents of Read-Only volumes |
| 64 | * (which have callbacks by virtue of the whole-volume callback) |
| 65 | * |
| 66 | * QueueCallback only checks that its vcache is in the list |
| 67 | * somewhere, counting on invariant #1 to guarantee that the vcache |
| 68 | * won't be in a slot later than QueueCallback would otherwise place |
| 69 | * it. Therefore, whenever we turn off the CStatd bit on the vcache, we |
| 70 | * *must* remove the vcache from the expiry queue. Otherwise, we |
| 71 | * might have missed a CallBack RPC, and a subsequent callback might be |
| 72 | * granted with a shorter expiration time. |
| 73 | */ |
| 74 | #include <afsconfig.h> |
| 75 | #include "afs/param.h" |
| 76 | |
| 77 | |
| 78 | #include "afs/sysincludes.h" /*Standard vendor system headers */ |
| 79 | #include "afsincludes.h" /*AFS-based standard headers */ |
| 80 | #include "afs/afs_cbqueue.h" |
| 81 | #include "afs/afs.h" |
| 82 | #include "afs/lock.h" |
| 83 | #include "afs/afs_stats.h" |
| 84 | |
static unsigned int base = 0;	/* index of the hash slot currently expiring */
static unsigned int basetime = 0;	/* wall-clock time corresponding to slot 'base' */
static struct vcache *debugvc;	/* used only for post-mortem debugging */
struct bucket {
    struct afs_q head;		/* doubly-linked chain of vcaches (via callsort) */
    /* struct afs_lock lock;  only if you want lots of locks... */
};
static struct bucket cbHashT[CBHTSIZE];	/* expiry queue: one bucket per time slot */
struct afs_lock afs_xcbhash;	/* protects base, basetime, and all of cbHashT */
| 94 | |
| 95 | /* afs_QueueCallback |
| 96 | * Takes a write-locked vcache pointer and a callback expiration time |
| 97 | * as returned by the file server (ie, in units of 128 seconds from "now"). |
| 98 | * |
| 99 | * Uses the time as an index into a hash table, and inserts the vcache |
| 100 | * structure into the overflow chain. |
| 101 | * |
| 102 | * If the vcache is already on some hash chain, leave it there. |
| 103 | * CheckCallbacks will get to it eventually. In the meantime, it |
| 104 | * might get flushed, or it might already be on the right hash chain, |
| 105 | * so why bother messing with it now? |
| 106 | * |
| 107 | * NOTE: The caller must hold a write lock on afs_xcbhash |
| 108 | */ |
| 109 | |
| 110 | void |
| 111 | afs_QueueCallback(struct vcache *avc, unsigned int atime, struct volume *avp) |
| 112 | { |
| 113 | if (avp && (avp->expireTime < avc->cbExpires)) |
| 114 | avp->expireTime = avc->cbExpires; |
| 115 | if (!(avc->callsort.next)) { |
| 116 | atime = (atime + base) % CBHTSIZE; |
| 117 | QAdd(&(cbHashT[atime].head), &(avc->callsort)); |
| 118 | } |
| 119 | |
| 120 | return; |
| 121 | } /* afs_QueueCallback */ |
| 122 | |
| 123 | /* afs_DequeueCallback |
| 124 | * Takes a write-locked vcache pointer and removes it from the callback |
| 125 | * hash table, without knowing beforehand which slot it was in. |
| 126 | * |
| 127 | * for now, just get a lock on everything when doing the dequeue, don't |
| 128 | * worry about getting a lock on the individual slot. |
| 129 | * |
| 130 | * the only other places that do anything like dequeues are CheckCallbacks |
| 131 | * and BumpBase. |
| 132 | * |
| 133 | * NOTE: The caller must hold a write lock on afs_xcbhash |
| 134 | */ |
| 135 | void |
| 136 | afs_DequeueCallback(struct vcache *avc) |
| 137 | { |
| 138 | |
| 139 | debugvc = avc; |
| 140 | if (avc->callsort.prev) { |
| 141 | QRemove(&(avc->callsort)); |
| 142 | } else; /* must have got dequeued in a race */ |
| 143 | |
| 144 | return; |
| 145 | } /* afs_DequeueCallback */ |
| 146 | |
/* afs_CheckCallbacks
 * called periodically to determine which callbacks are likely to
 * expire in the next n second interval.  Preemptively marks them as
 * expired.  Rehashes items which are now in the wrong hash bucket.
 * Preemptively renew recently-accessed items.  Only removes things
 * from the first and second bucket (as long as secs < 128), and
 * inserts things into other, later buckets.  either need to advance
 * to the second bucket if secs spans two intervals, or else be
 * certain to call afs_CheckCallbacks immediately after calling
 * BumpBase (allows a little more slop but it's ok because file server
 * keeps 3 minutes of slop time)
 *
 * There is a little race between CheckCallbacks and any code which
 * updates cbExpires, always just prior to calling QueueCallback.  We
 * don't lock the vcache struct here (can't, or we'd risk deadlock),
 * so GetVCache (for example) may update cbExpires before or after #1
 * below.  If before, CheckCallbacks moves this entry to its proper
 * slot.  If after, GetVCache blocks in the call to QueueCallbacks,
 * this code dequeues the vcache, and then QueueCallbacks re-enqueues it.
 *
 * XXX to avoid the race, make QueueCallback take the "real" time
 * and update cbExpires under the xcbhash lock.
 *
 * NB #1: There's a little optimization here: if I go to invalidate a
 * RO vcache or volume, first check to see if the server is down.  If
 * it _is_, don't invalidate it, cuz we might just as well keep using
 * it.  Possibly, we could do the same thing for items in RW volumes,
 * but that bears some drinking about.
 *
 * Don't really need to invalidate the hints, we could just wait to see if
 * the dv has changed after a subsequent FetchStatus, but this is safer.
 */

/* Sanity check on the callback queue traversal: an upper bound on how many
 * entries can legitimately exist.  Allow for slop in the computation. */
#if defined(AFS_LINUX22_ENV)
#define CBQ_LIMIT (afs_maxvcount + 10)
#else
#define CBQ_LIMIT (afs_cacheStats + afs_stats_cmperf.vcacheXAllocs + 10)
#endif

void
afs_CheckCallbacks(unsigned int secs)
{
    struct vcache *tvc;
    struct afs_q *tq;
    struct afs_q *uq;		/* saved predecessor: tq may be unlinked below */
    afs_uint32 now;
    struct volume *tvp;
    int safety;			/* trips CBQ_LIMIT if the chain is damaged/cyclic */

    ObtainWriteLock(&afs_xcbhash, 85);	/* pretty likely I'm going to remove something */
    now = osi_Time();
    /* Walk the lead bucket backwards, capturing each predecessor before the
     * current entry can be unlinked or re-slotted. */
    for (safety = 0, tq = cbHashT[base].head.prev;
	 (safety <= CBQ_LIMIT) && (tq != &(cbHashT[base].head));
	 tq = uq, safety++) {

	uq = QPrev(tq);
	tvc = CBQTOV(tq);
	if (tvc->cbExpires < now + secs) {	/* race #1 here */
	    /* Get the volume, and if its callback expiration time is more than secs
	     * seconds into the future, update this vcache entry and requeue it below
	     */
	    if ((tvc->f.states & CRO)
		&& (tvp = afs_FindVolume(&(tvc->f.fid), READ_LOCK))) {
		if (tvp->expireTime > now + secs) {
		    /* whole-volume callback still good: adopt its expiry */
		    tvc->cbExpires = tvp->expireTime;	/* XXX race here */
		} else {
		    int i;
		    /* NB #1: only invalidate if at least one server is up;
		     * if they're all down we might as well keep using it. */
		    for (i = 0; i < AFS_MAXHOSTS && tvp->serverHost[i]; i++) {
			if (!(tvp->serverHost[i]->flags & SRVR_ISDOWN)) {
			    /* What about locking xvcache or vrefcount++ or
			     * write locking tvc? */
			    afs_StaleVCacheFlags(tvc, AFS_STALEVC_CBLOCKED |
						 AFS_STALEVC_SKIP_DNLC_FOR_INIT_FLUSHED,
						 CMValid | CUnique);
			    tvc->dchint = NULL;	/*invalidate em */
			    afs_ResetVolumeInfo(tvp);
			    break;
			}
		    }
		}
		afs_PutVolume(tvp, READ_LOCK);
	    } else {
		/* Do I need to worry about things like execsorwriters?
		 * What about locking xvcache or vrefcount++ or write locking tvc?
		 */
		afs_StaleVCacheFlags(tvc, AFS_STALEVC_CBLOCKED |
				     AFS_STALEVC_SKIP_DNLC_FOR_INIT_FLUSHED,
				     CMValid | CUnique);
	    }
	}

	if ((tvc->cbExpires > basetime) && CBHash(tvc->cbExpires - basetime)) {
	    /* it's been renewed on us.  Have to be careful not to put it back
	     * into this slot, or we may never get out of here.
	     */
	    int slot;
	    slot = (CBHash(tvc->cbExpires - basetime) + base) % CBHTSIZE;
	    if (slot != base) {
		/* QPrev guards against an entry already unlinked above */
		if (QPrev(tq))
		    QRemove(&(tvc->callsort));
		QAdd(&(cbHashT[slot].head), &(tvc->callsort));
		/* XXX remember to update volume expiration time */
		/* -- not needed for correctness, though */
	    }
	}
    }

    if (safety > CBQ_LIMIT) {
	/* Traversed more entries than can possibly exist -- the chain must
	 * be corrupt.  Drop every callback on the floor and start over. */
	afs_stats_cmperf.cbloops++;
	if (afs_paniconwarn)
	    osi_Panic("CheckCallbacks");

	afs_warn
	    ("AFS Internal Error (minor): please contact AFS Product Support.\n");
	ReleaseWriteLock(&afs_xcbhash);
	afs_FlushCBs();
	return;
    } else
	ReleaseWriteLock(&afs_xcbhash);


    /* XXX future optimization:
       if this item has been recently accessed, queue up a stat for it.
       {
       struct dcache * adc;

       ObtainReadLock(&afs_xdcache);
       if ((adc = tvc->quick.dc) && (adc->stamp == tvc->quick.stamp)
       && (afs_indexTimes[adc->index] > afs_indexCounter - 20)) {
       queue up the stat request
       }
       ReleaseReadLock(&afs_xdcache);
       }
     */

    return;
}				/* afs_CheckCallback */
| 285 | |
| 286 | /* afs_FlushCBs |
| 287 | * to be used only in dire circumstances, this drops all callbacks on |
| 288 | * the floor, without giving them back to the server. It's ok, the server can |
| 289 | * deal with it, but it is a little bit rude. |
| 290 | */ |
| 291 | void |
| 292 | afs_FlushCBs(void) |
| 293 | { |
| 294 | int i; |
| 295 | struct vcache *tvc; |
| 296 | |
| 297 | ObtainWriteLock(&afs_xcbhash, 86); /* pretty likely I'm going to remove something */ |
| 298 | |
| 299 | for (i = 0; i < VCSIZE; i++) /* reset all the vnodes */ |
| 300 | for (tvc = afs_vhashT[i]; tvc; tvc = tvc->hnext) { |
| 301 | afs_StaleVCacheFlags(tvc, AFS_STALEVC_CBLOCKED | |
| 302 | AFS_STALEVC_CLEARCB | |
| 303 | AFS_STALEVC_SKIP_DNLC_FOR_INIT_FLUSHED, 0); |
| 304 | tvc->dchint = NULL; /* invalidate hints */ |
| 305 | } |
| 306 | |
| 307 | afs_InitCBQueue(0); |
| 308 | |
| 309 | ReleaseWriteLock(&afs_xcbhash); |
| 310 | } |
| 311 | |
| 312 | /* afs_FlushServerCBs |
| 313 | * to be used only in dire circumstances, this drops all callbacks on |
| 314 | * the floor for a specific server, without giving them back to the server. |
| 315 | * It's ok, the server can deal with it, but it is a little bit rude. |
| 316 | */ |
| 317 | void |
| 318 | afs_FlushServerCBs(struct server *srvp) |
| 319 | { |
| 320 | int i; |
| 321 | struct vcache *tvc; |
| 322 | |
| 323 | ObtainWriteLock(&afs_xcbhash, 86); /* pretty likely I'm going to remove something */ |
| 324 | |
| 325 | for (i = 0; i < VCSIZE; i++) { /* reset all the vnodes */ |
| 326 | for (tvc = afs_vhashT[i]; tvc; tvc = tvc->hnext) { |
| 327 | if (tvc->callback == srvp) { |
| 328 | afs_StaleVCacheFlags(tvc, AFS_STALEVC_CBLOCKED | |
| 329 | AFS_STALEVC_CLEARCB | |
| 330 | AFS_STALEVC_SKIP_DNLC_FOR_INIT_FLUSHED, 0); |
| 331 | tvc->dchint = NULL; /* invalidate hints */ |
| 332 | } |
| 333 | } |
| 334 | } |
| 335 | |
| 336 | ReleaseWriteLock(&afs_xcbhash); |
| 337 | } |
| 338 | |
| 339 | /* afs_InitCBQueue |
| 340 | * called to initialize static and global variables associated with |
| 341 | * the Callback expiration management mechanism. |
| 342 | */ |
| 343 | void |
| 344 | afs_InitCBQueue(int doLockInit) |
| 345 | { |
| 346 | int i; |
| 347 | |
| 348 | memset(cbHashT, 0, CBHTSIZE * sizeof(struct bucket)); |
| 349 | for (i = 0; i < CBHTSIZE; i++) { |
| 350 | QInit(&(cbHashT[i].head)); |
| 351 | /* Lock_Init(&(cbHashT[i].lock)); only if you want lots of locks, which |
| 352 | * don't seem too useful at present. */ |
| 353 | } |
| 354 | base = 0; |
| 355 | basetime = osi_Time(); |
| 356 | if (doLockInit) |
| 357 | Lock_Init(&afs_xcbhash); |
| 358 | } |
| 359 | |
| 360 | /* Because there are no real-time guarantees, and especially because a |
| 361 | * thread may wait on a lock indefinitely, this routine has to be |
| 362 | * careful that it doesn't get permanently out-of-date. Important |
| 363 | * assumption: this routine is only called from afs_Daemon, so there |
| 364 | * can't be more than one instance of this running at any one time. |
| 365 | * Presumes that basetime is never 0, and is always sane. |
| 366 | * |
| 367 | * Before calling this routine, be sure that the first slot is pretty |
| 368 | * empty. This -20 is because the granularity of the checks in |
| 369 | * afs_Daemon is pretty large, so I'd rather err on the side of safety |
| 370 | * sometimes. The fact that I only bump basetime by CBHTSLOTLEN-1 |
| 371 | * instead of the whole CBHTSLOTLEN is also for "safety". |
| 372 | * Conceptually, it makes this clock run just a little faster than the |
| 373 | * clock governing which slot a callback gets hashed into. Both of these |
| 374 | * things make CheckCallbacks work a little harder than it would have to |
| 375 | * if I wanted to cut things finer. |
| 376 | * Everything from the old first slot is carried over into the new first |
| 377 | * slot. Thus, if there were some things that ought to have been invalidated, |
| 378 | * but weren't (say, if the server was down), they will be examined at every |
| 379 | * opportunity thereafter. |
| 380 | */ |
| 381 | int |
| 382 | afs_BumpBase(void) |
| 383 | { |
| 384 | afs_uint32 now; |
| 385 | int didbump; |
| 386 | u_int oldbase; |
| 387 | |
| 388 | ObtainWriteLock(&afs_xcbhash, 87); |
| 389 | didbump = 0; |
| 390 | now = osi_Time(); |
| 391 | while (basetime + (CBHTSLOTLEN - 20) <= now) { |
| 392 | oldbase = base; |
| 393 | basetime += CBHTSLOTLEN - 1; |
| 394 | base = (base + 1) % CBHTSIZE; |
| 395 | didbump++; |
| 396 | if (!QEmpty(&(cbHashT[oldbase].head))) { |
| 397 | QCat(&(cbHashT[oldbase].head), &(cbHashT[base].head)); |
| 398 | } |
| 399 | } |
| 400 | ReleaseWriteLock(&afs_xcbhash); |
| 401 | |
| 402 | return didbump; |
| 403 | } |