2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include "afs/param.h"
14 #include "afs/sysincludes.h"
15 #include "afsincludes.h"
17 #if !defined(AFS_LINUX26_ENV)
22 #if defined(AFS_AIX31_ENV)
25 #if !defined(AFS_AIX_ENV) && !defined(AFS_SUN5_ENV) && !defined(AFS_SGI_ENV) && !defined(AFS_LINUX20_ENV)
26 #include "h/kernel.h" /* Isn't needed, so it should go */
28 #endif /* !defined(UKERNEL) */
30 #include "afs/afs_osi.h"
34 #if !defined(UKERNEL) && !defined(AFS_LINUX20_ENV)
36 #endif /* !defined(UKERNEL) */
39 #include "afs/volerrors.h"
40 #include "afs/exporter.h"
41 #include "afs/prs_fs.h"
42 #include "afs/afs_chunkops.h"
45 #include "afs/afs_stats.h"
49 #define BUF_TIME_MAX 0x7fffffff
51 #define NPB 8 /* must be a pwer of 2 */
52 static int afs_max_buffers
; /* should be an integral multiple of NPB */
55 #define AFS_BUFFER_PAGESIZE 2048
58 /* If you change any of this PH stuff, make sure you don't break DZap() */
59 /* use last two bits for page */
61 /* use next five bits for fid */
63 /* page hash table size - this is pretty intertwined with pHash */
64 #define PHSIZE (PHPAGEMASK + PHFIDMASK + 1)
66 #define pHash(fid,page) ((((afs_int32)(fid)) & PHFIDMASK) \
67 | (page & PHPAGEMASK))
70 #undef dirty /* XXX */
73 static struct buffer
*Buffers
= 0;
74 static char *BufferData
;
77 extern struct buf
*geteblk();
80 #define timecounter afs_timecounter
83 /* A note on locking in 'struct buffer'
85 * afs_bufferLock protects the hash chain, and the 'lockers' field where that
86 * has a zero value. It must be held whenever lockers is incremented from zero.
88 * The individual buffer lock protects the contents of the structure, including
91 * For safety: afs_bufferLock and the individual buffer lock must be held
92 * when obtaining a reference on a structure. Only the individual buffer lock
93 * need be held when releasing a reference.
95 * The locking hierarchy is afs_bufferLock-> buffer.lock
99 static afs_lock_t afs_bufferLock
;
100 static struct buffer
*phTable
[PHSIZE
]; /* page hash table */
102 static afs_int32 timecounter
;
104 /* Prototypes for static routines */
105 static struct buffer
*afs_newslot(struct dcache
*adc
, afs_int32 apage
,
108 static int dinit_flag
= 0;
112 /* Initialize the venus buffer system. */
120 /* round up to next multiple of NPB, since we allocate multiple pages per chunk */
121 abuffers
= ((abuffers
- 1) | (NPB
- 1)) + 1;
122 afs_max_buffers
= abuffers
<< 2; /* possibly grow up to 4 times as big */
123 LOCK_INIT(&afs_bufferLock
, "afs_bufferLock");
124 Buffers
= afs_osi_Alloc(afs_max_buffers
* sizeof(struct buffer
));
125 osi_Assert(Buffers
!= NULL
);
127 afs_stats_cmperf
.bufAlloced
= nbuffers
= abuffers
;
128 for (i
= 0; i
< PHSIZE
; i
++)
130 for (i
= 0; i
< abuffers
; i
++) {
131 if ((i
& (NPB
- 1)) == 0) {
132 /* time to allocate a fresh buffer */
133 BufferData
= afs_osi_Alloc(AFS_BUFFER_PAGESIZE
* NPB
);
134 osi_Assert(BufferData
!= NULL
);
136 /* Fill in each buffer with an empty indication. */
139 afs_reset_inode(&tb
->inode
);
142 tb
->data
= &BufferData
[AFS_BUFFER_PAGESIZE
* (i
& (NPB
- 1))];
145 AFS_RWLOCK_INIT(&tb
->lock
, "buffer lock");
151 DRead(struct dcache
*adc
, int page
, struct DirBuffer
*entry
)
153 /* Read a page from the disk. */
154 struct buffer
*tb
, *tb2
;
155 struct osi_file
*tfile
;
160 memset(entry
, 0, sizeof(struct DirBuffer
));
162 ObtainWriteLock(&afs_bufferLock
, 256);
164 #define bufmatch(tb) (tb->page == page && tb->fid == adc->index)
165 #define buf_Front(head,parent,p) {(parent)->hashNext = (p)->hashNext; (p)->hashNext= *(head);*(head)=(p);}
167 /* this apparently-complicated-looking code is simply an example of
168 * a little bit of loop unrolling, and is a standard linked-list
169 * traversal trick. It saves a few assignments at the the expense
170 * of larger code size. This could be simplified by better use of
173 if ((tb
= phTable
[pHash(adc
->index
, page
)])) {
175 ObtainWriteLock(&tb
->lock
, 257);
177 ReleaseWriteLock(&afs_bufferLock
);
178 tb
->accesstime
= timecounter
++;
179 AFS_STATS(afs_stats_cmperf
.bufHits
++);
180 ReleaseWriteLock(&tb
->lock
);
182 entry
->data
= tb
->data
;
185 struct buffer
**bufhead
;
186 bufhead
= &(phTable
[pHash(adc
->index
, page
)]);
187 while ((tb2
= tb
->hashNext
)) {
189 buf_Front(bufhead
, tb
, tb2
);
190 ObtainWriteLock(&tb2
->lock
, 258);
192 ReleaseWriteLock(&afs_bufferLock
);
193 tb2
->accesstime
= timecounter
++;
194 AFS_STATS(afs_stats_cmperf
.bufHits
++);
195 ReleaseWriteLock(&tb2
->lock
);
197 entry
->data
= tb2
->data
;
200 if ((tb
= tb2
->hashNext
)) {
202 buf_Front(bufhead
, tb2
, tb
);
203 ObtainWriteLock(&tb
->lock
, 259);
205 ReleaseWriteLock(&afs_bufferLock
);
206 tb
->accesstime
= timecounter
++;
207 AFS_STATS(afs_stats_cmperf
.bufHits
++);
208 ReleaseWriteLock(&tb
->lock
);
210 entry
->data
= tb
->data
;
220 AFS_STATS(afs_stats_cmperf
.bufMisses
++);
222 /* The last thing we looked at was either tb or tb2 (or nothing). That
223 * is at least the oldest buffer on one particular hash chain, so it's
224 * a pretty good place to start looking for the truly oldest buffer.
226 tb
= afs_newslot(adc
, page
, (tb
? tb
: tb2
));
228 ReleaseWriteLock(&afs_bufferLock
);
231 ObtainWriteLock(&tb
->lock
, 260);
233 ReleaseWriteLock(&afs_bufferLock
);
235 if (adc
->f
.chunk
== 0 && adc
->f
.chunkBytes
== 0) {
236 /* The directory blob is empty, apparently. This is not a valid dir
237 * blob, so throw an error. */
239 } else if (page
* AFS_BUFFER_PAGESIZE
>= adc
->f
.chunkBytes
) {
240 code
= ENOENT
; /* past the end */
244 afs_reset_inode(&tb
->inode
);
246 ReleaseWriteLock(&tb
->lock
);
249 tfile
= afs_CFileOpen(&adc
->f
.inode
);
252 afs_CFileRead(tfile
, tb
->page
* AFS_BUFFER_PAGESIZE
, tb
->data
,
253 AFS_BUFFER_PAGESIZE
);
254 afs_CFileClose(tfile
);
255 if (code
< AFS_BUFFER_PAGESIZE
) {
257 afs_reset_inode(&tb
->inode
);
259 ReleaseWriteLock(&tb
->lock
);
262 /* Note that findslot sets the page field in the buffer equal to
263 * what it is searching for. */
264 ReleaseWriteLock(&tb
->lock
);
266 entry
->data
= tb
->data
;
271 FixupBucket(struct buffer
*ap
)
273 struct buffer
**lp
, *tp
;
275 /* first try to get it out of its current hash bucket, in which it
277 AFS_STATCNT(FixupBucket
);
280 for (tp
= *lp
; tp
; tp
= tp
->hashNext
) {
287 /* now figure the new hash bucket */
288 i
= pHash(ap
->fid
, ap
->page
);
289 ap
->hashIndex
= i
; /* remember where we are for deletion */
290 ap
->hashNext
= phTable
[i
]; /* add us to the list */
291 phTable
[i
] = ap
; /* at the front, since it's LRU */
294 /* lp is pointer to a fairly-old buffer */
295 static struct buffer
*
296 afs_newslot(struct dcache
*adc
, afs_int32 apage
, struct buffer
*lp
)
298 /* Find a usable buffer slot */
302 struct osi_file
*tfile
;
304 AFS_STATCNT(afs_newslot
);
305 /* we take a pointer here to a buffer which was at the end of an
306 * LRU hash chain. Odds are, it's one of the older buffers, not
307 * one of the newer. Having an older buffer to start with may
308 * permit us to avoid a few of the assignments in the "typical
309 * case" for loop below.
311 if (lp
&& (lp
->lockers
== 0)) {
317 /* timecounter might have wrapped, if machine is very very busy
318 * and stays up for a long time. Timecounter mustn't wrap twice
319 * (positive->negative->positive) before calling newslot, but that
320 * would require 2 billion consecutive cache hits... Anyway, the
321 * penalty is only that the cache replacement policy will be
322 * almost MRU for the next ~2 billion DReads... newslot doesn't
323 * get called nearly as often as DRead, so in order to avoid the
324 * performance penalty of using the hypers, it's worth doing the
325 * extra check here every time. It's probably cheaper than doing
326 * hcmp, anyway. There is a little performance hit resulting from
327 * resetting all the access times to 0, but it only happens once
328 * every month or so, and the access times will rapidly sort
329 * themselves back out after just a few more DReads.
331 if (timecounter
< 0) {
334 for (i
= 0; i
< nbuffers
; i
++, tp
++) {
336 if (!lp
&& !tp
->lockers
) /* one is as good as the rest, I guess */
340 /* this is the typical case */
342 for (i
= 0; i
< nbuffers
; i
++, tp
++) {
343 if (tp
->lockers
== 0) {
344 if (!lp
|| tp
->accesstime
< lt
) {
353 /* No unlocked buffers. If still possible, allocate a new increment */
354 if (nbuffers
+ NPB
> afs_max_buffers
) {
355 /* There are no unlocked buffers -- this used to panic, but that
356 * seems extreme. To the best of my knowledge, all the callers
357 * of DRead are prepared to handle a zero return. Some of them
358 * just panic directly, but not all of them. */
359 afs_warn("afs: all buffers locked\n");
363 BufferData
= afs_osi_Alloc(AFS_BUFFER_PAGESIZE
* NPB
);
364 osi_Assert(BufferData
!= NULL
);
365 for (i
= 0; i
< NPB
; i
++) {
366 /* Fill in each buffer with an empty indication. */
367 tp
= &Buffers
[i
+ nbuffers
];
369 afs_reset_inode(&tp
->inode
);
372 tp
->data
= &BufferData
[AFS_BUFFER_PAGESIZE
* i
];
375 AFS_RWLOCK_INIT(&tp
->lock
, "buffer lock");
377 lp
= &Buffers
[nbuffers
];
382 /* see DFlush for rationale for not getting and locking the dcache */
383 tfile
= afs_CFileOpen(&lp
->inode
);
385 afs_CFileWrite(tfile
, lp
->page
* AFS_BUFFER_PAGESIZE
, lp
->data
,
386 AFS_BUFFER_PAGESIZE
);
388 afs_CFileClose(tfile
);
389 AFS_STATS(afs_stats_cmperf
.bufFlushDirty
++);
392 /* Zero out the data so we don't leak something we shouldn't. */
393 memset(lp
->data
, 0, AFS_BUFFER_PAGESIZE
);
394 /* Now fill in the header. */
395 lp
->fid
= adc
->index
;
396 afs_copy_inode(&lp
->inode
, &adc
->f
.inode
);
398 lp
->accesstime
= timecounter
++;
399 FixupBucket(lp
); /* move to the right hash bucket */
405 DRelease(struct DirBuffer
*entry
, int flag
)
409 AFS_STATCNT(DRelease
);
416 ObtainWriteLock(&tp
->lock
, 261);
420 ReleaseWriteLock(&tp
->lock
);
424 DVOffset(struct DirBuffer
*entry
)
428 AFS_STATCNT(DVOffset
);
431 return AFS_BUFFER_PAGESIZE
* bp
->page
432 + (char *)entry
->data
- (char *)bp
->data
;
436 * Zap one dcache entry: destroy one FID's buffers.
438 * 1/1/91 - I've modified the hash function to take the page as well
439 * as the *fid, so that lookup will be a bit faster. That presents some
440 * difficulties for Zap, which now has to have some knowledge of the nature
441 * of the hash function. Oh well. This should use the list traversal
444 * \param adc The dcache entry to be zapped.
447 DZap(struct dcache
*adc
)
450 /* Destroy all buffers pertaining to a particular fid. */
454 ObtainReadLock(&afs_bufferLock
);
456 for (i
= 0; i
<= PHPAGEMASK
; i
++)
457 for (tb
= phTable
[pHash(adc
->index
, i
)]; tb
; tb
= tb
->hashNext
)
458 if (tb
->fid
== adc
->index
) {
459 ObtainWriteLock(&tb
->lock
, 262);
461 afs_reset_inode(&tb
->inode
);
463 ReleaseWriteLock(&tb
->lock
);
465 ReleaseReadLock(&afs_bufferLock
);
469 DFlushBuffer(struct buffer
*ab
)
471 struct osi_file
*tfile
;
473 tfile
= afs_CFileOpen(&ab
->inode
);
475 afs_CFileWrite(tfile
, ab
->page
* AFS_BUFFER_PAGESIZE
,
476 ab
->data
, AFS_BUFFER_PAGESIZE
);
477 ab
->dirty
= 0; /* Clear the dirty flag */
478 afs_CFileClose(tfile
);
482 DFlushDCache(struct dcache
*adc
)
487 ObtainReadLock(&afs_bufferLock
);
489 for (i
= 0; i
<= PHPAGEMASK
; i
++)
490 for (tb
= phTable
[pHash(adc
->index
, i
)]; tb
; tb
= tb
->hashNext
)
491 if (tb
->fid
== adc
->index
) {
492 ObtainWriteLock(&tb
->lock
, 701);
494 ReleaseReadLock(&afs_bufferLock
);
499 ReleaseWriteLock(&tb
->lock
);
500 ObtainReadLock(&afs_bufferLock
);
503 ReleaseReadLock(&afs_bufferLock
);
509 /* Flush all the modified buffers. */
515 ObtainReadLock(&afs_bufferLock
);
516 for (i
= 0; i
< nbuffers
; i
++, tb
++) {
518 ObtainWriteLock(&tb
->lock
, 263);
520 ReleaseReadLock(&afs_bufferLock
);
522 /* it seems safe to do this I/O without having the dcache
523 * locked, since the only things that will update the data in
524 * a directory are the buffer package, which holds the relevant
525 * tb->lock while doing the write, or afs_GetDCache, which
526 * DZap's the directory while holding the dcache lock.
527 * It is not possible to lock the dcache or even call
528 * afs_GetDSlot to map the index to the dcache since the dir
529 * package's caller has some dcache object locked already (so
530 * we cannot lock afs_xdcache). In addition, we cannot obtain
531 * a dcache lock while holding the tb->lock of the same file
532 * since that can deadlock with DRead/DNew */
536 ReleaseWriteLock(&tb
->lock
);
537 ObtainReadLock(&afs_bufferLock
);
540 ReleaseReadLock(&afs_bufferLock
);
546 DNew(struct dcache
*adc
, int page
, struct DirBuffer
*entry
)
548 /* Same as read, only do *not* even try to read the page, since it
549 * probably doesn't exist. */
553 ObtainWriteLock(&afs_bufferLock
, 264);
554 if ((tb
= afs_newslot(adc
, page
, NULL
)) == 0) {
555 ReleaseWriteLock(&afs_bufferLock
);
558 /* extend the chunk, if needed */
559 /* Do it now, not in DFlush or afs_newslot when the data is written out,
560 * since now our caller has adc->lock writelocked, and we can't acquire
561 * that lock (or even map from a fid to a dcache) in afs_newslot or
562 * DFlush due to lock hierarchy issues */
563 if ((page
+ 1) * AFS_BUFFER_PAGESIZE
> adc
->f
.chunkBytes
) {
564 afs_AdjustSize(adc
, (page
+ 1) * AFS_BUFFER_PAGESIZE
);
565 osi_Assert(afs_WriteDCache(adc
, 1) == 0);
567 ObtainWriteLock(&tb
->lock
, 265);
569 ReleaseWriteLock(&afs_bufferLock
);
570 ReleaseWriteLock(&tb
->lock
);
572 entry
->data
= tb
->data
;
578 shutdown_bufferpackage(void)
583 AFS_STATCNT(shutdown_bufferpackage
);
584 /* Free all allocated Buffers and associated buffer pages */
586 if (afs_cold_shutdown
) {
589 for (i
= 0; i
< nbuffers
; i
+= NPB
, tp
+= NPB
) {
590 afs_osi_Free(tp
->data
, NPB
* AFS_BUFFER_PAGESIZE
);
592 afs_osi_Free(Buffers
, nbuffers
* sizeof(struct buffer
));
595 for (i
= 0; i
< PHSIZE
; i
++)
597 memset(&afs_bufferLock
, 0, sizeof(afs_lock_t
));