Import Upstream version 1.8.5
[hcoop/debian/openafs.git] / src / afs / afs_buffer.c
CommitLineData
805e021f
CE
1/*
2 * Copyright 2000, International Business Machines Corporation and others.
3 * All Rights Reserved.
4 *
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
8 */
9
10#include <afsconfig.h>
11#include "afs/param.h"
12
13
14#include "afs/sysincludes.h"
15#include "afsincludes.h"
16#if !defined(UKERNEL)
17#if !defined(AFS_LINUX26_ENV)
18# include "h/param.h"
19#endif
20#include "h/types.h"
21#include "h/time.h"
22#if defined(AFS_AIX31_ENV)
23#include "h/limits.h"
24#endif
25#if !defined(AFS_AIX_ENV) && !defined(AFS_SUN5_ENV) && !defined(AFS_SGI_ENV) && !defined(AFS_LINUX20_ENV)
#include "h/kernel.h"		/* Isn't needed, so it should go */
27#endif
28#endif /* !defined(UKERNEL) */
29
30#include "afs/afs_osi.h"
31#include "afsint.h"
32#include "afs/lock.h"
33
34#if !defined(UKERNEL) && !defined(AFS_LINUX20_ENV)
35#include "h/buf.h"
36#endif /* !defined(UKERNEL) */
37
38#include "afs/stds.h"
39#include "afs/volerrors.h"
40#include "afs/exporter.h"
41#include "afs/prs_fs.h"
42#include "afs/afs_chunkops.h"
43#include "afs/dir.h"
44
45#include "afs/afs_stats.h"
46#include "afs/afs.h"
47
48#ifndef BUF_TIME_MAX
49#define BUF_TIME_MAX 0x7fffffff
50#endif
#define NPB 8			/* must be a power of 2 */
52static int afs_max_buffers; /* should be an integral multiple of NPB */
53
54/* page size */
55#define AFS_BUFFER_PAGESIZE 2048
56/* log page size */
57#define LOGPS 11
58/* If you change any of this PH stuff, make sure you don't break DZap() */
59/* use last two bits for page */
60#define PHPAGEMASK 3
61/* use next five bits for fid */
62#define PHFIDMASK 124
63/* page hash table size - this is pretty intertwined with pHash */
64#define PHSIZE (PHPAGEMASK + PHFIDMASK + 1)
65/* the pHash macro */
66#define pHash(fid,page) ((((afs_int32)(fid)) & PHFIDMASK) \
67 | (page & PHPAGEMASK))
68
69#ifdef dirty
70#undef dirty /* XXX */
71#endif
72
73static struct buffer *Buffers = 0;
74static char *BufferData;
75
76#ifdef AFS_AIX_ENV
77extern struct buf *geteblk();
78#endif
79#ifdef AFS_FBSD_ENV
80#define timecounter afs_timecounter
81#endif
82
83/* A note on locking in 'struct buffer'
84 *
85 * afs_bufferLock protects the hash chain, and the 'lockers' field where that
86 * has a zero value. It must be held whenever lockers is incremented from zero.
87 *
88 * The individual buffer lock protects the contents of the structure, including
89 * the lockers field.
90 *
91 * For safety: afs_bufferLock and the individual buffer lock must be held
92 * when obtaining a reference on a structure. Only the individual buffer lock
93 * need be held when releasing a reference.
94 *
95 * The locking hierarchy is afs_bufferLock-> buffer.lock
96 *
97 */
98
99static afs_lock_t afs_bufferLock;
100static struct buffer *phTable[PHSIZE]; /* page hash table */
101static int nbuffers;
102static afs_int32 timecounter;
103
104/* Prototypes for static routines */
105static struct buffer *afs_newslot(struct dcache *adc, afs_int32 apage,
106 struct buffer *lp);
107
108static int dinit_flag = 0;
109void
110DInit(int abuffers)
111{
112 /* Initialize the venus buffer system. */
113 int i;
114 struct buffer *tb;
115
116 AFS_STATCNT(DInit);
117 if (dinit_flag)
118 return;
119 dinit_flag = 1;
120 /* round up to next multiple of NPB, since we allocate multiple pages per chunk */
121 abuffers = ((abuffers - 1) | (NPB - 1)) + 1;
122 afs_max_buffers = abuffers << 2; /* possibly grow up to 4 times as big */
123 LOCK_INIT(&afs_bufferLock, "afs_bufferLock");
124 Buffers = afs_osi_Alloc(afs_max_buffers * sizeof(struct buffer));
125 osi_Assert(Buffers != NULL);
126 timecounter = 1;
127 afs_stats_cmperf.bufAlloced = nbuffers = abuffers;
128 for (i = 0; i < PHSIZE; i++)
129 phTable[i] = 0;
130 for (i = 0; i < abuffers; i++) {
131 if ((i & (NPB - 1)) == 0) {
132 /* time to allocate a fresh buffer */
133 BufferData = afs_osi_Alloc(AFS_BUFFER_PAGESIZE * NPB);
134 osi_Assert(BufferData != NULL);
135 }
136 /* Fill in each buffer with an empty indication. */
137 tb = &Buffers[i];
138 tb->fid = NULLIDX;
139 afs_reset_inode(&tb->inode);
140 tb->accesstime = 0;
141 tb->lockers = 0;
142 tb->data = &BufferData[AFS_BUFFER_PAGESIZE * (i & (NPB - 1))];
143 tb->hashIndex = 0;
144 tb->dirty = 0;
145 AFS_RWLOCK_INIT(&tb->lock, "buffer lock");
146 }
147 return;
148}
149
/* Read a page from the disk.
 *
 * On success returns 0 with entry->buffer and entry->data filled in and
 * the buffer's lockers count raised; the caller must drop the reference
 * with DRelease.  Returns EIO for an empty or unreadable chunk, and
 * ENOENT when the requested page lies past the end of the chunk. */
int
DRead(struct dcache *adc, int page, struct DirBuffer *entry)
{
    struct buffer *tb, *tb2;
    struct osi_file *tfile;
    int code;

    AFS_STATCNT(DRead);

    memset(entry, 0, sizeof(struct DirBuffer));

    ObtainWriteLock(&afs_bufferLock, 256);

#define bufmatch(tb) (tb->page == page && tb->fid == adc->index)
#define buf_Front(head,parent,p) {(parent)->hashNext = (p)->hashNext; (p)->hashNext= *(head);*(head)=(p);}

    /* this apparently-complicated-looking code is simply an example of
     * a little bit of loop unrolling, and is a standard linked-list
     * traversal trick. It saves a few assignments at the expense
     * of larger code size. This could be simplified by better use of
     * macros.  On every hit the found buffer is moved to the front of
     * its hash chain (LRU order) before being returned.
     */
    if ((tb = phTable[pHash(adc->index, page)])) {
	if (bufmatch(tb)) {
	    /* hit at the head of the chain */
	    ObtainWriteLock(&tb->lock, 257);
	    tb->lockers++;
	    ReleaseWriteLock(&afs_bufferLock);
	    tb->accesstime = timecounter++;
	    AFS_STATS(afs_stats_cmperf.bufHits++);
	    ReleaseWriteLock(&tb->lock);
	    entry->buffer = tb;
	    entry->data = tb->data;
	    return 0;
	} else {
	    struct buffer **bufhead;
	    bufhead = &(phTable[pHash(adc->index, page)]);
	    while ((tb2 = tb->hashNext)) {
		if (bufmatch(tb2)) {
		    buf_Front(bufhead, tb, tb2);
		    ObtainWriteLock(&tb2->lock, 258);
		    tb2->lockers++;
		    ReleaseWriteLock(&afs_bufferLock);
		    tb2->accesstime = timecounter++;
		    AFS_STATS(afs_stats_cmperf.bufHits++);
		    ReleaseWriteLock(&tb2->lock);
		    entry->buffer = tb2;
		    entry->data = tb2->data;
		    return 0;
		}
		if ((tb = tb2->hashNext)) {
		    if (bufmatch(tb)) {
			buf_Front(bufhead, tb2, tb);
			ObtainWriteLock(&tb->lock, 259);
			tb->lockers++;
			ReleaseWriteLock(&afs_bufferLock);
			tb->accesstime = timecounter++;
			AFS_STATS(afs_stats_cmperf.bufHits++);
			ReleaseWriteLock(&tb->lock);
			entry->buffer = tb;
			entry->data = tb->data;
			return 0;
		    }
		} else
		    break;
	    }
	}
    } else
	tb2 = NULL;

    AFS_STATS(afs_stats_cmperf.bufMisses++);
    /* can't find it */
    /* The last thing we looked at was either tb or tb2 (or nothing). That
     * is at least the oldest buffer on one particular hash chain, so it's
     * a pretty good place to start looking for the truly oldest buffer.
     */
    tb = afs_newslot(adc, page, (tb ? tb : tb2));
    if (!tb) {
	ReleaseWriteLock(&afs_bufferLock);
	return EIO;
    }
    ObtainWriteLock(&tb->lock, 260);
    tb->lockers++;
    ReleaseWriteLock(&afs_bufferLock);
    code = 0;
    if (adc->f.chunk == 0 && adc->f.chunkBytes == 0) {
	/* The directory blob is empty, apparently. This is not a valid dir
	 * blob, so throw an error. */
	code = EIO;
    } else if (page * AFS_BUFFER_PAGESIZE >= adc->f.chunkBytes) {
	code = ENOENT;		/* past the end */
    }
    if (code) {
	/* give the freshly-claimed slot back before failing */
	tb->fid = NULLIDX;
	afs_reset_inode(&tb->inode);
	tb->lockers--;
	ReleaseWriteLock(&tb->lock);
	return code;
    }
    tfile = afs_CFileOpen(&adc->f.inode);
    osi_Assert(tfile);
    code =
	afs_CFileRead(tfile, tb->page * AFS_BUFFER_PAGESIZE, tb->data,
		      AFS_BUFFER_PAGESIZE);
    afs_CFileClose(tfile);
    if (code < AFS_BUFFER_PAGESIZE) {
	/* short read: invalidate the slot and report an I/O error */
	tb->fid = NULLIDX;
	afs_reset_inode(&tb->inode);
	tb->lockers--;
	ReleaseWriteLock(&tb->lock);
	return EIO;
    }
    /* Note that findslot sets the page field in the buffer equal to
     * what it is searching for. */
    ReleaseWriteLock(&tb->lock);
    entry->buffer = tb;
    entry->data = tb->data;
    return 0;
}
269
270static void
271FixupBucket(struct buffer *ap)
272{
273 struct buffer **lp, *tp;
274 int i;
275 /* first try to get it out of its current hash bucket, in which it
276 * might not be */
277 AFS_STATCNT(FixupBucket);
278 i = ap->hashIndex;
279 lp = &phTable[i];
280 for (tp = *lp; tp; tp = tp->hashNext) {
281 if (tp == ap) {
282 *lp = tp->hashNext;
283 break;
284 }
285 lp = &tp->hashNext;
286 }
287 /* now figure the new hash bucket */
288 i = pHash(ap->fid, ap->page);
289 ap->hashIndex = i; /* remember where we are for deletion */
290 ap->hashNext = phTable[i]; /* add us to the list */
291 phTable[i] = ap; /* at the front, since it's LRU */
292}
293
/* lp is pointer to a fairly-old buffer */
/* Find a usable buffer slot for (adc->index, apage), evicting the least
 * recently used unlocked buffer (flushing it first if dirty), or growing
 * the pool by NPB buffers if every buffer is locked and room remains.
 * Called with afs_bufferLock held for write.  Returns NULL only when all
 * buffers are locked and the pool is at afs_max_buffers. */
static struct buffer *
afs_newslot(struct dcache *adc, afs_int32 apage, struct buffer *lp)
{
    afs_int32 i;
    afs_int32 lt = 0;
    struct buffer *tp;
    struct osi_file *tfile;

    AFS_STATCNT(afs_newslot);
    /* we take a pointer here to a buffer which was at the end of an
     * LRU hash chain.  Odds are, it's one of the older buffers, not
     * one of the newer.  Having an older buffer to start with may
     * permit us to avoid a few of the assignments in the "typical
     * case" for loop below.
     */
    if (lp && (lp->lockers == 0)) {
	lt = lp->accesstime;
    } else {
	lp = NULL;
    }

    /* timecounter might have wrapped, if machine is very very busy
     * and stays up for a long time. Timecounter mustn't wrap twice
     * (positive->negative->positive) before calling newslot, but that
     * would require 2 billion consecutive cache hits... Anyway, the
     * penalty is only that the cache replacement policy will be
     * almost MRU for the next ~2 billion DReads... newslot doesn't
     * get called nearly as often as DRead, so in order to avoid the
     * performance penalty of using the hypers, it's worth doing the
     * extra check here every time. It's probably cheaper than doing
     * hcmp, anyway. There is a little performance hit resulting from
     * resetting all the access times to 0, but it only happens once
     * every month or so, and the access times will rapidly sort
     * themselves back out after just a few more DReads.
     */
    if (timecounter < 0) {
	timecounter = 1;
	tp = Buffers;
	for (i = 0; i < nbuffers; i++, tp++) {
	    tp->accesstime = 0;
	    if (!lp && !tp->lockers)	/* one is as good as the rest, I guess */
		lp = tp;
	}
    } else {
	/* this is the typical case: scan for the unlocked buffer with the
	 * smallest accesstime (true LRU victim) */
	tp = Buffers;
	for (i = 0; i < nbuffers; i++, tp++) {
	    if (tp->lockers == 0) {
		if (!lp || tp->accesstime < lt) {
		    lp = tp;
		    lt = tp->accesstime;
		}
	    }
	}
    }

    if (lp == 0) {
	/* No unlocked buffers. If still possible, allocate a new increment */
	if (nbuffers + NPB > afs_max_buffers) {
	    /* There are no unlocked buffers -- this used to panic, but that
	     * seems extreme.  To the best of my knowledge, all the callers
	     * of DRead are prepared to handle a zero return.  Some of them
	     * just panic directly, but not all of them. */
	    afs_warn("afs: all buffers locked\n");
	    return 0;
	}

	/* grow the pool by one NPB-buffer chunk, exactly as DInit does */
	BufferData = afs_osi_Alloc(AFS_BUFFER_PAGESIZE * NPB);
	osi_Assert(BufferData != NULL);
	for (i = 0; i < NPB; i++) {
	    /* Fill in each buffer with an empty indication. */
	    tp = &Buffers[i + nbuffers];
	    tp->fid = NULLIDX;
	    afs_reset_inode(&tp->inode);
	    tp->accesstime = 0;
	    tp->lockers = 0;
	    tp->data = &BufferData[AFS_BUFFER_PAGESIZE * i];
	    tp->hashIndex = 0;
	    tp->dirty = 0;
	    AFS_RWLOCK_INIT(&tp->lock, "buffer lock");
	}
	lp = &Buffers[nbuffers];
	nbuffers += NPB;
    }

    if (lp->dirty) {
	/* see DFlush for rationale for not getting and locking the dcache */
	tfile = afs_CFileOpen(&lp->inode);
	osi_Assert(tfile);
	afs_CFileWrite(tfile, lp->page * AFS_BUFFER_PAGESIZE, lp->data,
		       AFS_BUFFER_PAGESIZE);
	lp->dirty = 0;
	afs_CFileClose(tfile);
	AFS_STATS(afs_stats_cmperf.bufFlushDirty++);
    }

    /* Zero out the data so we don't leak something we shouldn't. */
    memset(lp->data, 0, AFS_BUFFER_PAGESIZE);
    /* Now fill in the header. */
    lp->fid = adc->index;
    afs_copy_inode(&lp->inode, &adc->f.inode);
    lp->page = apage;
    lp->accesstime = timecounter++;
    FixupBucket(lp);		/* move to the right hash bucket */

    return lp;
}
403
404void
405DRelease(struct DirBuffer *entry, int flag)
406{
407 struct buffer *tp;
408
409 AFS_STATCNT(DRelease);
410
411 tp = entry->buffer;
412 if (tp == NULL)
413 return;
414
415 tp = entry->buffer;
416 ObtainWriteLock(&tp->lock, 261);
417 tp->lockers--;
418 if (flag)
419 tp->dirty = 1;
420 ReleaseWriteLock(&tp->lock);
421}
422
423int
424DVOffset(struct DirBuffer *entry)
425{
426 struct buffer *bp;
427
428 AFS_STATCNT(DVOffset);
429
430 bp = entry->buffer;
431 return AFS_BUFFER_PAGESIZE * bp->page
432 + (char *)entry->data - (char *)bp->data;
433}
434
435/*!
436 * Zap one dcache entry: destroy one FID's buffers.
437 *
438 * 1/1/91 - I've modified the hash function to take the page as well
439 * as the *fid, so that lookup will be a bit faster. That presents some
440 * difficulties for Zap, which now has to have some knowledge of the nature
441 * of the hash function. Oh well. This should use the list traversal
442 * method of DRead...
443 *
444 * \param adc The dcache entry to be zapped.
445 */
446void
447DZap(struct dcache *adc)
448{
449 int i;
450 /* Destroy all buffers pertaining to a particular fid. */
451 struct buffer *tb;
452
453 AFS_STATCNT(DZap);
454 ObtainReadLock(&afs_bufferLock);
455
456 for (i = 0; i <= PHPAGEMASK; i++)
457 for (tb = phTable[pHash(adc->index, i)]; tb; tb = tb->hashNext)
458 if (tb->fid == adc->index) {
459 ObtainWriteLock(&tb->lock, 262);
460 tb->fid = NULLIDX;
461 afs_reset_inode(&tb->inode);
462 tb->dirty = 0;
463 ReleaseWriteLock(&tb->lock);
464 }
465 ReleaseReadLock(&afs_bufferLock);
466}
467
468static void
469DFlushBuffer(struct buffer *ab)
470{
471 struct osi_file *tfile;
472
473 tfile = afs_CFileOpen(&ab->inode);
474 osi_Assert(tfile);
475 afs_CFileWrite(tfile, ab->page * AFS_BUFFER_PAGESIZE,
476 ab->data, AFS_BUFFER_PAGESIZE);
477 ab->dirty = 0; /* Clear the dirty flag */
478 afs_CFileClose(tfile);
479}
480
/* Write out every dirty buffer belonging to one dcache entry.
 *
 * afs_bufferLock is dropped around the actual I/O; the buffer's lockers
 * count is raised first so the slot cannot be recycled while the global
 * lock is released. */
void
DFlushDCache(struct dcache *adc)
{
    int i;
    struct buffer *tb;

    ObtainReadLock(&afs_bufferLock);

    /* the page number is folded into the hash (see pHash), so each page
     * class must be probed to find all of this fid's buffers */
    for (i = 0; i <= PHPAGEMASK; i++)
	for (tb = phTable[pHash(adc->index, i)]; tb; tb = tb->hashNext)
	    if (tb->fid == adc->index) {
		ObtainWriteLock(&tb->lock, 701);
		tb->lockers++;
		ReleaseReadLock(&afs_bufferLock);
		if (tb->dirty) {
		    DFlushBuffer(tb);
		}
		tb->lockers--;
		ReleaseWriteLock(&tb->lock);
		ObtainReadLock(&afs_bufferLock);
	    }

    ReleaseReadLock(&afs_bufferLock);
}
505
/* Flush all the modified buffers.
 *
 * Scans the whole buffer array; for each dirty buffer the global lock is
 * dropped (after raising lockers so the slot cannot be recycled), the
 * dirty flag is re-checked under the buffer lock, and the page is written
 * out.  Always returns 0. */
int
DFlush(void)
{
    int i;
    struct buffer *tb;

    AFS_STATCNT(DFlush);
    tb = Buffers;
    ObtainReadLock(&afs_bufferLock);
    for (i = 0; i < nbuffers; i++, tb++) {
	if (tb->dirty) {
	    ObtainWriteLock(&tb->lock, 263);
	    tb->lockers++;
	    ReleaseReadLock(&afs_bufferLock);
	    if (tb->dirty) {
		/* it seems safe to do this I/O without having the dcache
		 * locked, since the only things that will update the data in
		 * a directory are the buffer package, which holds the relevant
		 * tb->lock while doing the write, or afs_GetDCache, which
		 * DZap's the directory while holding the dcache lock.
		 * It is not possible to lock the dcache or even call
		 * afs_GetDSlot to map the index to the dcache since the dir
		 * package's caller has some dcache object locked already (so
		 * we cannot lock afs_xdcache). In addition, we cannot obtain
		 * a dcache lock while holding the tb->lock of the same file
		 * since that can deadlock with DRead/DNew */
		DFlushBuffer(tb);
	    }
	    tb->lockers--;
	    ReleaseWriteLock(&tb->lock);
	    ObtainReadLock(&afs_bufferLock);
	}
    }
    ReleaseReadLock(&afs_bufferLock);

    return 0;
}
544
/* Same as read, only do *not* even try to read the page, since it
 * probably doesn't exist.
 *
 * On success returns 0 with entry filled in and the buffer's lockers
 * count raised (drop with DRelease); returns EIO if no slot could be
 * obtained.  Caller must hold adc->lock for write. */
int
DNew(struct dcache *adc, int page, struct DirBuffer *entry)
{
    struct buffer *tb;
    AFS_STATCNT(DNew);

    ObtainWriteLock(&afs_bufferLock, 264);
    if ((tb = afs_newslot(adc, page, NULL)) == 0) {
	ReleaseWriteLock(&afs_bufferLock);
	return EIO;
    }
    /* extend the chunk, if needed */
    /* Do it now, not in DFlush or afs_newslot when the data is written out,
     * since now our caller has adc->lock writelocked, and we can't acquire
     * that lock (or even map from a fid to a dcache) in afs_newslot or
     * DFlush due to lock hierarchy issues */
    if ((page + 1) * AFS_BUFFER_PAGESIZE > adc->f.chunkBytes) {
	afs_AdjustSize(adc, (page + 1) * AFS_BUFFER_PAGESIZE);
	osi_Assert(afs_WriteDCache(adc, 1) == 0);
    }
    ObtainWriteLock(&tb->lock, 265);
    tb->lockers++;
    ReleaseWriteLock(&afs_bufferLock);
    ReleaseWriteLock(&tb->lock);
    entry->buffer = tb;
    entry->data = tb->data;

    return 0;
}
576
577void
578shutdown_bufferpackage(void)
579{
580 struct buffer *tp;
581 int i;
582
583 AFS_STATCNT(shutdown_bufferpackage);
584 /* Free all allocated Buffers and associated buffer pages */
585 DFlush();
586 if (afs_cold_shutdown) {
587 dinit_flag = 0;
588 tp = Buffers;
589 for (i = 0; i < nbuffers; i += NPB, tp += NPB) {
590 afs_osi_Free(tp->data, NPB * AFS_BUFFER_PAGESIZE);
591 }
592 afs_osi_Free(Buffers, nbuffers * sizeof(struct buffer));
593 nbuffers = 0;
594 timecounter = 1;
595 for (i = 0; i < PHSIZE; i++)
596 phTable[i] = 0;
597 memset(&afs_bufferLock, 0, sizeof(afs_lock_t));
598 }
599}