Import Upstream version 1.8.5
[hcoop/debian/openafs.git] / src / afs / HPUX / osi_vnodeops.c
1/*
2 * Copyright 2000, International Business Machines Corporation and others.
3 * All Rights Reserved.
4 *
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
8 */
9
10/* This is a placeholder for routines unique to the port of AFS to HP-UX. */
11
12#include <afsconfig.h>
13#include "afs/param.h"
14
15
16#include "afs/sysincludes.h" /* Standard vendor system headers */
17#include "afsincludes.h" /* Afs-based standard headers */
18#include "afs/afs_stats.h" /* statistics stuff */
19
20#include <sys/uio.h>
21#include <sys/vfs.h>
22#include <sys/mount.h>
23#include <sys/vnode.h>
24#include <sys/pathname.h>
25
26extern struct vfsops Afs_vfsops;
27extern int afs_hp_strategy();
28extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
29extern int afs_pagein();
30extern int afs_pageout();
31extern int afs_ioctl();
32extern int afs_prealloc();
33extern int afs_mapdbd();
34extern int afs_mmap();
35extern int afs_cachelimit();
36extern int afs_vm_checkpage();
37extern int afs_vm_fscontiguous();
38extern int afs_vm_stopio();
39extern int afs_read_ahead();
40extern int afs_unmap();
41extern int afs_release();
42extern int afs_swapfs_len();
43extern int afs_readdir2();
44extern int afs_readdir();
45extern int afs_readdir3();
46extern int afs_pathconf();
47extern int afs_close();
48
49#define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
50
51#if defined(AFS_HPUX110_ENV)
52/* We no longer need to lock the VM empire,
53 * or at least that is what is claimed,
54 * so we make the vmemp_ routines no-ops.
55 * This needs a closer look.
56 */
57#define vmemp_lockx()
58#undef vmemp_returnx
59#define vmemp_returnx(a) return(a)
60#define vmemp_unlockx()
61#endif
62
63#if !defined(AFS_HPUX110_ENV)
64/*
65 * Copy an mbuf to the contiguous area pointed to by cp.
66 * Skip <off> bytes and copy <len> bytes.
67 * Returns the number of bytes not transferred.
68 * The mbuf is NOT changed.
69 */
70int
71m_cpytoc(m, off, len, cp)
72 struct mbuf *m;
73 int off, len;
74 caddr_t cp;
75{
76 int ml;
77
78 if (m == NULL || off < 0 || len < 0 || cp == NULL)
79 osi_Panic("m_cpytoc");
80 while (off && m)
81 if (m->m_len <= off) {
82 off -= m->m_len;
83 m = m->m_next;
84 continue;
85 } else
86 break;
87 if (m == NULL)
88 return (len);
89
90 ml = MIN(len, m->m_len - off);
91 memcpy(cp, mtod(m, caddr_t) + off, (u_int) ml);
92 cp += ml;
93 len -= ml;
94 m = m->m_next;
95
96 while (len && m) {
97 ml = m->m_len;
98 memcpy(cp, mtod(m, caddr_t), (u_int) ml);
99 cp += ml;
100 len -= ml;
101 m = m->m_next;
102 }
103
104 return (len);
105}
106#endif
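/*
 * Minimal usage sketch for m_cpytoc() above (illustrative only, not
 * compiled; the mbuf chain "m" and the sizes are hypothetical): skip
 * the first 4 bytes of the chain and flatten the next 128 bytes into
 * a local buffer.  A non-zero return is the number of bytes that
 * could not be copied because the chain ran out of data.
 */
#if 0
    {
        char flat[128];
        int left = m_cpytoc(m, 4, (int)sizeof(flat), (caddr_t) flat);

        if (left != 0)
            printf("mbuf chain was %d bytes short\n", left);
    }
#endif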
107
108/*
109 * Note that the standard Sun vnode interface doesn't have a vop_lockf(), so this code is
110 * totally new. This came about because HP-UX implements lockf() as
111 * a system call while Sun apparently implements it as a library routine.
112 * To handle this, we have to translate the lockf() request into an
113 * fcntl()-style request, and then translate the results back if necessary.
114 * We call afs_lockctl() directly.
115 */
116afs_lockf(vp, flag, len, cred, fp, LB, UB)
117 struct vnode *vp;
118 int flag;
119 afs_ucred_t *cred;
120 struct file *fp;
121 k_off_t len, LB, UB;
122{
123 /*for now, just pretend it works */
124 struct k_flock flock;
125 int cmd, code;
126
127 /*
128 * Create a flock structure and translate the lockf request
129 * into an appropriate looking fcntl() type request for afs_lockctl()
130 */
131 flock.l_whence = 0;
132 flock.l_len = len;
133 flock.l_start = fp->f_offset;
134 /* convert negative lengths to positive */
135 if (flock.l_len < 0) {
136 flock.l_start += flock.l_len;
137 flock.l_len = -(flock.l_len);
138 }
139 /*
140 * Adjust values to look like fcntl() requests.
141 * All locks are write locks, only F_LOCK requests
142 * are blocking. F_TEST has to be translated into
143 * a get lock and then back again.
144 */
145 flock.l_type = F_WRLCK;
146 cmd = F_SETLK;
147 switch (flag) {
148 case F_ULOCK:
149 flock.l_type = F_UNLCK;
150 break;
151 case F_LOCK:
152 cmd = F_SETLKW;
153 break;
154 case F_TEST:
155 cmd = F_GETLK;
156 break;
157 }
158 u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
159 if (u.u_error) {
160 return (u.u_error); /* some other error code */
161 }
162 /*
163 * If the request is F_TEST and F_GETLK changed
164 * the lock type to F_UNLCK, then return 0; otherwise
165 * set the error to EACCES and return.
166 */
167 if (flag == F_TEST && flock.l_type != F_UNLCK) {
168 u.u_error = EACCES;
169 return (u.u_error);
170 }
171 return (0);
172}
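/*
 * Caller-side sketch of the F_TEST translation above (illustrative
 * only, not compiled; the surrounding variables are those of a
 * hypothetical caller): F_TEST is issued as F_GETLK, and afs_lockctl()
 * rewrites l_type to F_UNLCK when no conflicting lock exists, so any
 * other type coming back means the range is already locked and the
 * caller sees EACCES.
 */
#if 0
    if (afs_lockf(vp, F_TEST, len, cred, fp, LB, UB) == 0) {
        /* no conflicting lock: a following F_LOCK request should not block */
    } else {
        /* EACCES: another process holds a write lock on the range */
    }
#endif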
173
174
175#if defined(AFS_HPUX1122_ENV)
176#include "machine/vm/vmparam.h"
177#else
178#include "../machine/vmparam.h" /* For KERNELSPACE */
179#endif
180#include "h/debug.h"
181#include "h/types.h"
182#if !defined(AFS_HPUX1123_ENV)
183 /* 11.23 is using 64 bit in many cases */
184#define kern_daddr_t daddr_t
185#endif
186#include "h/param.h"
187#include "h/vmmac.h"
188#include "h/time.h"
189#include "ufs/inode.h"
190#include "ufs/fs.h"
191#include "h/dbd.h"
192#if defined(AFS_HPUX1123_ENV)
193dbd_t *finddbd();
194#endif /* AFS_HPUX1123_ENV */
195#include "h/vfd.h"
196#include "h/region.h"
197#include "h/pregion.h"
198#include "h/vmmeter.h"
199#include "h/user.h"
200#include "h/sysinfo.h"
201#include "h/pfdat.h"
202#if !defined(AFS_HPUX1123_ENV)
203#include "h/tuneable.h"
204#endif
205#include "h/buf.h"
206#include "netinet/in.h"
207
208/* a freelist of one */
209struct buf *afs_bread_freebp = 0;
210
211/*
212 * Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
213 * Thus we can use fake bufs (ie not from the real buffer pool).
214 */
215afs_bread(vp, lbn, bpp)
216 struct vnode *vp;
217 kern_daddr_t lbn;
218 struct buf **bpp;
219{
220 int offset, fsbsize, error;
221 struct buf *bp;
222 struct iovec iov;
223 struct uio uio;
224
225 memset(&uio, 0, sizeof(uio));
226 memset(&iov, 0, sizeof(iov));
227
228 AFS_STATCNT(afs_bread);
229 fsbsize = vp->v_vfsp->vfs_bsize;
230 offset = lbn * fsbsize;
231 if (afs_bread_freebp) {
232 bp = afs_bread_freebp;
233 afs_bread_freebp = 0;
234 } else {
235 bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
236 bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
237 }
238
239 iov.iov_base = bp->b_un.b_addr;
240 iov.iov_len = fsbsize;
241 uio.afsio_iov = &iov;
242 uio.afsio_iovcnt = 1;
243 uio.afsio_seg = AFS_UIOSYS;
244 uio.afsio_offset = offset;
245 uio.afsio_resid = fsbsize;
246 uio.uio_fpflags = 0;
247 *bpp = 0;
248
249 error = afs_read(VTOAFS(vp), &uio, p_cred(u.u_procp), 0);
250 if (error) {
251 afs_bread_freebp = bp;
252 return error;
253 }
254 if (*bpp) {
255 afs_bread_freebp = bp;
256 } else {
257 *(struct buf **)&bp->b_vp = bp; /* mark as fake */
258 *bpp = bp;
259 }
260 return 0;
261}
262
263afs_brelse(vp, bp)
264 struct vnode *vp;
265 struct buf *bp;
266{
267 AFS_STATCNT(afs_brelse);
268
269 if ((struct buf *)bp->b_vp != bp) { /* not fake */
270 ufs_brelse(bp->b_vp, bp);
271 } else if (afs_bread_freebp) {
272 AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
273 AFS_KFREE(bp, sizeof(*bp));
274 } else {
275 afs_bread_freebp = bp;
276 }
277}
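/*
 * Sketch of the fake-buffer convention shared by afs_bread() and
 * afs_brelse() above (illustrative only, not compiled): bufs that
 * afs_bread() allocates from the kernel heap have b_vp pointed back at
 * the buf itself, which is how afs_brelse() tells them apart from bufs
 * that came from the real buffer cache and must go back via ufs_brelse().
 */
#if 0
static int
afs_buf_is_fake(struct buf *bp)
{
    return ((struct buf *)bp->b_vp == bp);
}
#endif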
278
279
280afs_bmap(avc, abn, anvp, anbn)
281 struct vcache *avc;
282 kern_daddr_t abn, *anbn;
283 struct vcache **anvp;
284{
285 AFS_STATCNT(afs_bmap);
286 if (anvp)
287 *anvp = avc;
288 if (anbn)
289 *anbn = abn * (8192 / DEV_BSIZE); /* in 512 byte units */
290 return 0;
291}
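/*
 * Worked example for the conversion above, assuming DEV_BSIZE is 512
 * as the in-line comment suggests: AFS logical blocks are 8 KB, so
 * logical block 3 maps to device block 3 * (8192 / 512) = 48.  The
 * vnode handed back is the vcache itself, since AFS has no underlying
 * device vnode to offer.
 */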
292
293afs_inactive(avc, acred)
294 struct vcache *avc;
295 afs_ucred_t *acred;
296{
297 struct vnode *vp = AFSTOV(avc);
298 ulong_t context;
299 lock_t *sv_lock;
300 if (afs_shuttingdown != AFS_RUNNING)
301 return;
302
303 /*
304 * In Solaris, HP-UX s800, and HP-UX 10.0 we are actually called with
305 * v_count 1 on the last reference!
306 */
307 MP_H_SPINLOCK_USAV(vn_h_sl_pool, vp, &sv_lock, &context);
308 if (avc->vrefCount < 1)
309 osi_Panic("afs_inactive : v_count < 1\n");
310
311 /*
312 * If more than 1 don't unmap the vnode but do decrement the ref count
313 */
314 vp->v_count--;
315 if (vp->v_count > 0) {
316 MP_SPINUNLOCK_USAV(sv_lock, context);
317 return 0;
318 }
319 MP_SPINUNLOCK_USAV(sv_lock, context);
320 afs_InactiveVCache(avc, acred);
321 return 0;
322}
323
324
325int
326mp_afs_open(struct vnode **avcp, int aflags, afs_ucred_t *acred)
327{
328 int code;
329
330 AFS_GLOCK();
331 code = afs_open(avcp, aflags, acred);
332 AFS_GUNLOCK();
333 return (code);
334}
335
336int
337mp_afs_close(struct vnode *avcp, int aflags, afs_ucred_t *acred)
338{
339 int code;
340
341 AFS_GLOCK();
342 code = afs_close(avcp, aflags, acred);
343 AFS_GUNLOCK();
344 return (code);
345}
346
347int
348mp_afs_rdwr(struct vnode *avcp, struct uio *uio, enum uio_rw arw,
349 int aio, afs_ucred_t *acred)
350{
351 int code;
352 long save_resid;
353
354 AFS_GLOCK();
355 save_resid = uio->uio_resid;
356 code = afs_rdwr(avcp, uio, arw, aio, acred);
357 if (arw == UIO_WRITE && code == ENOSPC) {
358 /* HP clears code if any data written. */
359 uio->uio_resid = save_resid;
360 }
361 AFS_GUNLOCK();
362 return (code);
363}
364
365int
366mp_afs_getattr(struct vnode *avcp, struct vattr *attrs,
367 afs_ucred_t *acred, enum vsync unused1)
368{
369 int code;
370
371 AFS_GLOCK();
372 code = afs_getattr(avcp, attrs, acred);
373 AFS_GUNLOCK();
374 return (code);
375}
376
377int
378mp_afs_setattr(struct vnode *avcp, struct vattr *attrs,
379 afs_ucred_t *acred, int unused1)
380{
381 int code;
382
383 AFS_GLOCK();
384 code = afs_setattr(avcp, attrs, acred);
385 AFS_GUNLOCK();
386 return (code);
387}
388
389int
390mp_afs_access(struct vnode *avcp, int mode, afs_ucred_t *acred)
391{
392 int code;
393
394 AFS_GLOCK();
395 code = afs_access(avcp, mode, acred);
396 AFS_GUNLOCK();
397 return (code);
398}
399
400int
401mp_afs_lookup(struct vnode *adp, char *aname,
402 struct vnode **avcp, afs_ucred_t *acred,
403 struct vnode *unused1)
404{
405 int code;
406
407 AFS_GLOCK();
408 code = afs_lookup(adp, aname, avcp, acred);
409 AFS_GUNLOCK();
410 return (code);
411}
412
413int
414mp_afs_create(struct vnode *adp, char *aname, struct vattr *attrs,
415 enum vcexcl aexcl, int amode, struct vnode **avcp,
416 afs_ucred_t *acred)
417{
418 int code;
419
420 AFS_GLOCK();
421 code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
422 AFS_GUNLOCK();
423 return (code);
424}
425
426
427int
428mp_afs_remove(struct vnode *adp, char *aname,
429 afs_ucred_t *acred)
430{
431 int code;
432
433 AFS_GLOCK();
434 code = afs_remove(adp, aname, acred);
435 AFS_GUNLOCK();
436 return (code);
437}
438
439int
440mp_afs_link(struct vnode *avc, struct vnode *adp,
441 char *aname, afs_ucred_t *acred)
442{
443 int code;
444
445 AFS_GLOCK();
446 code = afs_link(avc, adp, aname, acred);
447 AFS_GUNLOCK();
448 return (code);
449}
450
451int
452mp_afs_rename(struct vnode *aodp, char *aname1,
453 struct vnode *andp, char *aname2,
454 afs_ucred_t *acred)
455{
456 int code;
457
458 AFS_GLOCK();
459 code = afs_rename(aodp, aname1, andp, aname2, acred);
460 AFS_GUNLOCK();
461 return (code);
462}
463
464int
465mp_afs_mkdir(struct vnode *adp, char *aname, struct vattr *attrs,
466 struct vnode **avcp, afs_ucred_t *acred)
467{
468 int code;
469
470 AFS_GLOCK();
471 code = afs_mkdir(adp, aname, attrs, avcp, acred);
472 AFS_GUNLOCK();
473 return (code);
474}
475
476
477int
478mp_afs_rmdir(struct vnode *adp, char *aname, afs_ucred_t *acred)
479{
480 int code;
481
482 AFS_GLOCK();
483 code = afs_rmdir(adp, aname, acred);
484 AFS_GUNLOCK();
485 return (code);
486}
487
488
489int
490mp_afs_readdir(struct vnode *avc, struct uio *auio,
491 afs_ucred_t *acred)
492{
493 int code;
494
495 AFS_GLOCK();
496 code = afs_readdir(avc, auio, acred);
497 AFS_GUNLOCK();
498 return (code);
499}
500
501int
502mp_afs_symlink(struct vnode *adp, char *aname, struct vattr *attrs,
503 char *atargetName, afs_ucred_t *acred)
504{
505 int code;
506
507 AFS_GLOCK();
508 code = afs_symlink(adp, aname, attrs, atargetName, NULL, acred);
509 AFS_GUNLOCK();
510 return (code);
511}
512
513
514int
515mp_afs_readlink(struct vnode *avc, struct uio *auio,
516 afs_ucred_t *acred)
517{
518 int code;
519
520 AFS_GLOCK();
521 code = afs_readlink(avc, auio, acred);
522 AFS_GUNLOCK();
523 return (code);
524}
525
526int
527mp_afs_fsync(struct vnode *avc, afs_ucred_t *acred, int unused1)
528{
529 int code;
530
531 AFS_GLOCK();
532 code = afs_fsync(avc, acred);
533 AFS_GUNLOCK();
534 return (code);
535}
536
537int
538mp_afs_bread(struct vnode *avc, kern_daddr_t lbn, struct buf **bpp,
539 struct vattr *unused1, struct ucred *unused2)
540{
541 int code;
542
543 AFS_GLOCK();
544 code = afs_bread(avc, lbn, bpp);
545 AFS_GUNLOCK();
546 return (code);
547}
548
549int
550mp_afs_brelse(struct vnode *avc, struct buf *bp)
551{
552 int code;
553
554 AFS_GLOCK();
555 code = afs_brelse(avc, bp);
556 AFS_GUNLOCK();
557 return (code);
558}
559
560
561int
562mp_afs_inactive(struct vnode *avc, afs_ucred_t *acred)
563{
564 int code;
565
566 AFS_GLOCK();
567 code = afs_inactive(avc, acred);
568 AFS_GUNLOCK();
569 return (code);
570}
571
572int
573mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd,
574 afs_ucred_t *acred, struct file *unused1, off_t unused2,
575 off_t unused3)
576{
577 int code;
578
579 AFS_GLOCK();
580 code = afs_lockctl(avc, af, cmd, acred);
581 AFS_GUNLOCK();
582 return (code);
583}
584
585int
586mp_afs_fid(struct vnode *avc, struct fid **fidpp)
587{
588 int code;
589
590 AFS_GLOCK();
591 code = afs_fid(avc, fidpp);
592 AFS_GUNLOCK();
593 return (code);
594}
595
596int
597mp_afs_readdir2(struct vnode *avc, struct uio *auio,
598 afs_ucred_t *acred)
599{
600 int code;
601
602 AFS_GLOCK();
603 code = afs_readdir2(avc, auio, acred);
604 AFS_GUNLOCK();
605 return (code);
606}
607
608
609struct vnodeops Afs_vnodeops = {
610 mp_afs_open,
611 mp_afs_close,
612 mp_afs_rdwr,
613 afs_ioctl,
614 afs_noop,
615 mp_afs_getattr,
616 mp_afs_setattr,
617 mp_afs_access,
618 mp_afs_lookup,
619 mp_afs_create,
620 mp_afs_remove,
621 mp_afs_link,
622 mp_afs_rename,
623 mp_afs_mkdir,
624 mp_afs_rmdir,
625 afs_readdir,
626 mp_afs_symlink,
627 mp_afs_readlink,
628 mp_afs_fsync,
629 mp_afs_inactive,
630 afs_bmap,
631 afs_hp_strategy,
632#if !defined(AFS_NONFSTRANS)
633 /* On HP-UX 10.2 the NFS translator calls afs_bread but does
634 * not call afs_brelse, so we would see a memory leak. If the
635 * VOP_BREAD() call fails, then NFS does VOP_RDWR() to get
636 * the same data: this is the path we follow now. */
637 afs_noop,
638 afs_noop,
639#else
640 mp_afs_bread,
641 mp_afs_brelse,
642#endif
643 afs_badop, /* pathsend */
644 afs_noop, /* setacl */
645 afs_noop, /* getacl */
646 afs_pathconf,
647 afs_pathconf,
648 mp_afs_lockctl,
649 afs_lockf, /* lockf */
650 mp_afs_fid,
651 afs_noop, /*fsctl */
652 afs_badop,
653 afs_pagein,
654 afs_pageout,
655 NULL,
656 NULL,
657 afs_prealloc,
658 afs_mapdbd,
659 afs_mmap,
660 afs_cachelimit,
661 afs_vm_checkpage,
662 afs_vm_fscontiguous,
663 afs_vm_stopio,
664 afs_read_ahead,
665 afs_release,
666 afs_unmap,
667 afs_swapfs_len,
668 mp_afs_readdir2,
669 afs_readdir3,
670};
671
672struct vnodeops *afs_ops = &Afs_vnodeops;
673
674/* vnode file operations, and our own */
675extern int vno_rw();
676extern int vno_ioctl();
677extern int vno_select();
678extern int afs_closex();
679extern int vno_close();
680struct fileops afs_fileops = {
681 vno_rw,
682 vno_ioctl,
683 vno_select,
684 afs_close,
685};
686
687#define vtoblksz(vp) ((vp)->v_vfsp->vfs_bsize)
688
689/*
690 ********************************************************************
691 ****
692 **** afspgin_setup_io_ranges ()
693 **** similar to: nfspgin_setup_io_ranges ()
694 ********************************************************************
695 */
696pgcnt_t
697afspgin_setup_io_ranges(vfspage_t * vm_info, pgcnt_t bpages, k_off_t isize,
698 pgcnt_t startindex)
699{
700 pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
701 pgcnt_t minpage; /* first page to bring in */
702 pgcnt_t maxpage; /* one past last page to bring in */
703 pgcnt_t maxpagein;
704 pgcnt_t multio_maxpage;
705 kern_daddr_t start_blk;
706 dbd_t *dbd;
707 expnd_flags_t up_reason, down_reason;
708 int count = 1;
709 int indx = 0;
710 int max_num_io;
711 int dbdtype;
712 preg_t *prp;
713
714 VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);
715
716 /*
717 * We do not go past the end of the current pregion nor past the end
718 * of the current file.
719 */
720
721 maxpage = startindex + (bpages - (startindex + file_offset) % bpages);
722 maxpage = vm_reset_maxpage(vm_info, maxpage);
723 maxpage = MIN(maxpage, (pgcnt_t) btorp(isize) - file_offset);
724 maxpage = MIN(maxpage, startindex + maxpagein);
725 multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);
726
727 if (!maxpage)
728 return (0);
729
730 VASSERT(maxpage >= startindex);
731
732 /*
733 * Expanding the fault will create calls to FINDENTRY() for new
734 * pages, which will obsolete "dbd", so copy what it points to
735 * and clear it to prevent using stale data.
736 */
737
738 prp = VM_PRP(vm_info);
739 dbdtype = DBD_TYPE(vm_info);
740 start_blk = DBD_DATA(vm_info);
741 vm_info->dbd = NULL;
742 vm_info->vfd = NULL;
743 VASSERT(dbdtype != DBD_NONE);
744
745 if (max_num_io == 1) {
746 /*
747 * We need to set up one I/O: First we attempt to expand the
748 * I/O forward. Then we expand the I/O backwards.
749 */
750 count =
751 expand_faultin_up(vm_info, dbdtype, (int)bpages, maxpage, count,
752 startindex, start_blk, &up_reason);
753 maxpage = startindex + count;
754 VASSERT(maxpage <= startindex + maxpagein);
755 minpage = startindex - (startindex + file_offset) % bpages;
756 minpage = MAX(minpage, maxpage - maxpagein);
757 VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
758 minpage = vm_minpage(vm_info, minpage);
759 VASSERT(minpage <= startindex);
760 count =
761 expand_faultin_down(vm_info, dbdtype, (int)bpages, minpage, count,
762 &startindex, &start_blk, &down_reason);
763 VM_SET_IO_STARTINDX(vm_info, 0, startindex);
764 VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
765 VM_SET_IO_COUNT(vm_info, 0, count);
766 VM_SET_NUM_IO(vm_info, 1);
767 }
768
769 if (max_num_io > 1) {
770 /*
771 * We need to set up multiple I/O information; beginning
772 * with the startindex, we will expand upwards. The expansion
773 * could stop for one of 2 reasons; we take the appropriate
774 * action in each of these cases:
775 * o VM reasons: abort setting up the multiple I/O
776 * information and return to our caller indicating
777 * that "retry" is required.
778 * o pagelimit: set up the next I/O info [we may have
779 * reached multio_maxpage at this point].
780 * Note that expansion involves no more than a block at a time;
781 * hence it could never stop for the "discontiguous block"
782 * reason.
783 */
784 startindex = minpage = vm_minpage(vm_info, 0);
785 for (indx = 0; (indx < max_num_io) && (startindex < multio_maxpage);
786 indx++, startindex += count) {
787 dbd = FINDDBD(prp->p_reg, startindex);
788 start_blk = dbd->dbd_data;
789 maxpage =
790 startindex + (bpages - (startindex + file_offset) % bpages);
791 maxpage = min(maxpage, multio_maxpage);
792 count =
793 expand_faultin_up(vm_info, dbdtype, bpages, maxpage,
794 1 /* count */ ,
795 startindex, start_blk, &up_reason);
796 VM_SET_IO_STARTINDX(vm_info, indx, startindex);
797 VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
798 VM_SET_IO_COUNT(vm_info, indx, count);
799 if (up_reason & VM_REASONS)
800 break;
801 VASSERT(!(up_reason & NONCONTIGUOUS_BLOCK));
802 VASSERT(up_reason & PAGELIMIT);
803 }
804 if (startindex < multio_maxpage) {
805 VM_MULT_IO_FAILURE(vm_info);
806 VM_REINIT_FAULT_DBDVFD(vm_info);
807 return (0); /* retry */
808 }
809 count = maxpagein;
810 VM_SET_NUM_IO(vm_info, indx);
811 }
812
813 /*
814 * Tell VM where the I/O intends to start. This may be different
815 * from the faulting point.
816 */
817
818 VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
819
820 return (count);
821
822}
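/*
 * Worked example for the maxpage computation at the top of
 * afspgin_setup_io_ranges() (illustrative, with assumed values): with
 * 4 KB pages and an 8 KB filesystem block, bpages == 2.  For
 * file_offset == 0 and startindex == 5, (startindex + file_offset) %
 * bpages == 1, so maxpage starts out as 5 + (2 - 1) == 6, i.e. the
 * forward expansion is initially capped at the next block boundary
 * before the file-size and maxpagein clamps are applied.
 */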
823
824/*
825 ********************************************************************
826 ****
827 **** afspgin_blkflsh ()
828 **** similar to: nfspgin_blkflsh ()
829 ********************************************************************
830 */
831retval_t
832afspgin_blkflsh(vfspage_t * vm_info, struct vnode * devvp, pgcnt_t * num_4k)
833{
834 int flush_reslt = 0;
835 pgcnt_t count = *num_4k;
836 pgcnt_t page_count;
837 int indx = 0;
838 int num_io = VM_GET_NUM_IO(vm_info);
839
840 /*
841 * On this blkflush() we don't want to purge the buffer cache and we do
842 * want to wait, so the flags are '0'.
843 */
844
845 for (indx = 0; indx < num_io; indx++) {
846 flush_reslt =
847 blkflush(devvp, (kern_daddr_t) VM_GET_IO_STARTBLK(vm_info, indx),
848 ptob(VM_GET_IO_COUNT(vm_info, indx)), 0,
849 VM_REGION(vm_info));
850 if (flush_reslt) {
851 vm_lock(vm_info);
852 if (vm_page_now_valid(vm_info, &page_count)) {
853 vm_release_memory(vm_info);
854 vm_release_structs(vm_info);
855 *num_4k = page_count;
856 return (VM_PAGE_PRESENT);
857 }
858 return (VM_RETRY);
859 }
860 }
861 return (VM_DONE);
862}
863
864/*
865 ********************************************************************
866 ****
867 **** afspgin_io ()
868 **** similar to: nfspgin_io ()
869 ********************************************************************
870 */
871int
872afspgin_io(vfspage_t * vm_info, struct vnode *devvp, pgcnt_t bpages,
873 pgcnt_t maxpagein, pgcnt_t count)
874{
875 int i;
876 int error = 0;
877 caddr_t vaddr = VM_ADDR(vm_info);
878 caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
879 pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
880 preg_t *prp = VM_PRP(vm_info);
881 int wrt = VM_WRT(vm_info);
882 space_t space = VM_SPACE(vm_info);
883 int num_io = VM_GET_NUM_IO(vm_info);
884
885#ifdef notdef /* Not used in AFS */
886 /*
887 * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
888 * be used in this case.
889 *
890 * Unlike UFS, NFS does not start the faulting page I/O
891 * asynchronously. Why? Asynchronous requests are handled by the
892 * biods. It doesn't make sense to queue up the faulting request
893 * behind other asynchronous requests. This is not true for UFS,
894 * where the asynchronous request is handled immediately.
895 */
896
897 if ((VM_READ_AHEAD_ALLOWED(vm_info)) && (nfs_read_ahead_on)
898 && (NFS_DO_READ_AHEAD) && (should_do_read_ahead(prp, vaddr))) {
899
900 pgcnt_t max_rhead_io;
901 caddr_t rhead_vaddr;
902 pgcnt_t total_rheads_allowed;
903
904 /*
905 * Determine the maximum amount of read-ahead I/O.
906 */
907 total_rheads_allowed = maxpagein - count;
908
909 /*
910 * If the count is less than a block, raise it to one.
911 */
912 if (total_rheads_allowed < bpages)
913 total_rheads_allowed = bpages;
914
915 max_rhead_io = total_rheads_allowed;
916 rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count * NBPG);
917 error =
918 nfs_read_ahead(vm_info->vp, prp, wrt, space, rhead_vaddr,
919 &max_rhead_io);
920
921 /*
922 * Set the next fault location. If read_ahead launches any
923 * I/O it will adjust it accordingly.
924 */
925 vm_info->prp->p_nextfault = vm_info->startindex + count;
926
927 /*
928 * Now perform the faulting I/O synchronously.
929 */
930 vm_unlock(vm_info);
931
932 error =
933 syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, 0),
934 VM_MAPPED_SPACE(vm_info), VM_MAPPED_ADDR(vm_info),
935 (int)ptob(count), B_READ, devvp,
936 B_vfs_pagein | B_pagebf, VM_REGION(vm_info));
937 } else
938#endif
939 {
940 virt_addr = VM_MAPPED_ADDR(vm_info);
941 vm_unlock(vm_info);
942 for (i = 0; i < num_io; i++) {
943 /*
944 * REVISIT -- investigate doing asyncpageio().
945 */
946 error |= (io[i].error =
947 syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, i),
948 VM_MAPPED_SPACE(vm_info), virt_addr,
949 (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
950 B_READ, devvp, B_vfs_pagein | B_pagebf,
951 VM_REGION(vm_info)));
952 virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
953 }
954 /*
955 * Set the next fault location. If read_ahead launches any
956 * I/O it will adjust it accordingly.
957 */
958 vm_info->prp->p_nextfault = vm_info->startindex + count;
959 }
960
961 return (error);
962}
963
964/*
965 ********************************************************************
966 ****
967 **** afspgin_update_dbd ()
968 **** similar to: nfspgin_update_dbd ()
969 ********************************************************************
970 */
971void
972afspgin_update_dbd(vfspage_t * vm_info, int bsize)
973{
974 k_off_t off;
975 pgcnt_t count = bsize / NBPG;
976 k_off_t rem;
977 pgcnt_t m;
978 pgcnt_t pgindx;
979 kern_daddr_t blkno;
980 int num_io = VM_GET_NUM_IO(vm_info);
981 int i;
982
983 for (i = 0; i < num_io; i++) {
984
985 pgindx = VM_GET_IO_STARTINDX(vm_info, i);
986 off = vnodindx(VM_REGION(vm_info), pgindx);
987 rem = off % bsize;
988 blkno = VM_GET_IO_STARTBLK(vm_info, i);
989
990 VASSERT(bsize % NBPG == 0);
991 VASSERT(rem % NBPG == 0);
992
993 pgindx -= (pgcnt_t) btop(rem);
994 blkno -= (kern_daddr_t) btodb(rem);
995
996 /*
997 * This region could start in mid-block. If so, pgindx
998 * could be less than 0, so we adjust pgindx and blkno back
999 * up so that pgindx is 0.
1000 */
1001
1002 if (pgindx < 0) {
1003 pgcnt_t prem;
1004 prem = 0 - pgindx;
1005 pgindx = 0;
1006 count -= prem;
1007 blkno += btodb(ptob(prem));
1008 }
1009
1010 for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
1011 m++, pgindx++, blkno += btodb(NBPG)) {
1012 /*
1013 * Note: since this only changes one block, it
1014 * assumes only one block was faulted in. Currently
1015 * this is always true for remote files, and we only
1016 * get here for remote files, so everything is ok.
1017 */
1018 vm_mark_dbd(vm_info, pgindx, blkno);
1019 }
1020 }
1021}
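/*
 * Worked example for the mid-block adjustment above (illustrative,
 * with 4 KB pages and an 8 KB block): if the region begins 4096 bytes
 * into a block, then for an I/O that starts at region page 0 we get
 * rem == 4096, so pgindx is backed up by btop(4096) == 1 page and goes
 * negative.  The pgindx < 0 branch then sets prem == 1, restores
 * pgindx to 0, drops count from 2 to 1, and advances blkno by
 * btodb(4096) so that only the pages actually inside the region are
 * marked.
 */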
1022
1023int
1024afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
1025 struct vnode *vp;
1026 preg_t *prp;
1027 int wrt;
1028 space_t space;
1029 caddr_t vaddr;
1030 pgcnt_t *ret_startindex;
1031{
1032 pgcnt_t startindex;
1033 pgcnt_t pgindx = *ret_startindex;
1034 pgcnt_t maxpagein;
1035 struct vnode *devvp;
1036 pgcnt_t count;
1037 kern_daddr_t start_blk = 0;
1038 int bsize;
1039 int error;
1040 k_off_t isize;
1041 int shared; /* writable memory mapped file */
1042 retval_t retval = 0;
1043 pgcnt_t ok_dbd_limit = 0; /* last dbd that we can trust */
1044 pgcnt_t bpages; /* number of pages per block */
1045 pgcnt_t page_count;
1046 vfspage_t *vm_info = NULL;
1047 int done;
1048
1049 struct vattr va;
1050
1051 caddr_t nvaddr;
1052 space_t nspace;
1053 int change_to_fstore = 0; /* need to change dbds to DBD_FSTORE */
1054 int flush_start_blk = 0;
1055 int flush_end_blk = 0;
1056
1057 int i, j;
1058
1059 AFS_STATCNT(afs_pagein);
1060 vmemp_lockx(); /* lock down VM empire */
1061
1062 /* Initialize the VM info structure */
1063 done =
1064 vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1065 LGPG_ENABLE);
1066
1067 /* Check to see if we slept and the page was faulted in. */
1068 if (done) {
1069 vm_release_structs(vm_info);
1070 vmemp_returnx(1);
1071 }
1072
1073 vp = VM_GET_PAGEIN_VNODE(vm_info);
1074 VASSERT(vp != NULL);
1075 shared = VM_SHARED_OBJECT(vm_info);
1076 VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1077
1078 /*
1079 * Get the devvp and block size for this vnode type
1080 */
1081 devvp = vp;
1082 bsize = vp->v_vfsp->vfs_bsize;
1083 if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1084 osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1085
1086 bpages = (pgcnt_t) btop(bsize);
1087 VASSERT(bpages > 0);
1088 VM_SET_FS_MAX_PAGES(vm_info, bpages);
1089
1090 /* this trace cannot be here because the afs_global lock might not be
1091 * held at this point. We hold the vm global lock throughout
1092 * this procedure ( and not the AFS global lock )
1093 * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1094 * ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1095 * ICL_TYPE_LONG, shared);
1096 */
1097 /* Come here if we have to release the region lock before
1098 * locking pages. This can happen in memreserve() and
1099 * blkflush().
1100 */
1101 retry:
1102 /*
1103 * For remote files like ours, we want to check to see if the file has shrunk.
1104 * If so, we should invalidate any pages past the end. In the name
1105 * of efficiency, we only do this if the page we want to fault is
1106 * past the end of the file.
1107 */
1108 {
1109 if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1110 VM_ZOMBIE_OBJECT(vm_info);
1111 vm_release_memory(vm_info);
1112 vm_release_structs(vm_info);
1113 vmemp_returnx(0);
1114 }
1115 isize = va.va_size;
1116 if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1117 /*
1118 * The file has shrunk and someone is trying to access a
1119 * page past the end of the object. Shrink the object back
1120 * to its currrent size, send a SIGBUS to the faulting
1121 * process and return.
1122 *
1123 * We must release the region lock before calling mtrunc(),
1124 * since mtrunc() locks all the regions that are using this
1125 * file.
1126 */
1127 vm_release_memory(vm_info);
1128 vm_truncate_region(vm_info, isize);
1129 vm_release_structs(vm_info);
1130 vmemp_returnx(-SIGBUS);
1131 }
1132 }
1133
1134 maxpagein = vm_pick_maxpagein(vm_info);
1135 if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1136 /* Check to see if we should continue faulting. */
1137 if (vm_page_now_valid(vm_info, &page_count)) {
1138 vm_release_memory(vm_info);
1139 vm_release_structs(vm_info);
1140 vmemp_returnx(page_count);
1141 }
1142 }
1143 if (count = vm_no_io_required(vm_info)) {
1144 /* Release any excess memory. */
1145 vm_release_memory(vm_info);
1146 vm_release_structs(vm_info);
1147 vmemp_returnx(count);
1148 }
1149#ifdef OSDEBUG
1150 /*
1151 * We should never have DBD_HOLE pages in a non-MMF region.
1152 */
1153 if (!shared)
1154 VASSERT(dbd->dbd_type != DBD_HOLE);
1155#endif
1156 VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1157
1158 startindex = *ret_startindex;
1159
1160 /*
1161 * If the page we want is in memory already, take it
1162 */
1163 if (VM_MEMORY_RESERVED(vm_info) < maxpagein) {
1164 /* pick up the rest of memory now. */
1165 if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1166 if (vm_page_now_valid(vm_info, &page_count)) {
1167 vm_release_memory(vm_info);
1168 vm_release_structs(vm_info);
1169 vmemp_returnx(page_count);
1170 }
1171 goto retry;
1172 }
1173 }
1174
1175 if (!
1176 (count =
1177 afspgin_setup_io_ranges(vm_info, bpages, isize, startindex))) {
1178 goto retry;
1179 }
1180
1181 startindex = VM_GET_STARTINDX(vm_info);
1182
1183 VASSERT(maxpagein >= count);
1184
1185 /*
1186 * Release the memory we won't need.
1187 */
1188 if (count < maxpagein) {
1189 vm_release_excess_memory(vm_info,
1190 (VM_MEMORY_RESERVED(vm_info) - count));
1191 }
1192
1193 retval = afspgin_blkflsh(vm_info, devvp, &count);
1194
1195 if (retval == VM_RETRY) {
1196 goto retry;
1197 }
1198
1199 if (retval == VM_PAGE_PRESENT)
1200 return (count);
1201
1202#if 0
1203 /*
1204 * The definition of krusage_cntr_t is in h/kmetric.h, which
1205 * is not shipped. Since it's just statistics, we punt and do
1206 * not update it. If it's a problem we'll need to get HP to export
1207 * an interface that we can use to increment the counter.
1208 */
1209
1210 /* It's a real fault, not a reclaim */
1211 {
1212 krusage_cntr_t *temp;
1213 temp = kt_cntrp(u.u_kthreadp);
1214 temp->krc_majflt++;
1215 }
1216#endif
1217
1218 /*
1219 * Tell VM where the I/O intends to start. This may be different
1220 * from the faulting point.
1221 */
1222
1223 /*
1224 * vm_prepare_io will fill the region with pages and release the
1225 * region lock.
1226 */
1227 vm_prepare_io(vm_info, &count);
1228
1229 /*
1230 * Count may have been adjusted, check to make sure it's non-zero.
1231 */
1232 if (count == 0) {
1233 if (vm_retry(vm_info)) {
1234 goto retry;
1235 }
1236
1237 /*
1238 * Release resources and retry the fault. Release any excess
1239 * memory.
1240 */
1241
1242 vm_release_memory(vm_info);
1243 vm_release_structs(vm_info);
1244 vmemp_returnx(0);
1245 }
1246
1247 error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1248
1249 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1250 retval = -SIGBUS;
1251 VM_ZOMBIE_OBJECT(vm_info);
1252 goto backout;
1253 }
1254 /*
1255 * For a writable memory mapped file that is remote we must
1256 * detect potential holes in the file and force allocation of
1257 * disk space on the remote system. Unfortunately, there is
1258 * no easy way to do this, so this gets a little ugly.
1259 */
1260 if (shared && wrt) {
1261 /*
1262 * See if the user wants to write to this page. Write some
1263 * minimal amount of data back to the remote file to
1264 * force allocation of file space. We only need to
1265 * write a small amount, since holes are always at
1266 * least one filesystem block in size.
1267 */
1268 error = vm_alloc_hole(vm_info);
1269
1270 /*
1271 * If some sort of I/O error occurred we generate a
1272 * SIGBUS for the process that caused the write,
1273 * undo our page locks, etc and return.
1274 */
1275 if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1276 VM_ZOMBIE_OBJECT(vm_info);
1277 retval = -SIGBUS;
1278 goto backout;
1279 }
1280
1281 /*
1282 * Change these dbds to DBD_FSTORE. We cannot do it here,
1283 * since the region must be locked, and it is not locked
1284 * at the moment. We cannot lock the region yet, as we
1285 * first have to release the page locks.
1286 */
1287 change_to_fstore = 1;
1288 }
1289
1290 vm_finish_io(vm_info, count);
1291
1292 /*
1293 * Acquire the lock before we play around with changing the vfd's.
1294 */
1295 vm_lock(vm_info);
1296
1297 if (change_to_fstore)
1298 afspgin_update_dbd(vm_info, bsize);
1299
1300#if defined(AFS_HPUX110_ENV)
1301 getppdp()->cnt.v_exfod += count;
1302#else
1303 mpproc_info[getprocindex()].cnt.v_exfod += count;
1304#endif
1305 vmemp_unlockx(); /* free up VM empire */
1306 *ret_startindex = startindex;
1307
1308 /*
1309 * In case we have any excess memory...
1310 */
1311 if (VM_MEMORY_RESERVED(vm_info))
1312 vm_release_memory(vm_info);
1313 vm_release_structs(vm_info);
1314
1315 return count;
1316
1317 backout:
1318
1319 vm_finish_io_failed(vm_info, count);
1320
1321 vm_lock(vm_info);
1322
1323 vm_undo_validation(vm_info, count);
1324
1325 /*
1326 * In case we have any excess memory...
1327 */
1328 if (VM_MEMORY_RESERVED(vm_info))
1329 vm_release_memory(vm_info);
1330 vm_release_structs(vm_info);
1331
1332 vmemp_unlockx(); /* free up VM empire */
1333 return retval;
1334}
1335
1336int
1337afs_pageout(vp, prp, start, end, flags)
1338 struct vnode *vp; /* not used */
1339 preg_t *prp;
1340 pgcnt_t start;
1341 pgcnt_t end;
1342 int flags;
1343{
1344 struct vnode *filevp;
1345 struct vnode *devvp;
1346 pgcnt_t i;
1347 int steal;
1348 int vhand;
1349 int hard;
1350 int *piocnt; /* wakeup counter used if PAGEOUT_WAIT */
1351 struct ucred *old_cred;
1352 vfspage_t vm_info;
1353 fsdata_t args;
1354
1355 int inode_changed = 0;
1356 int file_is_remote;
1357 struct inode *ip;
1358
1359 AFS_STATCNT(afs_pageout);
1360
1361 steal = (flags & PAGEOUT_FREE);
1362 vhand = (flags & PAGEOUT_VHAND);
1363 hard = (flags & PAGEOUT_HARD);
1364
1365 vmemp_lockx();
1366
1367 /* Initialize the VM info structure. */
1368 vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1369
1370 /*
1371 * If the region is marked "don't swap", then don't steal any pages
1372 * from it. We can, however, write dirty pages out to disk (only if
1373 * PAGEOUT_FREE is not set).
1374 */
1375 if (vm_no_pageout(&vm_info)) {
1376 vmemp_unlockx();
1377 return (0);
1378 }
1379
1380 /*
1381 * If caller wants to wait until the I/O is complete.
1382 */
1383 vm_setup_wait_for_io(&vm_info);
1384
1385 filevp = VM_GET_PAGEOUT_VNODE(&vm_info); /* always page out to back store */
1386 VASSERT(filevp != NULL);
1387
1388 memset((caddr_t) & args, 0, sizeof(fsdata_t));
1389 args.remote_down = 0; /* assume remote file servers are up */
1390 args.remote = 1; /* we are remote */
1391 args.bsize = 0; /* filled up later by afs_vm_checkpage() */
1392
1393 if (filevp->v_fstype == VUFS) {
1394 ip = VTOI(filevp);
1395 devvp = ip->i_devvp;
1396 file_is_remote = 0;
1397 } else {
1398 file_is_remote = 1;
1399 devvp = filevp;
1400
1401 /*
1402 * If we are vhand(), and this is an NFS file, we need to
1403 * see if the NFS server is "down". If so, we decide
1404 * if we will try to talk to it again, or defer pageouts
1405 * of dirty NFS pages until a future time.
1406 */
1407#ifdef notdef
1408 if (vhand && filevp->v_fstype == VNFS && vtomi(filevp)->mi_down
1409 && vtomi(filevp)->mi_hard) {
1410 extern afs_int32 vhand_nfs_retry;
1411 /*
1412 * If there is still time left on our timer, we will
1413 * not talk to this server right now.
1414 */
1415 if (vhand_nfs_retry > 0)
1416 args.remote_down = 1;
1417 }
1418#endif
1419 }
1420
1421 /*
1422 * Initialize args. We set bsize to 0 to tell vfs_vfdcheck() that
1423 * it must get the file size and other attributes if it comes across
1424 * a dirty page.
1425 */
1426 vm_info.fs_data = (caddr_t) & args;
1427
1428 /* this trace cannot be here because the afs_global lock might not be
1429 * held at this point. We hold the vm global lock throughout
1430 * this procedure ( and not the AFS global lock )
1431 * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1432 * ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1433 */
1434
1435 i = start;
1436
1437 while (i <= end) {
1438 struct buf *bp;
1439 k_off_t start;
1440 pgcnt_t npages;
1441 k_off_t nbytes;
1442 int error;
1443
1444 extern int pageiodone();
1445 space_t nspace;
1446 caddr_t nvaddr;
1447
1448 /*
1449 * Ask the VM system to find the next run of pages.
1450 */
1451 vm_find_next_range(&vm_info, i, end);
1452
1453 /*
1454 * It's possible that the remote file shrank in size. Check the flags
1455 * to see if the request was beyond the end of the file. If it was,
1456 * truncate the region to the file size and continue. We could be in the
1457 * middle of a run, so continue after the truncation; there may still be
1458 * some I/O to write out.
1459 */
1460 if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1461 pgcnt_t pglen = (pgcnt_t) btorp(args.isize);
1462
1463 /*
1464 * This page is past the end of the file. Unlock this page
1465 * (region_trunc will throw it away) and then call
1466 * region_trunc() to invalidate all pages past the new end of
1467 * the file.
1468 */
1469 region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1470
1471 /*
1472 * remove the truncation flag.
1473 */
1474 VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1475 }
1476
1477 if (VM_NO_PAGEOUT_RUN(&vm_info))
1478 break;
1479
1480 /*
1481 * We have a run of dirty pages [args.start...args.end].
1482 */
1483 VASSERT(filevp->v_fstype != VCDFS);
1484 VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1485 VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1486
1487 /*
1488 * We will be doing an I/O on the region, let the VM system know.
1489 */
1490 (void)vm_up_physio_count(&vm_info);
1491
1492 /*
1493 * Okay, get set to perform the I/O.
1494 */
1495 inode_changed = 1;
1496 npages =
1497 (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1498 VM_START_PAGEOUT_INDX(&vm_info);
1499
1500 /*
1501 * Allocate and initialize an I/O buffer.
1502 */
1503 bp = bswalloc();
1504 vm_init_bp(&vm_info, bp); /* Let the VM system initialize */
1505
1506 /* Identify this buffer for KI */
1507 bp->b_bptype = B_vfs_pageout | B_pagebf;
1508
1509 if (steal)
1510 bp->b_flags = B_CALL | B_BUSY | B_PAGEOUT; /* steal pages */
1511 else
1512 bp->b_flags = B_CALL | B_BUSY; /* keep pages */
1513
1514 /*
1515 * If we are vhand paging over NFS, we will wait for the I/O
1516 * to complete.
1517 */
1518 if (vhand && filevp->v_fstype == VNFS) {
1519 bp->b_flags &= ~B_CALL;
1520 } else {
1521 bp->b_iodone = (int (*)())pageiodone;
1522 }
1523
1524 /*
1525 * Make sure we do not write past the end of the file.
1526 */
1527 nbytes = ptob(npages);
1528 start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1529 if (start + nbytes > args.isize) {
1530#ifdef OSDEBUG
1531 /*
1532 * The amount we are off better not be bigger than a
1533 * filesystem block.
1534 */
1535 if (start + nbytes - args.isize >= args.bsize) {
1536 osi_Panic("afs_pageout: remainder too large");
1537 }
1538#endif
1539 /*
1540 * Reset the size of the I/O as necessary. For remote
1541 * files, we set the size to the exact number of bytes to
1542 * the end of the file. For local files, we round this up
1543 * to the nearest DEV_BSIZE chunk since disk I/O must always
1544 * be in multiples of DEV_BSIZE. In this case, we do not
1545 * bother to zero out the data past the "real" end of the
1546 * file, this is done when the data is read (either through
1547 * mmap() or by normal file system access).
1548 */
1549 if (file_is_remote)
1550 nbytes = args.isize - start;
1551 else
1552 nbytes = roundup(args.isize - start, DEV_BSIZE);
1553 }
1554
1555 /*
1556 * Now get ready to perform the I/O
1557 */
1558 if (!vm_protect_pageout(&vm_info, npages)) {
1559 VASSERT(vhand);
1560 vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1561 vm_finish_io_failed(&vm_info, npages);
1562 bswfree(bp);
1563 break;
1564 }
1565 /*
1566 * If this is an NFS write by vhand(), we will not be calling
1567 * pageiodone(). asyncpageio() increments parolemem for us
1568 * if bp->b_iodone is pageiodone, so we must do it manually
1569 * if pageiodone() will not be called automatically.
1570 */
1571 if (!(bp->b_flags & B_CALL) && steal) {
1572 ulong_t context;
1573
1574 SPINLOCK_USAV(pfdat_lock, context);
1575 parolemem += btorp(nbytes);
1576 SPINUNLOCK_USAV(pfdat_lock, context);
1577 }
1578 blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1579 (BX_NOBUFWAIT | BX_PURGE), VM_REGION(&vm_info));
1580
1581 /*
1582 * If vhand is the one paging things out, and this is an NFS
1583 * file, we need to temporarily become a different user so
1584 * that we are not trying to page over NFS as root. We use
1585 * the user credentials associated with the writable file
1586 * pointer that is in the pseudo-vas for this MMF.
1587 *
1588 * NOTE: we are currently using "va_rss" to store the ucred
1589 * value in the vas (this should be fixed in 10.0).
1590 */
1591 old_cred = kt_cred(u.u_kthreadp);
1592 if (vhand) {
1593#if defined(AFS_HPUX1123_ENV)
1594 /*
1595 * DEE - 11.23 does not have vas.h, and it looks like
1596 * we should never be called with an NFS-type file anyway,
1597 * so where did this come from? Was it copied from NFS?
1598 * I assume it was, so we will add an assert for now
1599 * and see if the code runs at all.
1600 */
1601 VASSERT(filevp->v_fstype != VNFS);
1602#else
1603 set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1604
1605 /*
1606 * If root was the one who opened the mmf for write,
1607 * va_cred will be NULL. So reset kt_cred(u.u_kthreadp) to what it
1608 * was. We will page out as root, but that is the
1609 * correct thing to do in this case anyway.
1610 */
1611 if (kt_cred(u.u_kthreadp) == NULL)
1612 set_kt_cred(u.u_kthreadp, old_cred);
1613#endif
1614 }
1615
1616 /*
1617 * Really do the I/O.
1618 */
1619 error =
1620 asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1621 VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1622 (int)nbytes, B_WRITE, devvp);
1623
1624 VASSERT(error == 0);
1625
1626#ifdef notdef
1627 /*
1628 * If we are vhand paging over NFS we want to wait for the
1629 * I/O to complete and take the appropriate actions if an
1630 * error is encountered.
1631 */
1632 if (vhand) {
1633 if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1634 /*
1635 * The server is down, ignore this failure, and
1636 * try again later. (rfscall() has set our retry
1637 * timer).
1638 */
1639 fsdata.remote_down = 1;
1640 pageiocleanup(bp, 0);
1641
1642 /*
1643 * vm_vfdcheck() has cleared the valid bit on the
1644 * vfds for these pages. We must go back and set the
1645 * valid bit, as the pages are really not gone.
1646 *
1647 * NOTE: we can do this because we still hold (and have
1648 * not released) the region lock.
1649 */
1650 if (steal)
1651 vm_undo_invalidation(&vm_info, vm_info.start,
1652 vm_info.end);
1653 } else {
1654 /*
1655 * The I/O succeeded, or we had an error that we do
1656 * not want to defer until later. Call pageidone()
1657 * to handle things.
1658 */
1659 pageiodone(bp);
1660 }
1661 }
1662#endif
1663
1664 /*
1665 * And restore our credentials to what they were.
1666 */
1667 set_kt_cred(u.u_kthreadp, old_cred);
1668
1669 /*
1670 * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1671 * can now unreserve it.
1672 */
1673 if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1674 vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1675 vm_release_malloc_memory();
1676 }
1677
1678 /*
1679 * Update statistics
1680 */
1681 if (steal) {
1682 if (flags & PF_DEACT) {
1683#if defined(AFS_HPUX110_ENV)
1684 getppdp()->cnt.v_pswpout += npages;
1685#else
1686 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1687#endif
1688/* sar_bswapout += ptod(npages);*/
1689 } else if (vhand) {
1690#if defined(AFS_HPUX110_ENV)
1691 getppdp()->cnt.v_pgout++;
1692 getppdp()->cnt.v_pgpgout += npages;
1693#else
1694 mpproc_info[getprocindex()].cnt.v_pgout++;
1695 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1696#endif
1697 }
1698 }
1699
1700 /*
1701 * If time and patience have delivered enough
1702 * pages, then quit now while we are ahead.
1703 */
1704 if (VM_STOP_PAGING(&vm_info))
1705 break;
1706
1707 i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1708 }
1709
1710 vm_finish_pageout(&vm_info); /* update vhand's stealscan */
1711
1712 vmemp_unlockx();
1713
1714 /*
1715 * If we wanted to wait for the I/O to complete, sleep on piocnt.
1716 * We must decrement it by one first, and then make sure that it
1717 * is non-zero before going to sleep.
1718 */
1719 vm_wait_for_io(&vm_info);
1720
1721 if (inode_changed && !file_is_remote) {
1722 imark(ip, IUPD | ICHG);
1723 iupdat(ip, 0, 0);
1724 }
1725 return 0;
1726}
1727
1728int
1729afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1730 struct vnode *filevp;
1731 off_t offset;
1732 kern_daddr_t *bn; /* Block number. */
1733 int flags; /* B_READ or B_WRITE */
1734 int *hole; /* To be used for read-ahead. */
1735 pgcnt_t *startidx; /* To be used for read-ahead. */
1736 pgcnt_t *endidx; /* To be used for read-ahead. */
1737{
1738 kern_daddr_t lbn, local_bn;
1739 int on;
1740 int err;
1741 long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
1742
1743 if (startidx)
1744 *startidx = (pgcnt_t) (offset / NBPG);
1745 if (endidx)
1746 *endidx = (pgcnt_t) (offset / NBPG);
1747 if (hole)
1748 *hole = 0; /* Can't have holes. */
1749 if (bsize <= 0)
1750 osi_Panic("afs_mapdbd: zero size");
1751
1752 lbn = (kern_daddr_t) (offset / bsize);
1753 on = offset % bsize;
1754
1755 err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1756 VASSERT(err == 0);
1757
1758 /*
1759 * We can never get a bn less than zero on remote files.
1760 */
1761 VASSERT(local_bn >= 0);
1762
1763 local_bn = local_bn + btodb(on);
1764 *bn = local_bn;
1765
1766 return (0);
1767}
1768
1769/*
1770 * Return values:
1771 * 1: The blocks are contiguous.
1772 * 0: The blocks are not contiguous.
1773 */
1774int
1775afs_vm_fscontiguous(vp, args, cur_data)
1776 struct vnode *vp;
1777 vfspage_t *args;
1778 u_int cur_data;
1779{
1780 if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
1781 return (1);
1782 } else {
1783 return (0);
1784 }
1785}
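/*
 * Worked example for the contiguity test above (illustrative, assuming
 * 4 KB pages and 512-byte device blocks, so btodb(NBPG) == 8): if the
 * current run ends at device block 120, the candidate page is
 * contiguous only when its starting block is 128; anything else ends
 * the run.
 */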
1786
1787/*
1788 * Return values:
1789 * 1: Stop, this page is the last in the block.
1790 * 0: Continue on
1791 * Terminate requests at filesystem block boundaries
1792 */
1793afs_vm_stopio(vp, args)
1794 struct vnode *vp;
1795 vfspage_t *args;
1796{
1797 fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1798
1799#if defined(AFS_HPUX1123_ENV)
1800 uint64_t tmpdb;
1801 tmpdb = VM_END_PAGEOUT_BLK(args);
1802
1803 if ((dbtob(tmpdb) + NBPG) % (fsdata->bsize) == 0)
1804#else
1805 if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0)
1806#endif /* AFS_HPUX1123_ENV */
1807 {
1808 return (1);
1809 } else {
1810 return (0);
1811 }
1812}
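/*
 * Worked example for the block-boundary test above (illustrative,
 * assuming NBPG == 4096 and an 8192-byte filesystem block): if the
 * current run ends at the device block whose byte address is 4096,
 * then dbtob(blk) + NBPG == 8192, which is a multiple of bsize, so the
 * run stops; a run ending at byte address 0 would continue, since
 * 0 + 4096 is only half a block.
 */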
1813
1814/*
1815 * afs_vm_checkpage is called by the VM while collecting a run of
1816 * pages on a pageout. afs_vm_checkpage() is called for each page
1817 * VM wants to write to disk.
1818 */
1819afs_vm_checkpage(vp, args, pgindx, cur_data)
1820 struct vnode *vp;
1821 vfspage_t *args;
1822 pgcnt_t pgindx;
1823 int cur_data;
1824{
1825 fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1826
1827 if (fsdata->remote_down) { /* never happens for AFS */
1828 /*
1829 * The remote system is down.
1830 */
1831 VASSERT(args->run == 0);
1832 return 1;
1833 }
1834 /*
1835 * A dirty page. If we have not yet determined the file size and
1836 * other attributes that we need to write out pages (the block
1837 * size and ok_dbd_limit), get that information now.
1838 */
1839 if (fsdata->bsize == 0) {
1840 k_off_t isize;
1841 long bsize;
1842 struct vattr va;
1843 struct vnode *filevp;
1844 /*
1845 * Get the various attributes about the file. Store them
1846 * in args for the next time around.
1847 */
1848 filevp = args->vp;
1849
1850 bsize = vtoblksz(filevp);
1851 args->maxpgs = (pgcnt_t) btop(bsize);
1852
1853 if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1854 /*
1855 * The VOP_GETATTR() failed. If we are vhand
1856 * and this is a hard mount, we will
1857 * skip dirty pages for a while and try again later.
1858 */
1859 if (args->vm_flags & PAGEOUT_VHAND) {
1860 VASSERT(args->run == 0);
1861 return 1;
1862 }
1863 /*
1864 * This is a "soft" mount, or some other error was
1865 * returned from the server. Mark this region
1866 * as a zombie, and free this dirty page.
1867 */
1868 VM_ZOMBIE_OBJECT(args);
1869
1870 /*
1871 * The caller will see r_zomb and remove the page
1872 * appropriately.
1873 */
1874 return (1);
1875 }
1876 isize = va.va_size;
1877 fsdata->isize = isize;
1878 fsdata->bsize = bsize;
1879 fsdata->remote = 1;
1880 }
1881 /*
1882 * See if the file has shrunk (this could have happened
1883 * asynchronously because of NFS or DUX). If so, invalidate
1884 * all of the pages past the end of the file. This is only
1885 * needed for remote files, as local files are truncated
1886 * synchronously.
1887 */
1888
1889 if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1890 /*
1891 * This page is past the end of the file. Unlock this page
1892 * (region_trunc will throw it away) and then call region_trunc()
1893 * to invalidate all pages past the new end of the file.
1894 */
1895 VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
1896 return (1);
1897 }
1898#ifdef notdef
1899 if ((args->vm_flags & PAGEOUT_VHAND)
1900 && (!(args->vm_flags & PAGEOUT_RESERVED))
1901 && (!(VM_IS_ZOMBIE(args)))) {
1902 VASSERT(args->run == 0);
1903 if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1904 /*
1905 * Got enough memory to pageout. Mark the fact that we did
1906 * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1907 * later (in remote_pageout()).
1908 */
1909 args->vm_flags |= PAGEOUT_RESERVED;
1910 } else {
1911 /*
1912 * We do not have enough memory to do this pageout. By
1913 * definition, we do not yet have a run, so we just unlock
1914 * this page and tell foreach_valid() to continue scanning.
1915 * If we come across another dirty page, we will try to
1916 * reserve memory again. That is okay, in fact some memory
1917 * may have freed up (as earlier pageouts complete under
1918 * interrupt).
1919 */
1920 return 1;
1921 }
1922 }
1923#endif
1924 return (0);
1925}
1926
1927afs_swapfs_len(bp)
1928 struct buf *bp;
1929{
1930 long fs_bsize;
1931 long max_size;
1932 long bnrem;
1933
1934 fs_bsize = vtoblksz(bp->b_vp);
1935 /*
1936 * Check to see if we are starting mid block. If so, then
1937 * we must return the remainder of the block or less depending
1938 * on the length.
1939 */
1940 bnrem = bp->b_offset % fs_bsize;
1941 if (bnrem) {
1942 max_size = fs_bsize - bnrem;
1943 } else {
1944 max_size = fs_bsize;
1945 }
1946
1947 if (bp->b_bcount > max_size) {
1948 return (max_size);
1949 } else {
1950 return (bp->b_bcount);
1951 }
1952}
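/*
 * Worked example for afs_swapfs_len() above (illustrative numbers):
 * with an 8192-byte filesystem block and b_offset == 6144, the request
 * starts 6144 bytes into a block, so at most 8192 - 6144 == 2048 bytes
 * may be transferred; a b_bcount smaller than that is returned
 * unchanged.
 */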
1953
1954afs_mmap(vp, off, size_bytes, access)
1955 struct vnode *vp;
1956 u_int off;
1957#if defined(AFS_HPUX1111_ENV)
1958 u_long size_bytes;
1959#else
1960 u_int size_bytes;
1961#endif
1962 int access;
1963{
1964 long bsize = vtoblksz(vp);
1965
1966 if (bsize % NBPG != 0) {
1967 return (EINVAL);
1968 }
1969
1970 return (0);
1971}
1972
1973afs_cachelimit(vp, len, location)
1974 struct vnode *vp;
1975 k_off_t len;
1976 int *location;
1977{
1978 /*
1979 * Disk addresses are logical, not physical, so fragments are
1980 * transparent.
1981 */
1982 *location = btorp(len) + 1;
1983}
1984
1985afs_release(vp)
1986 struct vnode *vp;
1987{
1988 return (0);
1989}
1990
1991int
1992afs_unmap(vp, off, size_bytes, access)
1993 struct vnode *vp;
1994 u_int off;
1995#if defined(AFS_HPUX1111_ENV)
1996 u_long size_bytes;
1997#else
1998 u_int size_bytes;
1999#endif
2000 int access;
2001{
2002 return 0;
2003}
2004
2005int
2006afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
2007 struct vnode *vp;
2008 preg_t *prp;
2009 int wrt;
2010 space_t space;
2011 caddr_t vaddr;
2012 pgcnt_t *rhead_cnt;
2013{
2014 printf("afs_read_ahead returning 0 \n");
2015 return 0;
2016}
2017
2018int
2019afs_prealloc(vp, size, ignore_minfree, reserved)
2020 struct vnode *vp;
2021 /* DEE - on 11.22 the following is off_t */
2022 size_t size;
2023 int ignore_minfree;
2024 int reserved;
2025{
2026 printf("afs_prealloc returning ENOSPC\n");
2027 return ENOSPC;
2028}
2029
2030int
2031afs_ioctl(vp, com, data, flag, cred)
2032 struct vnode *vp;
2033 int com;
2034 caddr_t data;
2035 int flag;
2036 struct ucred *cred;
2037{
2038 int error;
2039 struct afs_ioctl afsioctl, *ai;
2040
2041 AFS_STATCNT(afs_ioctl);
2042
2043 /* The call must be a VICEIOCTL call */
2044 if (((com >> 8) & 0xff) == 'V') {
2045#ifdef notdef
2046 /* AFS_COPYIN returns error 14. Copy data in instead */
2047 AFS_COPYIN(data, (caddr_t) & afsioctl, sizeof(afsioctl), error);
2048 if (error)
2049 return (error);
2050#endif
2051 ai = (struct afs_ioctl *)data;
2052 afsioctl.in = ai->in;
2053 afsioctl.out = ai->out;
2054 afsioctl.in_size = ai->in_size;
2055 afsioctl.out_size = ai->out_size;
2056 error = HandleIoctl(VTOAFS(vp), com, &afsioctl);
2057 return (error);
2058 }
2059 return (ENOTTY);
2060}
2061
2062#if defined(AFS_HPUX1111_ENV)
2063/* It looks like even if the application is 32-bit, we need to round to 8 bytes. */
2064/* This had no effect, so it must not be in use. */
2065
2066#define roundtoint(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2067#define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
2068 sizeof(u_int) + 2 * sizeof(u_short)))
2069#else
2070
2071#define roundtoint(x) (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
2072#define reclen(dp) roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
2073 2 * sizeof(u_short)))
2074#endif
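/*
 * Illustrative arithmetic for the macros above (sizes are assumptions
 * for a 64-bit HP-UX 11.11 kernel: sizeof(u_long) == 8,
 * sizeof(u_int) == 4, sizeof(u_short) == 2, sizeof(long) == 8): a
 * 5-character name gives 5 + 1 + 8 + 4 + 2*2 == 22 bytes, which
 * roundtoint() pads to 24 so that successive dirent records stay
 * aligned.
 */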
2075
2076int
2077afs_readdir(vp, uiop, cred)
2078 struct vnode *vp;
2079 struct uio *uiop;
2080 struct ucred *cred;
2081{
2082 struct uio auio;
2083 struct iovec aiov;
2084 caddr_t ibuf, obuf, ibufend, obufend;
2085 struct __dirent32 *idp;
2086 struct dirent *odp;
2087 int count, outcount;
2088 dir_off_t offset;
2089 uint64_t tmp_offset;
2090
2091 memset(&auio, 0, sizeof(auio));
2092 memset(&aiov, 0, sizeof(aiov));
2093
2094 count = uiop->uio_resid;
2095 /* Allocate temporary space for format conversion */
2096 ibuf = kmem_alloc(2 * count); /* overkill - fix later */
2097 obuf = kmem_alloc(count + sizeof(struct dirent));
2098 aiov.iov_base = ibuf;
2099 aiov.iov_len = count;
2100 auio.uio_iov = &aiov;
2101 auio.uio_iovcnt = 1;
2102 offset = auio.uio_offset = uiop->uio_offset;
2103 auio.uio_seg = UIOSEG_KERNEL;
2104 auio.uio_resid = count;
2105 auio.uio_fpflags = 0;
2106
2107 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2108 if (u.u_error)
2109 goto out;
2110
2111 /* Convert entries from __dirent32 to dirent format */
2112
2113 for (idp = (struct __dirent32 *)ibuf, odp =
2114 (struct dirent *)obuf, ibufend =
2115 ibuf + (count - auio.uio_resid), obufend = obuf + count;
2116 (caddr_t) idp < ibufend;
2117 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2118 (struct dirent *)((caddr_t) odp + odp->d_reclen)) {
2119 odp->d_ino = idp->__d_ino;
2120 odp->d_namlen = idp->__d_namlen;
2121 (void)strcpy(odp->d_name, idp->__d_name);
2122 odp->d_reclen = reclen(odp);
2123 if ((caddr_t) odp + odp->d_reclen > obufend)
2124 break;
2125 /* record offset *after* we're sure to use this entry */
2126 memcpy((char *)&tmp_offset, (char *)&idp->__d_off, sizeof tmp_offset);
2127 offset = tmp_offset;
2128 }
2129
2130 outcount = (caddr_t) odp - obuf;
2131 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2132 if (u.u_error)
2133 goto out;
2134 uiop->uio_offset = offset;
2135 out:
2136 kmem_free(ibuf, count);
2137 kmem_free(obuf, count + sizeof(struct dirent));
2138 return u.u_error;
2139}
2140
2141
2142#define roundtolong(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2143#define reclen_dirent64(dp) roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
2144 2 * sizeof(u_short)))
2145
2146int
2147afs_readdir3(vp, uiop, cred)
2148 struct vnode *vp;
2149 struct uio *uiop;
2150 struct ucred *cred;
2151{
2152 struct uio auio;
2153 struct iovec aiov;
2154 caddr_t ibuf, obuf, ibufend, obufend;
2155 struct __dirent32 *idp;
2156 struct __dirent64 *odp;
2157 int count, outcount;
2158 dir_off_t offset;
2159
2160 memset(&auio, 0, sizeof(auio));
2161 memset(&aiov, 0, sizeof(aiov));
2162
2163 count = uiop->uio_resid;
2164 /* Allocate temporary space for format conversion */
2165 ibuf = kmem_alloc(2 * count); /* overkill - fix later */
2166 obuf = kmem_alloc(count + sizeof(struct __dirent64));
2167 aiov.iov_base = ibuf;
2168 aiov.iov_len = count;
2169 auio.uio_iov = &aiov;
2170 auio.uio_iovcnt = 1;
2171 offset = auio.uio_offset = uiop->uio_offset;
2172 auio.uio_seg = UIOSEG_KERNEL;
2173 auio.uio_resid = count;
2174 auio.uio_fpflags = 0;
2175
2176 u.u_error = mp_afs_readdir2(vp, &auio, cred);
2177 if (u.u_error)
2178 goto out;
2179
2180 /* Convert entries from __dirent32 to __dirent64 format */
2181
2182 for (idp = (struct __dirent32 *)ibuf, odp =
2183 (struct __dirent64 *)obuf, ibufend =
2184 ibuf + (count - auio.uio_resid), obufend = obuf + count;
2185 (caddr_t) idp < ibufend;
2186 idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2187 (struct __dirent64 *)((caddr_t) odp + odp->__d_reclen)) {
2188 memcpy((char *)&odp->__d_off, (char *)&idp->__d_off,
2189 sizeof odp->__d_off);
2190 odp->__d_ino = idp->__d_ino;
2191 odp->__d_namlen = idp->__d_namlen;
2192 (void)strcpy(odp->__d_name, idp->__d_name);
2193 odp->__d_reclen = reclen_dirent64(odp);
2194 if ((caddr_t) odp + odp->__d_reclen > obufend)
2195 break;
2196 /* record offset *after* we're sure to use this entry */
2197 offset = odp->__d_off;
2198 }
2199
2200 outcount = (caddr_t) odp - obuf;
2201 AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2202 if (u.u_error)
2203 goto out;
2204 uiop->uio_offset = offset;
2205 out:
2206 kmem_free(ibuf, count);
2207 kmem_free(obuf, count + sizeof(struct __dirent64));
2208 return u.u_error;
2209}
2210
2211#define AFS_SV_SEMA_HASH 1
2212#define AFS_SV_SEMA_HASH_DEBUG 0
2213
2214#if AFS_SV_SEMA_HASH
2215/* This portion of the code was originally used to implement
2216 * thread-specific storage for the semaphore save area. However,
2217 * there were some spare fields in the proc structure, and these are
2218 * now being used for saving the semaphores. Hence, this portion of
2219 * the code is no longer used.
2220 */
2221
2222/* This portion of the code implements thread-specific information.
2223 * The thread id is passed in as the key. The semaphore save area
2224 * is hashed on this key.
2225 */
2226
2227/* Why is this hash table required?
2228 * The AFS code is written in such a way that a GLOCK() is done in
2229 * one function and the GUNLOCK() is done in another function further
2230 * down the call chain. The GLOCK() call has to save the current
2231 * semaphore status before acquiring afs_global_sema. The GUNLOCK()
2232 * has to release afs_global_sema and reacquire the semaphore status
2233 * that existed before the corresponding GLOCK(). If GLOCK() and
2234 * GUNLOCK() were called in the same function, the GLOCK() call could
2235 * have stored the saved semaphore status in a local variable and the
2236 * corresponding GUNLOCK() call could have restored the original
2237 * status from this local variable. But this is not the case with
2238 * AFS code. Hence, we have to implement a thread-specific semaphore
2239 * save area. This is implemented as a hash table, keyed by the
2240 * thread id.
2241 */
2242
2243/* In order for multithreaded processes to work, the sv_sema structures
2244 * must be saved on a per-thread basis, not a per-process basis. There
2245 * is no per-thread storage available to hijack in the OS per-thread
2246 * data structures (e.g. struct user), so we revive this code.
2247 * I removed the upper limit on memory consumption since we don't
2248 * know how many threads there will be. The code now first checks the
2249 * freeList; if that fails, it tries garbage collection, and if that
2250 * doesn't free up anything, it allocates what it needs.
2251 */
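/*
 * Minimal illustrative sketch (not the actual AFS_GLOCK/AFS_GUNLOCK
 * implementation): how a lock/unlock pair that is split across two
 * functions could use the per-thread save area implemented below.
 * The helper names and the way the thread id is obtained are
 * hypothetical; ELEMENT is sv_sema_t and KEY is tid_t, per the macros
 * that follow.
 */
#if 0
static void
example_glock_side(tid_t tid)	/* tid: the calling thread's id */
{
    sv_sema_t *save = afsHashInsertFind(tid);	/* create/hold the save area */
    /* record the current semaphore state in *save,
     * then acquire afs_global_sema */
}

static void
example_gunlock_side(tid_t tid)	/* called later, further down the call chain */
{
    sv_sema_t *save = afsHashFind(tid);		/* look up the held entry */
    /* release afs_global_sema and restore the state saved in *save */
    afsHashRelease(tid);			/* drop the reference */
}
#endif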
2252
2253#define ELEMENT sv_sema_t
2254#define KEY tid_t
2255#define Hash(xx) ( (xx) % sizeOfHashTable )
2256#define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
2257#define hashLock(xx) MP_PSEMA(&xx)
2258#define hashUnlock(xx) MP_VSEMA(&xx)
2259
2260typedef struct elem {
2261 struct elem *next;
2262 ELEMENT element;
2263 KEY key;
2264 int refCnt;
2265} Element;
2266
2267typedef struct bucket {
2268 sema_t lock;
2269 Element *element;
2270} Bucket;
2271
2272static int sizeOfHashTable;
2273static Bucket *hashTable;
2274
2275static int currentSize = 0;
2276static Element *freeList; /* free list */
2277
2278#pragma align 64
2279static sema_t afsHashLock = { 0 }; /* global lock for hash table */
2280
2281static void afsHashGarbageCollect();
2282
2283/*
2284** The global lock protects the global data structures,
2285** e.g. freeList and currentSize.
2286** The bucket lock protects the linked list hanging off that bucket.
2287** The lock hierarchy: one can obtain the bucket lock while holding
2288** the global lock, but not vice versa.
2289*/
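/*
 * Illustrative sketch of the permitted nesting (hypothetical helper,
 * not part of the original code): a bucket lock may be taken while the
 * global lock is held, but never the other way around.
 */
#if 0
static void
example_lock_order(int index)
{
    hashLock(afsHashLock);		/* global lock first ...           */
    hashLock(hashTable[index].lock);	/* ... then a bucket lock is legal */
    hashUnlock(hashTable[index].lock);
    hashUnlock(afsHashLock);
    /* Acquiring afsHashLock while already holding a bucket lock would
     * invert the hierarchy and risk deadlock. */
}
#endif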
2290
2291
2292void
2293afsHash(int nbuckets)
2294{ /* allocate the hash table */
2295 int i;
2296
2297#if AFS_SV_SEMA_HASH_DEBUG
2298 printf("afsHash: enter\n");
2299#endif
2300
2301 sizeOfHashTable = nbuckets;
2302 currentSize = nbuckets * sizeof(Bucket);
2303
2304 if (hashTable)
2305 osi_Panic("afs: SEMA Hashtable already created\n");
2306
2307 hashTable = (Bucket *) AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2308 if (!hashTable)
2309 osi_Panic("afs: cannot create SEMA Hashtable\n");
2310
2311 /* initialize the hash table and associated locks */
2312 memset(hashTable, 0, sizeOfHashTable * sizeof(Bucket));
2313 for (i = 0; i < sizeOfHashTable; i++)
2314 hashLockInit(hashTable[i].lock);
2315 hashLockInit(afsHashLock);
2316
2317#if AFS_SV_SEMA_HASH_DEBUG
2318 printf("afsHash: exit\n");
2319#endif
2320}
2321
2322ELEMENT *
2323afsHashInsertFind(KEY key)
2324{
2325 int index;
2326 Element *ptr;
2327
2328#if AFS_SV_SEMA_HASH_DEBUG
2329 printf("afsHashInsertFind: %d\n", key);
2330#endif
2331 if (!hashTable)
2332 osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2333
2334 index = Hash(key); /* get bucket number */
2335 hashLock(hashTable[index].lock); /* lock this bucket */
2336 ptr = hashTable[index].element;
2337
2338 /* if it is already there */
2339 while (ptr) {
2340 if (ptr->key == key) {
2341 ptr->refCnt++; /* hold it */
2342 hashUnlock(hashTable[index].lock);
2343#if AFS_SV_SEMA_HASH_DEBUG
2344 printf("afsHashInsertFind: %d FOUND\n", key);
2345#endif
2346 return &(ptr->element);
2347 } else {
2348 ptr = ptr->next;
2349 }
2350 }
2351
2352 hashUnlock(hashTable[index].lock);
2353
2354 /* if something exists in the freeList, take it from there */
2355 ptr = NULL;
2356 hashLock(afsHashLock);
2357
2358 if (freeList) {
2359 ptr = freeList; /* reuse entry */
2360 freeList = freeList->next;
2361 } else {
2362 afsHashGarbageCollect(); /* afsHashLock locked */
2363 if (freeList) {
2364 ptr = freeList; /* reuse entry */
2365 freeList = freeList->next;
2366 } else {
2367 ptr = (Element *) AFS_KALLOC(sizeof(Element));
2368 }
2369 }
2370
2371 currentSize += sizeof(Element); /* update memory used */
2372 hashUnlock(afsHashLock);
2373
2374 if (!ptr)
2375 osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2376 /* create new entry */
2377 ptr->key = key;
2378 memset(&ptr->element, 0, sizeof(ptr->element));
2379 ptr->refCnt = 1; /* this guy */
2380
2381 /* insert new entry in bucket */
2382 hashLock(hashTable[index].lock); /* lock this bucket */
2383 ptr->next = hashTable[index].element;
2384 hashTable[index].element = ptr;
2385 hashUnlock(hashTable[index].lock);
2386
2387#if AFS_SV_SEMA_HASH_DEBUG
2388 printf("afsHashInsertFind: %d MADE\n", key);
2389#endif
2390
2391 return &(ptr->element);
2392}
2393
2394ELEMENT *
2395afsHashFind(KEY key)
2396{
2397 int index;
2398 Element *ptr;
2399
2400#if AFS_SV_SEMA_HASH_DEBUG
2401 printf("afsHashFind: %d\n", key);
2402#endif
2403 if (!hashTable)
2404 osi_Panic("afs: afsHashFind: no hashTable\n");
2405
2406 index = Hash(key); /* get bucket number */
2407 hashLock(hashTable[index].lock); /* lock this bucket */
2408 ptr = hashTable[index].element;
2409
2410 /* it should be in the hash table */
2411 while (ptr) {
2412 if (ptr->key == key) {
2413 if (ptr->refCnt <= 0)
2414 osi_Panic("afs: SEMA HashTable entry already released\n");
2415 hashUnlock(hashTable[index].lock);
2416#if AFS_SV_SEMA_HASH_DEBUG
2417 printf("afsHashFind: %d FOUND\n", key);
2418#endif
2419 return &(ptr->element);
2420 } else {
2421 ptr = ptr->next;
2422 }
2423 }
2424
2425 hashUnlock(hashTable[index].lock);
2426 /* it had better be in the hash table */
2427 osi_Panic("afs: SEMA HashTable wants non-existent entry\n");
2428 return 0;
2429}
2430
2431void
2432afsHashRelease(KEY key)
2433{
2434 int index;
2435 Element *ptr;
2436
2437#if AFS_SV_SEMA_HASH_DEBUG
2438 printf("afsHashRelease: %d\n", key);
2439#endif
2440 if (!hashTable)
2441 osi_Panic("afs: afsHashRelease: no hashTable\n");
2442
2443 index = Hash(key); /* get bucket number */
2444 hashLock(hashTable[index].lock); /* lock this bucket */
2445 ptr = hashTable[index].element;
2446
2447 /* it should be in the hash table */
2448 while (ptr) {
2449 if (ptr->key == key) {
2450 if (ptr->refCnt <= 0)
2451 osi_Panic("afs: SEMA HashTable entry already released\n");
2452 ptr->refCnt--; /* release this guy */
2453 hashUnlock(hashTable[index].lock);
2454#if AFS_SV_SEMA_HASH_DEBUG
2455 printf("afsHashRelease: %d FOUND\n", key);
2456#endif
2457 return;
2458 } else {
2459 ptr = ptr->next;
2460 }
2461 }
2462
2463 hashUnlock(hashTable[index].lock);
2464 /* it had better be in the hash table */
2465 osi_Panic("afs: SEMA HashTable deleting non-existent entry\n");
2466}
2467
2468/* this should be called with afsHashLock WRITE locked */
2469static void
2470afsHashGarbageCollect()
2471{
2472 int index;
2473 Element *ptr;
2474 int foundFlag = 0;
2475
2476 if (!hashTable)
2477 osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2478
2479 for (index = 0; index < sizeOfHashTable; index++) {
2480 hashLock(hashTable[index].lock);
2481 ptr = hashTable[index].element; /* pick up bucket */
2482
2483 while (ptr && !ptr->refCnt) {
2484 /* insert this element into free list */
2485 Element *temp;
2486 temp = ptr->next;
2487 ptr->next = freeList;
2488 freeList = ptr;
2489
2490 foundFlag = 1; /* found at least one */
2491 currentSize -= sizeof(Element);
2492 ptr = temp;
2493 }
2494 hashTable[index].element = ptr;
2495
2496 /* scan thru the remaining list */
2497 if (ptr) {
2498 while (ptr->next) {
2499 if (ptr->next->refCnt == 0) {
2500 /* collect this element */
2501 Element *temp;
2502 temp = ptr->next;
2503 ptr->next = ptr->next->next;
2504 temp->next = freeList;
2505 freeList = temp;
2506 foundFlag = 1;
2507 currentSize -= sizeof(Element);
2508 } else {
2509 ptr = ptr->next;
2510 }
2511 }
2512 }
2513 hashUnlock(hashTable[index].lock);
2514 }
2515#if 0
2516 if (!foundFlag)
2517 osi_Panic("afs: SEMA HashTable full\n");
2518#endif
2519}
2520
2521#endif /* AFS_SV_SEMA_HASH */
2522
2523
2524afs_hp_strategy(bp)
2525 struct buf *bp;
2526{
2527 afs_int32 code;
2528 struct uio tuio;
2529 struct iovec tiovec[1];
2530 extern caddr_t hdl_kmap_bp();
2531 struct kthread *t = u.u_kthreadp;
2532
2533 memset(&tuio, 0, sizeof(tuio));
2534 memset(&tiovec, 0, sizeof(tiovec));
2535
2536 AFS_STATCNT(afs_hp_strategy);
2537 /*
2538 * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2539 * the I/O. We must save and restore the count because pageiodone()
2540 * uses b_bcount to determine how many pages to unlock.
2541 *
2542 * Remap the entire range.
2543 */
2544 hdl_kmap_bp(bp);
2545
2546 AFS_GLOCK();
2547 afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER, bp->b_vp,
2548 ICL_TYPE_LONG, (int)bp->b_blkno * DEV_BSIZE, ICL_TYPE_LONG,
2549 bp->b_bcount, ICL_TYPE_LONG, 0);
2550
2551 /* Set up the uio structure */
2552 tuio.afsio_iov = tiovec;
2553 tuio.afsio_iovcnt = 1;
2554 tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2555 tuio.afsio_seg = AFS_UIOSYS;
2556 tuio.afsio_resid = bp->b_bcount;
2557 tuio.uio_fpflags = 0;
2558 tiovec[0].iov_base = bp->b_un.b_addr;
2559 tiovec[0].iov_len = bp->b_bcount;
2560
2561 /* Do the I/O */
2562 if ((bp->b_flags & B_READ) == B_READ) {
2563 /* read b_bcount bytes into kernel address b_un.b_addr
2564 * starting at byte DEV_BSIZE * b_blkno. Bzero anything
2565 * we can't read, and finally call iodone(bp). File is
2566 * in bp->b_vp. Credentials are from u area??
2567 */
2568 code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_READ, 0, kt_cred(t));
2569 if (code == 0)
2570 if (tuio.afsio_resid > 0) {
2571 privlbzero(bvtospace(bp, bp->b_un.b_addr),
2572 bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2573 (size_t) tuio.afsio_resid);
2574
2575 }
2576 } else
2577 code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_WRITE, 0, kt_cred(t));
2578
2579 /* Remap back to the user's space */
2580 hdl_remap_bp(bp);
2581
2582 AFS_GUNLOCK();
2583
2584 iodone(bp);
2585 return code;
2586}
2587
2588afs_pathconf(vp, name, resultp, cred)
2589 struct vnode *vp;
2590 int name;
2591 int *resultp;
2592 struct ucred *cred; /* unused */
2593{
2594 switch (name) {
2595 case _PC_LINK_MAX: /* Maximum number of links to a file */
2596 *resultp = 255; /* an unsigned short on the fileserver */
2597 break; /* an unsigned char in the client.... */
2598
2599 case _PC_NAME_MAX: /* Max length of file name */
2600 *resultp = 255;
2601 break;
2602
2603 case _PC_PATH_MAX: /* Maximum length of Path Name */
2604 *resultp = 1024;
2605 break;
2606
2607 case _PC_PIPE_BUF: /* Max atomic write to pipe. See fifo_vnops */
2608 case _PC_CHOWN_RESTRICTED: /* Anybody can chown? */
2609 case _PC_NO_TRUNC: /* No file name truncation on overflow? */
2610 u.u_error = EOPNOTSUPP;
2611 return (EOPNOTSUPP);
2612 break;
2613
2614 case _PC_MAX_CANON: /* TTY buffer size for canonical input */
2615 /* need more work here for pty and ITE buffer sizes, if they differ */
2616 if (vp->v_type != VCHR) {
2617 u.u_error = EINVAL;
2618 return (EINVAL);
2619 }
2620 *resultp = CANBSIZ; /*for tty */
2621 break;
2622
2623 case _PC_MAX_INPUT:
2624 /* need more work here for pty and ITE buffer sizes, if they differ */
2625 if (vp->v_type != VCHR) { /* TTY buffer size */
2626 u.u_error = EINVAL;
2627 return (EINVAL);
2628 }
2629 *resultp = TTYHOG; /*for tty */
2630 break;
2631
2632 case _PC_VDISABLE:
2633 /* Terminal special characters can be disabled? */
2634 if (vp->v_type != VCHR) {
2635 u.u_error = EINVAL;
2636 return (EINVAL);
2637 }
2638 *resultp = 1;
2639 break;
2640
2641 case _PC_SYNC_IO:
2642 if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2643 *resultp = -1;
2644 return EINVAL;
2645 }
2646 *resultp = 1; /* Synchronized IO supported for this file */
2647 break;
2648
2649 case _PC_FILESIZEBITS:
2650 if (vp->v_type != VDIR)
2651 return (EINVAL);
2652 *resultp = MAX_SMALL_FILE_BITS;
2653 break;
2654
2655 default:
2656 return (EINVAL);
2657 }
2658
2659 return (0);
2660}