2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
20 #ifdef HAVE_SYS_FILE_H
24 #ifdef AFS_DEMAND_ATTACH_FS
27 #include <rx/rx_queue.h>
30 #include <afs/afsutil.h>
32 #include <afs/afsint.h>
36 #include "viceinode.h"
38 #include "partition.h"
39 #include <afs/errors.h>
41 #define __VOL_VG_CACHE_IMPL 1
44 #include "vg_cache_impl.h"
/*
 * Forward declarations of the static (file-local) helpers defined later in
 * this file, followed by the definition of the global VVGCache_hash_table
 * object.
 * NOTE(review): several parameter lines of these prototypes are absent from
 * this listing; compare each prototype against its definition before
 * relying on it.
 */
46 static int _VVGC_lookup(struct DiskPartition64
*,
48 VVGCache_entry_t
** entry
,
49 VVGCache_hash_entry_t
** hentry
);
50 static int _VVGC_entry_alloc(VVGCache_entry_t
** entry
);
51 static int _VVGC_entry_free(VVGCache_entry_t
* entry
);
52 static int _VVGC_entry_get(VVGCache_entry_t
* entry
);
53 static int _VVGC_entry_put(struct DiskPartition64
*,
54 VVGCache_entry_t
* entry
);
55 static int _VVGC_entry_add(struct DiskPartition64
*,
58 VVGCache_hash_entry_t
**);
59 static int _VVGC_entry_cl_add(VVGCache_entry_t
*, VolumeId
);
60 static int _VVGC_entry_cl_del(struct DiskPartition64
*, VVGCache_entry_t
*,
62 static int _VVGC_entry_export(VVGCache_entry_t
*, VVGCache_query_t
*);
63 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t
** entry
);
64 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t
* entry
);
65 static int _VVGC_hash_entry_add(struct DiskPartition64
*,
68 VVGCache_hash_entry_t
**);
69 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t
* entry
);
70 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t
* entry
);
72 VVGCache_hash_table_t VVGCache_hash_table
;
76 * initialize volume group cache subsystem.
78 * @return operation status
/*
 * Package initialization for the volume group cache.
 *
 * Steps visible here:
 *  - allocate VolumeHashTable.Size rx_queue bucket heads with malloc and
 *    store them in VVGCache_hash_table.hash_buckets (a NULL result is
 *    tested; the failure handling itself is not visible in this listing);
 *  - run queue_Init on every bucket head;
 *  - for every per-partition slot with index 0 through VOLMAXPARTS
 *    inclusive, set state to VVGC_PART_STATE_INVALID, clear
 *    dlist_hash_buckets, and initialize the slot condition variable.
 *
 * NOTE(review): the inclusive upper bound assumes VVGCache.part holds
 * VOLMAXPARTS + 1 elements; confirm against the declaration.
 */
82 VVGCache_PkgInit(void)
87 /* allocate hash table */
88 VVGCache_hash_table
.hash_buckets
=
89 malloc(VolumeHashTable
.Size
* sizeof(struct rx_queue
));
90 if (VVGCache_hash_table
.hash_buckets
== NULL
) {
95 /* setup hash chain heads */
96 for (i
= 0; i
< VolumeHashTable
.Size
; i
++) {
97 queue_Init(&VVGCache_hash_table
.hash_buckets
[i
]);
100 /* initialize per-partition VVGC state */
101 for (i
= 0; i
<= VOLMAXPARTS
; i
++) {
102 VVGCache
.part
[i
].state
= VVGC_PART_STATE_INVALID
;
103 VVGCache
.part
[i
].dlist_hash_buckets
= NULL
;
104 CV_INIT(&VVGCache
.part
[i
].cv
, "cache part", CV_DEFAULT
, 0);
115 * shut down volume group cache subsystem.
117 * @return operation status
/*
 * Package shutdown for the volume group cache.
 *
 * Frees the bucket array in VVGCache_hash_table.hash_buckets and resets the
 * pointer to NULL, then walks per-partition slots 0 through VOLMAXPARTS
 * inclusive, marking each VVGC_PART_STATE_INVALID and destroying its
 * condition variable.
 *
 * NOTE(review): nothing visible here releases hash entries that may still
 * be linked into the buckets before the array is freed; confirm whether
 * callers flush the partitions first.
 */
123 VVGCache_PkgShutdown(void)
129 /* free hash table */
130 free(VVGCache_hash_table
.hash_buckets
);
131 VVGCache_hash_table
.hash_buckets
= NULL
;
133 /* destroy per-partition VVGC state */
134 for (i
= 0; i
<= VOLMAXPARTS
; i
++) {
135 VVGCache
.part
[i
].state
= VVGC_PART_STATE_INVALID
;
136 CV_DESTROY(&VVGCache
.part
[i
].cv
);
143 * allocate a cache entry.
145 * @param[out] entry_out pointer to newly allocated entry
147 * @return operation status
/*
 * Allocate one zero-filled VVGCache_entry_t with calloc and store the
 * pointer in *entry_out. The result is compared against NULL; the action
 * taken on allocation failure is not visible in this listing.
 */
153 _VVGC_entry_alloc(VVGCache_entry_t
** entry_out
)
155 *entry_out
= calloc(1, sizeof(VVGCache_entry_t
));
157 if (*entry_out
== NULL
)
164 * free a cache entry.
166 * @param[in] entry cache entry
168 * @return operation status
/*
 * Release a cache entry. The only visible statement asserts that the entry
 * has no outstanding references (refcnt equal to 0) before it is released.
 * NOTE(review): the deallocation call and return value are not visible in
 * this listing.
 */
174 _VVGC_entry_free(VVGCache_entry_t
* entry
)
178 opr_Assert(entry
->refcnt
== 0);
185 * allocate and register an entry for a volume group.
187 * @param[in] dp disk partition object
188 * @param[in] volid volume id
189 * @param[out] entry_out vg cache object pointer
190 * @param[out] hash_out vg cache hash entry object pointer
192 * @pre - VOL_LOCK held
193 * - no such entry exists in hash table
195 * @return operation status
/*
 * Create a new volume group entry and register it in the hash table.
 *
 * Visible flow: allocate the entry with _VVGC_entry_alloc, then add a hash
 * entry for volid that points at it through _VVGC_hash_entry_add, which
 * also fills hash_out. No reference is taken here; per the in-body comment
 * the refcount is raised when a child is added.
 *
 * NOTE(review): the call to _VVGC_entry_free(ent) near the end looks like
 * the cleanup path for a failed hash insertion, but the surrounding
 * condition is not visible in this listing.
 */
201 _VVGC_entry_add(struct DiskPartition64
* dp
,
203 VVGCache_entry_t
** entry_out
,
204 VVGCache_hash_entry_t
** hash_out
)
207 VVGCache_entry_t
* ent
;
209 code
= _VVGC_entry_alloc(&ent
);
215 /* refcnt will be inc'd when a child is added */
218 code
= _VVGC_hash_entry_add(dp
, volid
, ent
, hash_out
);
230 _VVGC_entry_free(ent
);
237 * add a volid to the entry's child list.
239 * @param[in] ent volume group object
240 * @param[in] volid volume id
242 * @return operation status
244 * @retval -1 child table is full
/*
 * Add volid to the child table of a volume group entry.
 *
 * Visible flow:
 *  - scan all VOL_VG_MAX_VOLS slots; a slot already holding volid is
 *    logged at level 1 as a duplicate;
 *  - while empty_idx is still -1, a zero-valued slot is tested for
 *    (presumably recorded as the insertion point; the assignment is not
 *    visible in this listing);
 *  - the scan deliberately continues past the first free slot so that a
 *    duplicate later in the table is still detected;
 *  - if no free slot was found the table is full and this is logged at
 *    level 0;
 *  - otherwise volid is stored at children[empty_idx] and a reference is
 *    taken on the entry through _VVGC_entry_get.
 *
 * A zero slot value is treated as free, so volume id 0 cannot be stored.
 */
249 _VVGC_entry_cl_add(VVGCache_entry_t
* ent
,
255 /* search table to avoid duplicates */
256 for (i
= 0; i
< VOL_VG_MAX_VOLS
; i
++) {
257 if (ent
->children
[i
] == volid
) {
258 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
260 afs_printable_uint32_lu(volid
),
261 afs_printable_uint32_lu(ent
->rw
)));
264 if (empty_idx
== -1 && !ent
->children
[i
]) {
266 /* don't break; make sure we go through all children so we don't
267 * add a duplicate entry */
271 /* verify table isn't full */
272 if (empty_idx
== -1) {
274 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
275 "is full\n", afs_printable_uint32_lu(volid
),
276 afs_printable_uint32_lu(ent
->rw
)));
281 ent
->children
[empty_idx
] = volid
;
284 code
= _VVGC_entry_get(ent
);
291 * delete a volid from the entry's child list.
293 * @param[in] dp disk partition object
294 * @param[in] ent volume group object
295 * @param[in] volid volume id
297 * @return operation status
299 * @retval -1 no such entry found
/*
 * Remove volid from the child table of a volume group entry.
 *
 * Scans the VOL_VG_MAX_VOLS slots for one equal to volid and zeroes it,
 * then drops a reference on the entry through _VVGC_entry_put, passing dp
 * along for that call.
 *
 * NOTE(review): the statements that decide whether the put happens when no
 * matching slot is found are not visible in this listing.
 */
304 _VVGC_entry_cl_del(struct DiskPartition64
*dp
,
305 VVGCache_entry_t
* ent
,
310 for (i
= 0; i
< VOL_VG_MAX_VOLS
; i
++) {
311 if (ent
->children
[i
] == volid
) {
312 ent
->children
[i
] = 0;
320 code
= _VVGC_entry_put(dp
, ent
);
327 * add a refcount to an entry.
329 * @param[in] entry cache entry
333 * @return operation status
/*
 * Take a reference on a cache entry (per the doc comment that precedes this
 * definition). Only the signature is visible in this listing: it accepts
 * the entry pointer and returns an int status.
 */
338 static int _VVGC_entry_get(VVGCache_entry_t
* entry
)
345 * put back a reference to an entry.
347 * @param[in] dp disk partition object
348 * @param[in] entry cache entry
352 * @warning do not attempt to deref pointer after calling this interface
354 * @return operation status
357 * @note dp is needed to look up the RW hash entry to unlink, if we are
358 * putting back the final reference and freeing
/*
 * Drop one reference on a cache entry and tear it down when the count
 * reaches zero.
 *
 * Visible flow:
 *  - assert that refcnt is positive, then pre-decrement it;
 *  - when the count hits zero, look up the hash entry for the entry RW id
 *    (entry->rw) on partition dp;
 *  - if the lookup yields a different entry object, log an error;
 *  - otherwise unlink the RW hash entry with _VVGC_hash_entry_unlink;
 *  - an ENOENT lookup result is logged as a warning and tolerated;
 *  - finally release the entry with _VVGC_entry_free.
 *
 * The entry pointer must not be used by the caller after this returns,
 * since it may have been freed.
 *
 * NOTE(review): the exact branch conditions and the status codes assigned
 * in each branch are not visible in this listing.
 */
363 _VVGC_entry_put(struct DiskPartition64
* dp
, VVGCache_entry_t
* entry
)
367 opr_Assert(entry
->refcnt
> 0);
369 if (--entry
->refcnt
== 0) {
370 VVGCache_entry_t
*nentry
;
371 VVGCache_hash_entry_t
*hentry
;
373 /* first, try to delete the RW id hash entry pointing to this
375 code
= _VVGC_lookup(dp
, entry
->rw
, &nentry
, &hentry
);
377 if (nentry
!= entry
) {
378 /* looking up the rw of this entry points to a different
379 * entry; should not happen */
380 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
381 "found different entry than was passed",
382 afs_printable_uint32_lu(entry
->rw
)));
385 code
= _VVGC_hash_entry_unlink(hentry
);
388 } else if (code
== ENOENT
) {
389 /* ignore ENOENT; this shouldn't happen, since the RW hash
390 * entry should always exist if the entry does... but we
391 * were going to delete it anyway, so try to continue */
392 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
393 "vol %lu, but RW hash entry doesn't exist; continuing "
394 "anyway...\n", afs_printable_uint32_lu(entry
->rw
)));
399 /* now, just free the entry itself */
401 code
= _VVGC_entry_free(entry
);
409 * export a volume group entry in the external object format.
411 * @param[in] ent internal-format volume group object
412 * @param[out] qry external-format volume group object
416 * @return operation status
/*
 * Copy a cache entry into the external query structure.
 *
 * The visible loop copies all VOL_VG_MAX_VOLS child slots from ent to qry,
 * including slots that are zero.
 * NOTE(review): any other fields copied (for example the RW id) are not
 * visible in this listing.
 */
422 _VVGC_entry_export(VVGCache_entry_t
* ent
, VVGCache_query_t
* qry
)
427 for (i
= 0; i
< VOL_VG_MAX_VOLS
; i
++) {
428 qry
->children
[i
] = ent
->children
[i
];
435 * allocate a hash table entry structure.
437 * @param[out] entry_out address in which to store newly allocated hash entry struct
439 * @return operation status
/*
 * Allocate one VVGCache_hash_entry_t with malloc, storing the pointer both
 * in the local ent and in *entry_out. malloc does not zero the memory, so
 * the caller is expected to fill in the fields.
 * NOTE(review): the NULL check and return value are not visible in this
 * listing.
 */
445 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t
** entry_out
)
448 VVGCache_hash_entry_t
* ent
;
450 *entry_out
= ent
= malloc(sizeof(VVGCache_hash_entry_t
));
459 * free a hash table entry structure.
461 * @param[in] entry hash table entry structure to be freed
463 * @return operation status
/*
 * Release a hash table entry structure (per the doc comment that precedes
 * this definition). Only the signature is visible in this listing.
 */
469 _VVGC_hash_entry_free(VVGCache_hash_entry_t
* entry
)
479 * add an entry to the hash table.
481 * @param[in] dp disk partition object
482 * @param[in] volid volume id
483 * @param[in] ent volume group object
484 * @param[out] hash_out address in which to store pointer to hash entry
488 * @return operation status
490 * @retval EEXIST hash entry for volid already exists, and it points to
491 * a different VG entry
/*
 * Insert a hash entry mapping volid on partition dp to the volume group
 * entry ent.
 *
 * Visible flow:
 *  - compute the bucket index with VVGC_HASH(volid);
 *  - look up any existing mapping for (dp, volid); the lookup also writes
 *    hash_out;
 *  - an existing mapping to a different entry is logged at level 0 (the
 *    doc comment above lists EEXIST for this case);
 *  - an existing mapping to the same entry is logged at level 1 and
 *    accepted as if it had just been added;
 *  - otherwise a new hash entry is allocated with _VVGC_hash_entry_alloc
 *    and appended to the bucket queue.
 *
 * NOTE(review): the field initialization of the new hash entry and the
 * branch conditions are not visible in this listing.
 */
496 _VVGC_hash_entry_add(struct DiskPartition64
* dp
,
498 VVGCache_entry_t
* ent
,
499 VVGCache_hash_entry_t
** hash_out
)
502 VVGCache_hash_entry_t
* hent
;
503 int hash
= VVGC_HASH(volid
);
504 VVGCache_entry_t
*nent
;
506 code
= _VVGC_lookup(dp
, volid
, &nent
, hash_out
);
509 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
510 " nonmatching entry for vol %lu: original "
511 "(%"AFS_PTR_FMT
",%lu) new (%"AFS_PTR_FMT
",%lu)\n",
512 afs_printable_uint32_lu(volid
),
513 nent
, afs_printable_uint32_lu(nent
->rw
),
514 ent
, afs_printable_uint32_lu(ent
->rw
)));
517 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
518 "hash entry for vol %lu, VG %lu",
519 afs_printable_uint32_lu(volid
),
520 afs_printable_uint32_lu(ent
->rw
)));
521 /* accept attempts to add matching duplicate entries; just
522 * pretend we added it */
526 code
= _VVGC_hash_entry_alloc(&hent
);
534 queue_Append(&VVGCache_hash_table
.hash_buckets
[hash
],
545 * remove an entry from the hash table.
547 * @param[in] hent hash table entry
551 * @return operation status
/*
 * Remove a volume from the cache given its hash entry.
 *
 * Visible flow: test whether this hash entry is the RW id of its group
 * (hent->entry->rw equal to hent->volid), remove the volume id from the
 * group child list with _VVGC_entry_cl_del, and unlink the hash entry with
 * _VVGC_hash_entry_unlink. Per the in-body comments, the hash entry of the
 * RW id is kept so that children can still resolve the group, and the
 * final teardown of the RW hash entry is left to the child-list delete
 * path.
 *
 * NOTE(review): the statements that record the RW test result and guard
 * the unlink call are not visible in this listing.
 */
557 _VVGC_hash_entry_del(VVGCache_hash_entry_t
* hent
)
562 if (hent
->entry
->rw
== hent
->volid
) {
566 code
= _VVGC_entry_cl_del(hent
->dp
, hent
->entry
, hent
->volid
);
567 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
568 * if hent->entry->rw == hent->volid, it is possible for hent to
572 /* If we are the RW id, don't unlink, since we still need the
573 * hash entry to exist, so when we lookup children, they can
574 * look up the RW id hash chain, and they will all go to the
577 * If we are the last entry and the entry should be deleted,
578 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
580 res
= _VVGC_hash_entry_unlink(hent
);
590 * low-level interface to remove an entry from the hash table.
592 * Does not alter the refcount or worry about the children lists or
593 * anything like that; just removes the hash table entry, frees it, and
594 * that's all. You probably want @see _VVGC_hash_entry_del instead.
596 * @param[in] hent hash table entry
600 * @return operation status
/*
 * Low-level removal of a hash entry. The visible statement releases the
 * entry through _VVGC_hash_entry_free, so hent must not be used afterwards.
 * NOTE(review): the queue removal step described by the doc comment above
 * is not visible in this listing.
 */
606 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t
* hent
)
613 code
= _VVGC_hash_entry_free(hent
);
619 * look up a vg cache entry given any member volume id.
621 * @param[in] dp disk partition object
622 * @param[in] volid vg member volume id
623 * @param[out] entry_out address in which to store volume group entry structure pointer
624 * @param[out] hash_out address in which to store hash entry pointer
628 * @warning - it is up to the caller to get a ref to entry_out, if needed
629 * - hash_out must not be referenced after dropping VOL_LOCK
631 * @return operation status
633 * @retval ENOENT volume id not found
634 * @retval EINVAL partition's VGC is invalid
/*
 * Find the volume group entry that volid belongs to on partition dp.
 *
 * Visible flow:
 *  - compute the bucket index with VVGC_HASH(volid);
 *  - test whether the partition cache state is VVGC_PART_STATE_INVALID
 *    (the doc comment above lists EINVAL for that case);
 *  - scan the bucket queue for a hash entry whose volid and dp both match;
 *    both are compared because the table is shared by all partitions;
 *  - on a match, store the group entry pointer in *entry_out.
 *
 * NOTE(review): the handling of hash_out, the default value written to
 * *entry_out, and the returned codes are not visible in this listing.
 * Several callers pass NULL for hash_out.
 */
639 _VVGC_lookup(struct DiskPartition64
* dp
,
641 VVGCache_entry_t
** entry_out
,
642 VVGCache_hash_entry_t
** hash_out
)
645 int bucket
= VVGC_HASH(volid
);
646 struct VVGCache_hash_entry
* ent
, * nent
;
648 if (VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_INVALID
) {
654 for (queue_Scan(&VVGCache_hash_table
.hash_buckets
[bucket
],
657 VVGCache_hash_entry
)) {
658 if (ent
->volid
== volid
&& ent
->dp
== dp
) {
660 *entry_out
= ent
->entry
;
672 * add an entry to the volume group cache.
674 * @param[in] dp disk partition object
675 * @param[in] parent parent volume id
676 * @param[in] child child volume id
677 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
678 * new VG, 0 if we added to an existing VG
682 * @return operation status
684 * @retval -1 parent and child are already registered in
/*
 * Register child as a member of the volume group rooted at parent on
 * partition dp.
 *
 * Visible flow:
 *  - look up existing entries for child and for parent; a lookup failure
 *    other than ENOENT takes an early branch (its body is not visible);
 *  - both found: nothing to create; differing entries are reported as an
 *    error, and the parent == child case is special-cased so the RW id
 *    still reaches the child array;
 *  - only child found: the child entry becomes the group entry, its rw
 *    field is rewritten to parent, and a hash entry is added;
 *  - neither found: a new group entry is created with _VVGC_entry_add;
 *  - when child differs from parent, a hash entry for child is added
 *    pointing at the group entry; when they are equal that step is skipped
 *    because the hash entry was created along with the group entry;
 *  - child is inserted in the group child list with _VVGC_entry_cl_add;
 *  - a failure code other than EINVAL is logged;
 *  - on success while the partition is VVGC_PART_STATE_UPDATING, the pair
 *    is removed from the to-delete list with _VVGC_dlist_del_r, and a
 *    failure there other than ENOENT is logged.
 *
 * NOTE(review): the case where only the parent exists, the handling of the
 * newvg output, and the final return value are not visible in this listing.
 */
688 VVGCache_entry_add_r(struct DiskPartition64
* dp
,
694 VVGCache_entry_t
* child_ent
, * parent_ent
;
700 /* check for existing entries */
701 res
= _VVGC_lookup(dp
, child
, &child_ent
, NULL
);
702 if (res
&& res
!= ENOENT
) {
707 res
= _VVGC_lookup(dp
, parent
, &parent_ent
, NULL
);
708 if (res
&& res
!= ENOENT
) {
714 * branch based upon existence of parent and child nodes
716 if (parent_ent
&& child_ent
) {
717 /* both exist. we're done.
718 * if they point different places, then report the error. */
719 if (child_ent
!= parent_ent
) {
722 if (parent
== child
) {
723 /* if we're adding the RW entry as a child, the RW id may
724 * not be in the child array yet, so make sure not to skip
729 } else if (!parent_ent
&& child_ent
) {
731 * update vg root volid, and add hash entry. */
732 parent_ent
= child_ent
;
733 parent_ent
->rw
= parent
;
735 code
= _VVGC_hash_entry_add(dp
,
740 } else if (!child_ent
&& !parent_ent
) {
741 code
= _VVGC_entry_add(dp
,
751 if (child
== parent
) {
752 /* if we're the RW, skip over adding the child hash entry;
753 * we already added the hash entry when creating the entry */
754 child_ent
= parent_ent
;
759 opr_Assert(!child_ent
);
760 child_ent
= parent_ent
;
761 code
= _VVGC_hash_entry_add(dp
,
770 code
= _VVGC_entry_cl_add(child_ent
, child
);
773 if (code
&& code
!= EINVAL
) {
774 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
775 " %lu on partition %s", code
, afs_printable_uint32_lu(child
),
776 afs_printable_uint32_lu(parent
), VPartitionPath(dp
)));
779 if (code
== 0 && VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_UPDATING
) {
780 /* we successfully added the entry; make sure it's not on the
781 * to-delete list, so it doesn't get deleted later */
782 code
= _VVGC_dlist_del_r(dp
, parent
, child
);
783 if (code
&& code
!= ENOENT
) {
784 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
785 "%lu (parent %lu) from the to-delete list for part "
786 "%s.\n", code
, afs_printable_uint32_lu(child
),
787 afs_printable_uint32_lu(parent
),
788 VPartitionPath(dp
)));
798 * add an entry to the volume group cache.
800 * @param[in] dp disk partition object
801 * @param[in] parent parent volume id
802 * @param[in] child child volume id
803 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
804 * new VG, 0 if we added to an existing VG
806 * @return operation status
/*
 * Wrapper that forwards dp, parent, child and newvg unchanged to
 * VVGCache_entry_add_r.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
810 VVGCache_entry_add(struct DiskPartition64
* dp
,
818 VVGCache_entry_add_r(dp
, parent
, child
, newvg
);
825 * delete an entry from the volume group cache.
827 * If partition is scanning, actually puts the entry on a list of entries
828 * to delete when the scan is done.
830 * @param[in] dp disk partition object
831 * @param[in] parent parent volume id
832 * @param[in] child child volume id
836 * @return operation status
/*
 * Remove child from the volume group rooted at parent on partition dp.
 *
 * While the partition is in VVGC_PART_STATE_UPDATING the pair is queued
 * with _VVGC_dlist_add_r instead of being removed right away (the doc
 * comment above describes this as the list of entries to delete once the
 * scan is done). Otherwise the result of _VVGC_entry_purge_r is returned
 * directly.
 *
 * NOTE(review): the return statement of the deferred branch is not visible
 * in this listing.
 */
840 VVGCache_entry_del_r(struct DiskPartition64
* dp
,
841 VolumeId parent
, VolumeId child
)
843 if (VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_UPDATING
) {
845 code
= _VVGC_dlist_add_r(dp
, parent
, child
);
850 return _VVGC_entry_purge_r(dp
, parent
, child
);
854 * delete an entry from the volume group cache.
856 * @param[in] dp disk partition object
857 * @param[in] parent parent volume id
858 * @param[in] child child volume id
864 * @return operation status
/*
 * Immediately remove child from the volume group rooted at parent.
 *
 * Visible flow: look up the group entry for parent, then look up the group
 * entry and hash entry for child. If the two ids resolve to different
 * group entries the inconsistency is logged at level 0 with both pointers.
 * The child hash entry is removed with _VVGC_hash_entry_del.
 *
 * NOTE(review): the error checks after each lookup and the status code set
 * for the mismatch case are not visible in this listing.
 */
868 _VVGC_entry_purge_r(struct DiskPartition64
* dp
,
869 VolumeId parent
, VolumeId child
)
872 VVGCache_entry_t
* parent_ent
, * child_ent
;
873 VVGCache_hash_entry_t
* child_hent
;
875 /* check mappings for each volid */
876 res
= _VVGC_lookup(dp
, parent
, &parent_ent
, NULL
);
881 res
= _VVGC_lookup(dp
, child
, &child_ent
, &child_hent
);
887 /* if the mappings don't match, we have a serious error */
888 if (parent_ent
!= child_ent
) {
889 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
890 "but vol %lu points to VGC entry %"AFS_PTR_FMT
" and VG %lu "
891 "points to VGC entry %"AFS_PTR_FMT
"\n",
892 afs_printable_uint32_lu(child
),
893 afs_printable_uint32_lu(parent
),
894 afs_printable_uint32_lu(child
),
895 child_ent
, afs_printable_uint32_lu(parent
), parent_ent
));
900 code
= _VVGC_hash_entry_del(child_hent
);
907 * delete an entry from the volume group cache.
909 * @param[in] dp disk partition object
910 * @param[in] parent parent volume id
911 * @param[in] child child volume id
913 * @return operation status
/*
 * Wrapper that forwards dp, parent and child to VVGCache_entry_del_r and
 * keeps its status in code.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
917 VVGCache_entry_del(struct DiskPartition64
* dp
,
918 VolumeId parent
, VolumeId child
)
923 code
= VVGCache_entry_del_r(dp
, parent
, child
);
930 * query a volume group by any member volume id.
932 * @param[in] dp disk partition object
933 * @param[in] volume volume id of a member of VG
934 * @param[out] res vg membership data
938 * @return operation status
940 * @retval EAGAIN partition needs to finish scanning
/*
 * Answer a volume group membership query for any member volume id.
 *
 * Visible flow:
 *  - if the partition cache is VVGC_PART_STATE_INVALID, start a scan with
 *    VVGCache_scanStart_r; a result of 0 or -3 (another thread already
 *    started scanning) is treated alike;
 *  - if the partition cache is VVGC_PART_STATE_UPDATING the query cannot
 *    be answered yet (the doc comment above lists EAGAIN for a partition
 *    that still needs to finish scanning);
 *  - otherwise look up the group entry for volume and export it into res
 *    with _VVGC_entry_export.
 *
 * NOTE(review): the return statements of each branch and the check on the
 * lookup result before the export are not visible in this listing.
 */
943 VVGCache_query_r(struct DiskPartition64
* dp
,
945 VVGCache_query_t
* res
)
948 VVGCache_entry_t
* ent
;
950 /* If cache for this partition doesn't exist; start a scan */
951 if (VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_INVALID
) {
952 code
= VVGCache_scanStart_r(dp
);
953 if (code
== 0 || code
== -3) {
954 /* -3 means another thread already started scanning */
959 if (VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_UPDATING
) {
963 code
= _VVGC_lookup(dp
, volume
, &ent
, NULL
);
965 code
= _VVGC_entry_export(ent
, res
);
972 * query a volume group by any member volume id.
974 * @param[in] dp disk partition object
975 * @param[in] volume volume id of a member of VG
976 * @param[out] res vg membership data
978 * @return operation status
/*
 * Wrapper that forwards dp, volume and res to VVGCache_query_r and keeps
 * its status in code.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
982 VVGCache_query(struct DiskPartition64
* dp
,
983 VolumeId volume
, VVGCache_query_t
* res
)
988 code
= VVGCache_query_r(dp
, volume
, res
);
995 * begin asynchronous scan of on-disk volume group metadata.
997 * @param[in] dp disk partition object
1001 * @return operation status
/*
 * Start the asynchronous scan of on-disk volume group metadata.
 *
 * Two paths are visible: one calls _VVGC_scan_start for the given dp, the
 * other walks DiskPartitionList and calls _VVGC_scan_start for every
 * partition, reusing the dp parameter as the loop cursor.
 *
 * NOTE(review): the condition selecting between the two paths (presumably
 * whether dp is NULL) and the way per-partition results are combined are
 * not visible in this listing.
 */
1005 VVGCache_scanStart_r(struct DiskPartition64
* dp
)
1010 code
= _VVGC_scan_start(dp
);
1012 /* start a scanner thread on each partition */
1013 for (dp
= DiskPartitionList
; dp
; dp
= dp
->next
) {
1014 res
= _VVGC_scan_start(dp
);
1025 * begin asynchronous scan of on-disk volume group metadata.
1027 * @param[in] dp disk partition object
1029 * @return operation status
/*
 * Wrapper that forwards dp to VVGCache_scanStart_r and keeps its status in
 * code.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
1033 VVGCache_scanStart(struct DiskPartition64
* dp
)
1038 code
= VVGCache_scanStart_r(dp
);
1045 * wait for async on-disk VG metadata scan to complete.
1047 * @param[in] dp disk partition object
1049 * @pre VOL_LOCK held
1051 * @warning this routine must drop VOL_LOCK internally
1053 * @return operation status
/*
 * Block until the partition cache leaves VVGC_PART_STATE_UPDATING.
 *
 * The state is re-tested in a loop around VOL_CV_WAIT on the partition
 * condition variable, so a wakeup that arrives while the state is still
 * UPDATING simply waits again. The doc comment above warns that VOL_LOCK
 * is dropped internally while waiting.
 */
1057 VVGCache_scanWait_r(struct DiskPartition64
* dp
)
1061 while (VVGCache
.part
[dp
->index
].state
== VVGC_PART_STATE_UPDATING
) {
1062 VOL_CV_WAIT(&VVGCache
.part
[dp
->index
].cv
);
1069 * wait for async on-disk VG metadata scan to complete.
1071 * @param[in] dp disk partition object
1073 * @return operation status
/*
 * Wrapper that forwards dp to VVGCache_scanWait_r and keeps its status in
 * code.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
1077 VVGCache_scanWait(struct DiskPartition64
* dp
)
1082 code
= VVGCache_scanWait_r(dp
);
1089 * flush all cache entries for a given disk partition.
1091 * @param[in] part disk partition object
1093 * @pre VOL_LOCK held
1095 * @return operation status
/*
 * Remove every cache entry that belongs to the given partition.
 *
 * Walks all VolumeHashTable.Size buckets and scans each bucket queue with
 * a current and a next cursor (ent and nent), so entries can be removed
 * during the walk. For each hash entry whose dp matches part, the volume
 * id is copied to a local first, because the hash entry may be released by
 * _VVGC_hash_entry_del, and the copy is what the error log message prints.
 *
 * NOTE(review): the condition guarding the log call and the accumulated
 * return value are not visible in this listing.
 */
1101 _VVGC_flush_part_r(struct DiskPartition64
* part
)
1105 VVGCache_hash_entry_t
* ent
, * nent
;
1107 for (i
= 0; i
< VolumeHashTable
.Size
; i
++) {
1108 for (queue_Scan(&VVGCache_hash_table
.hash_buckets
[i
],
1111 VVGCache_hash_entry
)) {
1112 if (ent
->dp
== part
) {
1113 VolumeId volid
= ent
->volid
;
1114 res
= _VVGC_hash_entry_del(ent
);
1116 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1117 res
, afs_printable_uint32_lu(volid
)));
1128 * flush all cache entries for a given disk partition.
1130 * @param[in] part disk partition object
1132 * @return operation status
/*
 * Wrapper that forwards part to _VVGC_flush_part_r and keeps its status in
 * code.
 * NOTE(review): presumably this is the variant that acquires VOL_LOCK
 * around the call; the locking statements are not visible in this listing.
 */
1138 _VVGC_flush_part(struct DiskPartition64
* part
)
1143 code
= _VVGC_flush_part_r(part
);
1151 * change VVGC partition state.
1153 * @param[in] part disk partition object
1154 * @param[in] state new state
1156 * @pre VOL_LOCK is held
/*
 * Set the cache state of a partition and wake any waiters on a change.
 *
 * The previous state is captured before the new one is stored. The
 * partition condition variable is broadcast only when the state actually
 * changed, which is what releases threads looping in the scan-wait path.
 *
 * NOTE(review): the return statement is not visible in this listing;
 * old_state is presumably the value returned.
 */
1163 _VVGC_state_change(struct DiskPartition64
* part
,
1164 VVGCache_part_state_t state
)
1166 VVGCache_part_state_t old_state
;
1168 old_state
= VVGCache
.part
[part
->index
].state
;
1169 VVGCache
.part
[part
->index
].state
= state
;
1171 if (old_state
!= state
) {
1172 CV_BROADCAST(&VVGCache
.part
[part
->index
].cv
);
1178 #endif /* AFS_DEMAND_ATTACH_FS */