Import Upstream version 1.8.5
[hcoop/debian/openafs.git] / src / vol / clone.c
1 /*
2 * Copyright 2000, International Business Machines Corporation and others.
3 * All Rights Reserved.
4 *
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
8 */
9
10 /*
11 System: VICE-TWO
12 Module: clone.c
13
14 */
15
16 /* Clone a volume. Assumes the new volume is already created */
17
18 #include <afsconfig.h>
19 #include <afs/param.h>
20
21 #include <roken.h>
22
23 #ifdef AFS_NT40_ENV
24 #include <windows.h>
25 #include <winbase.h>
26 #endif
27
28 #include <rx/xdr.h>
29 #include <afs/afsint.h>
30 #include <afs/afssyscalls.h>
31 #include <rx/rx_queue.h>
32
33 #include "nfs.h"
34 #include "lwp.h"
35 #include "lock.h"
36 #include "ihandle.h"
37 #include "vnode.h"
38 #include "volume.h"
39 #include "partition.h"
40 #include "viceinode.h"
41 #include "vol_prototypes.h"
42 #include "common.h"
43
/*
 * Optional polling hook; it starts out unset and must be initialised by
 * whoever wants it called.
 */
int (*vol_PollProc)(void) = NULL;
45
/*
 * Record `code` in the enclosing function's local `error` variable and jump
 * to its `error_exit` label.  The enclosing function must provide both.
 */
#define ERROR_EXIT(code) \
    do { \
        error = (code); \
        goto error_exit; \
    } while (0)
50
51 /* parameters for idec call - this could just be an IHandle_t, but leaving
52 * open the possibility of decrementing the special files as well.
53 */
54 struct clone_rock {
55 IHandle_t *h;
56 VolId vol;
57 };
58
59 #define CLONE_MAXITEMS 100
60 struct clone_items {
61 struct clone_items *next;
62 afs_int32 nitems;
63 Inode data[CLONE_MAXITEMS];
64 };
65
/* Head of a chunked inode list; both pointers are null while it is empty. */
struct clone_head {
    struct clone_items *first;	/* oldest chunk, where traversal starts */
    struct clone_items *last;	/* newest chunk, where items are appended */
};
70
71 void CloneVolume(Error *, Volume *, Volume *, Volume *);
72
73 static int
74 ci_AddItem(struct clone_head *ah, Inode aino)
75 {
76 struct clone_items *ti;
77
78 /* if no last elt (first call) or last item full, get a new one */
79 if ((!ah->last) || ah->last->nitems >= CLONE_MAXITEMS) {
80 ti = malloc(sizeof(struct clone_items));
81 if (!ti) {
82 Log("ci_AddItem: malloc failed\n");
83 osi_Panic("ci_AddItem: malloc failed\n");
84 }
85 ti->nitems = 0;
86 ti->next = (struct clone_items *)0;
87 if (ah->last) {
88 ah->last->next = ti;
89 ah->last = ti;
90 } else {
91 /* first dude in the list */
92 ah->first = ah->last = ti;
93 }
94 } else
95 ti = ah->last;
96
97 /* now ti points to the end of the list, to a clone_item with room
98 * for at least one more element. Add it.
99 */
100 ti->data[ti->nitems++] = aino;
101 return 0;
102 }
103
104 /* initialize a clone header */
105 int
106 ci_InitHead(struct clone_head *ah)
107 {
108 memset(ah, 0, sizeof(*ah));
109 return 0;
110 }
111
112 /* apply a function to all dudes in the set */
113 int
114 ci_Apply(struct clone_head *ah, int (*aproc) (Inode, void *), void *arock)
115 {
116 struct clone_items *ti;
117 int i;
118
119 for (ti = ah->first; ti; ti = ti->next) {
120 for (i = 0; i < ti->nitems; i++) {
121 (*aproc) (ti->data[i], arock);
122 }
123 }
124 return 0;
125 }
126
127 /* free all dudes in the list */
128 int
129 ci_Destroy(struct clone_head *ah)
130 {
131 struct clone_items *ti, *ni;
132
133 for (ti = ah->first; ti; ti = ni) {
134 ni = ti->next; /* guard against freeing */
135 free(ti);
136 }
137 return 0;
138 }
139
140 static int
141 IDecProc(Inode adata, void *arock)
142 {
143 struct clone_rock *aparm = (struct clone_rock *)arock;
144 IH_DEC(aparm->h, adata, aparm->vol);
145 DOPOLL;
146 return 0;
147 }
148
149 afs_int32
150 DoCloneIndex(Volume * rwvp, Volume * clvp, VnodeClass class, int reclone)
151 {
152 afs_int32 code, error = 0;
153 FdHandle_t *rwFd = 0, *clFdIn = 0, *clFdOut = 0;
154 StreamHandle_t *rwfile = 0, *clfilein = 0, *clfileout = 0;
155 IHandle_t *rwH = 0, *clHin = 0, *clHout = 0;
156 char buf[SIZEOF_LARGEDISKVNODE], dbuf[SIZEOF_LARGEDISKVNODE];
157 struct VnodeDiskObject *rwvnode = (struct VnodeDiskObject *)buf;
158 struct VnodeDiskObject *clvnode = (struct VnodeDiskObject *)dbuf;
159 Inode rwinode = 0;
160 Inode clinode;
161 struct clone_head decHead;
162 struct clone_rock decRock;
163 afs_foff_t offset = 0;
164 afs_int32 dircloned, inodeinced;
165 afs_int32 filecount = 0, diskused = 0;
166 afs_ino_str_t stmp;
167
168 struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
169 /*
170 * The fileserver's -readonly switch should make this false, but we
171 * have no useful way to know in the volserver.
172 * This doesn't make client data mutable.
173 */
174 int ReadWriteOriginal = 1;
175
176 /* Correct number of files in volume: this assumes indexes are always
177 cloned starting with vLarge */
178 if (ReadWriteOriginal && class != vLarge) {
179 filecount = V_filecount(rwvp);
180 diskused = V_diskused(rwvp);
181 }
182
183 /* Initialize list of inodes to nuke - must do this before any calls
184 * to ERROR_EXIT, as the error handler requires an initialised list
185 */
186 ci_InitHead(&decHead);
187 decRock.h = V_linkHandle(rwvp);
188 decRock.vol = V_parentId(rwvp);
189
190 /* Open the RW volume's index file and seek to beginning */
191 IH_COPY(rwH, rwvp->vnodeIndex[class].handle);
192 rwFd = IH_OPEN(rwH);
193 if (!rwFd)
194 ERROR_EXIT(EIO);
195 rwfile = FDH_FDOPEN(rwFd, ReadWriteOriginal ? "r+" : "r");
196 if (!rwfile)
197 ERROR_EXIT(EIO);
198 STREAM_ASEEK(rwfile, vcp->diskSize); /* Will fail if no vnodes */
199
200 /* Open the clone volume's index file and seek to beginning */
201 IH_COPY(clHout, clvp->vnodeIndex[class].handle);
202 clFdOut = IH_OPEN(clHout);
203 if (!clFdOut)
204 ERROR_EXIT(EIO);
205 clfileout = FDH_FDOPEN(clFdOut, "a");
206 if (!clfileout)
207 ERROR_EXIT(EIO);
208 code = STREAM_ASEEK(clfileout, vcp->diskSize);
209 if (code)
210 ERROR_EXIT(EIO);
211
212 /* If recloning, open the new volume's index; this time for
213 * reading. We never read anything that we're simultaneously
214 * writing, so this all works.
215 */
216 if (reclone) {
217 IH_COPY(clHin, clvp->vnodeIndex[class].handle);
218 clFdIn = IH_OPEN(clHin);
219 if (!clFdIn)
220 ERROR_EXIT(EIO);
221 clfilein = FDH_FDOPEN(clFdIn, "r");
222 if (!clfilein)
223 ERROR_EXIT(EIO);
224 STREAM_ASEEK(clfilein, vcp->diskSize); /* Will fail if no vnodes */
225 }
226
227 /* Read each vnode in the old volume's index file */
228 for (offset = vcp->diskSize;
229 STREAM_READ(rwvnode, vcp->diskSize, 1, rwfile) == 1;
230 offset += vcp->diskSize) {
231 dircloned = inodeinced = 0;
232
233 /* If we are recloning the volume, read the corresponding vnode
234 * from the clone and determine its inode number.
235 */
236 if (reclone && !STREAM_EOF(clfilein)
237 && (STREAM_READ(clvnode, vcp->diskSize, 1, clfilein) == 1)) {
238 clinode = VNDISK_GET_INO(clvnode);
239 } else {
240 clinode = 0;
241 }
242
243 if (rwvnode->type != vNull) {
244 afs_fsize_t ll;
245
246 if (rwvnode->vnodeMagic != vcp->magic)
247 ERROR_EXIT(-1);
248 rwinode = VNDISK_GET_INO(rwvnode);
249 filecount++;
250 VNDISK_GET_LEN(ll, rwvnode);
251 diskused += nBlocks(ll);
252
253 /* Increment the inode if not already */
254 if (clinode && (clinode == rwinode)) {
255 clinode = 0; /* already cloned - don't delete later */
256 } else if (rwinode) {
257 if (IH_INC(V_linkHandle(rwvp), rwinode, V_parentId(rwvp)) ==
258 -1) {
259 Log("IH_INC failed: %"AFS_PTR_FMT", %s, %" AFS_VOLID_FMT " errno %d\n",
260 V_linkHandle(rwvp), PrintInode(stmp, rwinode),
261 afs_printable_VolumeId_lu(V_parentId(rwvp)), errno);
262 VForceOffline(rwvp);
263 ERROR_EXIT(EIO);
264 }
265 inodeinced = 1;
266 }
267
268 /* If a directory, mark vnode in old volume as cloned */
269 if ((rwvnode->type == vDirectory) && ReadWriteOriginal) {
270 #ifdef DVINC
271 /*
272 * It is my firmly held belief that immediately after
273 * copy-on-write, the two directories can be identical.
274 * If the new copy is changed (presumably, that is the very
275 * next thing that will happen) then the dataVersion will
276 * get bumped.
277 */
278 /* NOTE: the dataVersion++ is incredibly important!!!.
279 * This will cause the inode created by the file server
280 * on copy-on-write to be stamped with a dataVersion bigger
281 * than the current one. The salvager will then do the
282 * right thing */
283 rwvnode->dataVersion++;
284 #endif /* DVINC */
285 rwvnode->cloned = 1;
286 code = STREAM_ASEEK(rwfile, offset);
287 if (code == -1)
288 goto clonefailed;
289 code = STREAM_WRITE(rwvnode, vcp->diskSize, 1, rwfile);
290 if (code != 1)
291 goto clonefailed;
292 dircloned = 1;
293 code = STREAM_ASEEK(rwfile, offset + vcp->diskSize);
294 if (code == -1)
295 goto clonefailed;
296 #ifdef DVINC
297 rwvnode->dataVersion--; /* Really needs to be set to the value in the inode,
298 * for the read-only volume */
299 #endif /* DVINC */
300 }
301 }
302
303 /* Overwrite the vnode entry in the clone volume */
304 rwvnode->cloned = 0;
305 code = STREAM_WRITE(rwvnode, vcp->diskSize, 1, clfileout);
306 if (code != 1) {
307 clonefailed:
308 /* Couldn't clone, go back and decrement the inode's link count */
309 if (inodeinced) {
310 if (IH_DEC(V_linkHandle(rwvp), rwinode, V_parentId(rwvp)) ==
311 -1) {
312 Log("IH_DEC failed: %"AFS_PTR_FMT", %s, %" AFS_VOLID_FMT " errno %d\n",
313 V_linkHandle(rwvp), PrintInode(stmp, rwinode),
314 afs_printable_VolumeId_lu(V_parentId(rwvp)), errno);
315 VForceOffline(rwvp);
316 ERROR_EXIT(EIO);
317 }
318 }
319 /* And if the directory was marked clone, unmark it */
320 if (dircloned) {
321 rwvnode->cloned = 0;
322 if (STREAM_ASEEK(rwfile, offset) != -1)
323 (void)STREAM_WRITE(rwvnode, vcp->diskSize, 1, rwfile);
324 }
325 ERROR_EXIT(EIO);
326 }
327
328 /* Removal of the old cloned inode */
329 if (clinode) {
330 ci_AddItem(&decHead, clinode); /* just queue it */
331 }
332
333 DOPOLL;
334 }
335 if (STREAM_ERROR(clfileout))
336 ERROR_EXIT(EIO);
337
338 /* Clean out any junk at end of clone file */
339 if (reclone) {
340 STREAM_ASEEK(clfilein, offset);
341 while (STREAM_READ(clvnode, vcp->diskSize, 1, clfilein) == 1) {
342 if (clvnode->type != vNull && VNDISK_GET_INO(clvnode) != 0) {
343 ci_AddItem(&decHead, VNDISK_GET_INO(clvnode));
344 }
345 DOPOLL;
346 }
347 }
348
349 /* come here to finish up. If code is non-zero, we've already run into problems,
350 * and shouldn't do the idecs.
351 */
352 error_exit:
353 if (rwfile)
354 STREAM_CLOSE(rwfile);
355 if (clfilein)
356 STREAM_CLOSE(clfilein);
357 if (clfileout)
358 STREAM_CLOSE(clfileout);
359
360 if (rwFd)
361 FDH_CLOSE(rwFd);
362 if (clFdIn)
363 FDH_CLOSE(clFdIn);
364 if (clFdOut)
365 FDH_CLOSE(clFdOut);
366
367 if (rwH)
368 IH_RELEASE(rwH);
369 if (clHout)
370 IH_RELEASE(clHout);
371 if (clHin)
372 IH_RELEASE(clHin);
373
374 /* Next, we sync the disk. We have to reopen in case we're truncating,
375 * since we were using stdio above, and don't know when the buffers
376 * would otherwise be flushed. There's no stdio fftruncate call.
377 */
378 rwFd = IH_OPEN(clvp->vnodeIndex[class].handle);
379 if (rwFd == NULL) {
380 if (!error)
381 error = EIO;
382 } else {
383 if (reclone) {
384 /* If doing a reclone, we're keeping the clone. We need to
385 * truncate the file to offset bytes.
386 */
387 if (reclone && !error) {
388 error = FDH_TRUNC(rwFd, offset);
389 }
390 }
391 (void)FDH_SYNC(rwFd);
392 FDH_CLOSE(rwFd);
393 }
394
395 /* Now finally do the idec's. At this point, all potential
396 * references have been cleaned up and sent to the disk
397 * (see above fclose and fsync). No matter what happens, we
398 * no longer need to keep these references around.
399 */
400 code = ci_Apply(&decHead, IDecProc, (char *)&decRock);
401 if (!error)
402 error = code;
403 ci_Destroy(&decHead);
404
405 if (ReadWriteOriginal && filecount > 0)
406 V_filecount(rwvp) = filecount;
407 if (ReadWriteOriginal && diskused > 0)
408 V_diskused(rwvp) = diskused;
409 return error;
410 }
411
412 void
413 CloneVolume(Error * rerror, Volume * original, Volume * new, Volume * old)
414 {
415 afs_int32 code, error = 0;
416 afs_int32 reclone;
417 afs_int32 filecount = V_filecount(original), diskused = V_diskused(original);
418
419 *rerror = 0;
420 reclone = ((new == old) ? 1 : 0);
421
422 code = DoCloneIndex(original, new, vLarge, reclone);
423 if (code)
424 ERROR_EXIT(code);
425 code = DoCloneIndex(original, new, vSmall, reclone);
426 if (code)
427 ERROR_EXIT(code);
428 if (filecount != V_filecount(original) || diskused != V_diskused(original))
429 Log("Clone %" AFS_VOLID_FMT ": filecount %d -> %d diskused %d -> %d\n",
430 afs_printable_VolumeId_lu(V_id(original)), filecount,
431 V_filecount(original), diskused, V_diskused(original));
432
433 code = CopyVolumeHeader(&V_disk(original), &V_disk(new));
434 if (code)
435 ERROR_EXIT(code);
436
437 error_exit:
438 *rerror = error;
439 }