Allow 'zpool replace' to use short device names
[zfs.git] / cmd / zpool / zpool_vdev.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25
26 /*
27  * Functions to convert between a list of vdevs and an nvlist representing the
28  * configuration.  Each entry in the list can be one of:
29  *
30  *      Device vdevs
31  *              disk=(path=..., devid=...)
32  *              file=(path=...)
33  *
34  *      Group vdevs
35  *              raidz[1|2]=(...)
36  *              mirror=(...)
37  *
38  *      Hot spares
39  *
40  * While the underlying implementation supports it, group vdevs cannot contain
41  * other group vdevs.  All userland verification of devices is contained within
42  * this file.  If successful, the nvlist returned can be passed directly to the
43  * kernel; we've done as much verification as possible in userland.
44  *
45  * Hot spares are a special case, and passed down as an array of disk vdevs, at
46  * the same level as the root of the vdev tree.
47  *
48  * The only function exported by this file is 'make_root_vdev'.  The
49  * function performs several passes:
50  *
51  *      1. Construct the vdev specification.  Performs syntax validation and
52  *         makes sure each device is valid.
53  *      2. Check for devices in use.  Using libblkid to make sure that no
54  *         devices are also in use.  Some can be overridden using the 'force'
55  *         flag, others cannot.
56  *      3. Check for replication errors if the 'force' flag is not specified.
57  *         validates that the replication level is consistent across the
58  *         entire pool.
59  *      4. Call libzfs to label any whole disks with an EFI label.
60  */
61
62 #include <assert.h>
63 #include <ctype.h>
64 #include <devid.h>
65 #include <errno.h>
66 #include <fcntl.h>
67 #include <libintl.h>
68 #include <libnvpair.h>
69 #include <limits.h>
70 #include <stdio.h>
71 #include <string.h>
72 #include <unistd.h>
73 #include <sys/efi_partition.h>
74 #include <sys/stat.h>
75 #include <sys/vtoc.h>
76 #include <sys/mntent.h>
77 #include <uuid/uuid.h>
78 #ifdef HAVE_LIBBLKID
79 #include <blkid/blkid.h>
80 #else
81 #define blkid_cache void *
82 #endif /* HAVE_LIBBLKID */
83
84 #include "zpool_util.h"
85
/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
 */
/* Set to B_TRUE by vdev_error() once the header has been printed. */
boolean_t error_seen;
/* Selects which hint vdev_error() prints in the header ('-f' vs. manual repair). */
boolean_t is_force;
93
94 /*PRINTFLIKE1*/
95 static void
96 vdev_error(const char *fmt, ...)
97 {
98         va_list ap;
99
100         if (!error_seen) {
101                 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
102                 if (!is_force)
103                         (void) fprintf(stderr, gettext("use '-f' to override "
104                             "the following errors:\n"));
105                 else
106                         (void) fprintf(stderr, gettext("the following errors "
107                             "must be manually repaired:\n"));
108                 error_seen = B_TRUE;
109         }
110
111         va_start(ap, fmt);
112         (void) vfprintf(stderr, fmt, ap);
113         va_end(ap);
114 }
115
116 /*
117  * Check that a file is valid.  All we can do in this case is check that it's
118  * not in use by another pool, and not in use by swap.
119  */
120 static int
121 check_file(const char *file, boolean_t force, boolean_t isspare)
122 {
123         char  *name;
124         int fd;
125         int ret = 0;
126         pool_state_t state;
127         boolean_t inuse;
128
129         if ((fd = open(file, O_RDONLY)) < 0)
130                 return (0);
131
132         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
133                 const char *desc;
134
135                 switch (state) {
136                 case POOL_STATE_ACTIVE:
137                         desc = gettext("active");
138                         break;
139
140                 case POOL_STATE_EXPORTED:
141                         desc = gettext("exported");
142                         break;
143
144                 case POOL_STATE_POTENTIALLY_ACTIVE:
145                         desc = gettext("potentially active");
146                         break;
147
148                 default:
149                         desc = gettext("unknown");
150                         break;
151                 }
152
153                 /*
154                  * Allow hot spares to be shared between pools.
155                  */
156                 if (state == POOL_STATE_SPARE && isspare)
157                         return (0);
158
159                 if (state == POOL_STATE_ACTIVE ||
160                     state == POOL_STATE_SPARE || !force) {
161                         switch (state) {
162                         case POOL_STATE_SPARE:
163                                 vdev_error(gettext("%s is reserved as a hot "
164                                     "spare for pool %s\n"), file, name);
165                                 break;
166                         default:
167                                 vdev_error(gettext("%s is part of %s pool "
168                                     "'%s'\n"), file, desc, name);
169                                 break;
170                         }
171                         ret = -1;
172                 }
173
174                 free(name);
175         }
176
177         (void) close(fd);
178         return (ret);
179 }
180
/*
 * Print a warning to stderr saying that the in-use check could not be
 * performed.  'err' is rendered with strerror().
 */
static void
check_error(int err)
{
        const char *reason = strerror(err);

        (void) fprintf(stderr, gettext("warning: device in use checking "
            "failed: %s\n"), reason);
}
187
188 static int
189 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
190 {
191         int err;
192 #ifdef HAVE_LIBBLKID
193         char *value;
194
195         /* No valid type detected device is safe to use */
196         value = blkid_get_tag_value(cache, "TYPE", path);
197         if (value == NULL)
198                 return (0);
199
200         /*
201          * If libblkid detects a ZFS device, we check the device
202          * using check_file() to see if it's safe.  The one safe
203          * case is a spare device shared between multiple pools.
204          */
205         if (strcmp(value, "zfs") == 0) {
206                 err = check_file(path, force, isspare);
207         } else {
208                 if (force) {
209                         err = 0;
210                 } else {
211                         err = -1;
212                         vdev_error(gettext("%s contains a filesystem of "
213                                    "type '%s'\n"), path, value);
214                 }
215         }
216
217         free(value);
218 #else
219         err = check_file(path, force, isspare);
220 #endif /* HAVE_LIBBLKID */
221
222         return (err);
223 }
224
225 /*
226  * Validate a whole disk.  Iterate over all slices on the disk and make sure
227  * that none is in use by calling check_slice().
228  */
229 static int
230 check_disk(const char *path, blkid_cache cache, int force,
231            boolean_t isspare, boolean_t iswholedisk)
232 {
233         struct dk_gpt *vtoc;
234         char slice_path[MAXPATHLEN];
235         int err = 0;
236         int fd, i;
237
238         /* This is not a wholedisk we only check the given partition */
239         if (!iswholedisk)
240                 return check_slice(path, cache, force, isspare);
241
242         /*
243          * When the device is a whole disk try to read the efi partition
244          * label.  If this is successful we safely check the all of the
245          * partitions.  However, when it fails it may simply be because
246          * the disk is partitioned via the MBR.  Since we currently can
247          * not easily decode the MBR return a failure and prompt to the
248          * user to use force option since we cannot check the partitions.
249          */
250         if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) {
251                 check_error(errno);
252                 return -1;
253         }
254
255         if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
256                 (void) close(fd);
257
258                 if (force) {
259                         return 0;
260                 } else {
261                         vdev_error(gettext("%s does not contain an EFI "
262                             "label but it may contain partition\n"
263                             "information in the MBR.\n"), path);
264                         return -1;
265                 }
266         }
267
268         /*
269          * The primary efi partition label is damaged however the secondary
270          * label at the end of the device is intact.  Rather than use this
271          * label we should play it safe and treat this as a non efi device.
272          */
273         if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
274                 efi_free(vtoc);
275                 (void) close(fd);
276
277                 if (force) {
278                         /* Partitions will no be created using the backup */
279                         return 0;
280                 } else {
281                         vdev_error(gettext("%s contains a corrupt primary "
282                             "EFI label.\n"), path);
283                         return -1;
284                 }
285         }
286
287         for (i = 0; i < vtoc->efi_nparts; i++) {
288
289                 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
290                     uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
291                         continue;
292
293                 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
294                         (void) snprintf(slice_path, sizeof (slice_path),
295                             "%s%s%d", path, "-part", i+1);
296                 else
297                         (void) snprintf(slice_path, sizeof (slice_path),
298                             "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
299                             "p" : "", i+1);
300
301                 err = check_slice(slice_path, cache, force, isspare);
302                 if (err)
303                         break;
304         }
305
306         efi_free(vtoc);
307         (void) close(fd);
308
309         return (err);
310 }
311
312 static int
313 check_device(const char *path, boolean_t force,
314              boolean_t isspare, boolean_t iswholedisk)
315 {
316         static blkid_cache cache = NULL;
317
318 #ifdef HAVE_LIBBLKID
319         /*
320          * There is no easy way to add a correct blkid_put_cache() call,
321          * memory will be reclaimed when the command exits.
322          */
323         if (cache == NULL) {
324                 int err;
325
326                 if ((err = blkid_get_cache(&cache, NULL)) != 0) {
327                         check_error(err);
328                         return -1;
329                 }
330
331                 if ((err = blkid_probe_all(cache)) != 0) {
332                         blkid_put_cache(cache);
333                         check_error(err);
334                         return -1;
335                 }
336         }
337 #endif /* HAVE_LIBBLKID */
338
339         return check_disk(path, cache, force, isspare, iswholedisk);
340 }
341
342 /*
343  * By "whole disk" we mean an entire physical disk (something we can
344  * label, toggle the write cache on, etc.) as opposed to the full
345  * capacity of a pseudo-device such as lofi or did.  We act as if we
346  * are labeling the disk, which should be a pretty good test of whether
347  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
348  * it isn't.
349  */
350 static boolean_t
351 is_whole_disk(const char *path)
352 {
353         struct dk_gpt *label;
354         int     fd;
355
356         if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0)
357                 return (B_FALSE);
358         if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
359                 (void) close(fd);
360                 return (B_FALSE);
361         }
362         efi_free(label);
363         (void) close(fd);
364         return (B_TRUE);
365 }
366
367 /*
368  * This may be a shorthand device path or it could be total gibberish.
369  * Check to see if it is a known device available in zfs_vdev_paths.
370  * As part of this check, see if we've been given an entire disk
371  * (minus the slice number).
372  */
373 static int
374 is_shorthand_path(const char *arg, char *path,
375                   struct stat64 *statbuf, boolean_t *wholedisk)
376 {
377         int error;
378
379         error = zfs_resolve_shortname(arg, path, MAXPATHLEN);
380         if (error == 0) {
381                 *wholedisk = is_whole_disk(path);
382                 if (*wholedisk || (stat64(path, statbuf) == 0))
383                         return (0);
384         }
385
386         strlcpy(path, arg, sizeof(path));
387         memset(statbuf, 0, sizeof(*statbuf));
388         *wholedisk = B_FALSE;
389
390         return (error);
391 }
392
393 /*
394  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
395  * device, fill in the device id to make a complete nvlist.  Valid forms for a
396  * leaf vdev are:
397  *
398  *      /dev/xxx        Complete disk path
399  *      /xxx            Full path to file
400  *      xxx             Shorthand for <zfs_vdev_paths>/xxx
401  */
402 static nvlist_t *
403 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
404 {
405         char path[MAXPATHLEN];
406         struct stat64 statbuf;
407         nvlist_t *vdev = NULL;
408         char *type = NULL;
409         boolean_t wholedisk = B_FALSE;
410         int err;
411
412         /*
413          * Determine what type of vdev this is, and put the full path into
414          * 'path'.  We detect whether this is a device of file afterwards by
415          * checking the st_mode of the file.
416          */
417         if (arg[0] == '/') {
418                 /*
419                  * Complete device or file path.  Exact type is determined by
420                  * examining the file descriptor afterwards.  Symbolic links
421                  * are resolved to their real paths for the is_whole_disk()
422                  * and S_ISBLK/S_ISREG type checks.  However, we are careful
423                  * to store the given path as ZPOOL_CONFIG_PATH to ensure we
424                  * can leverage udev's persistent device labels.
425                  */
426                 if (realpath(arg, path) == NULL) {
427                         (void) fprintf(stderr,
428                             gettext("cannot resolve path '%s'\n"), arg);
429                         return (NULL);
430                 }
431
432                 wholedisk = is_whole_disk(path);
433                 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
434                         (void) fprintf(stderr,
435                             gettext("cannot open '%s': %s\n"),
436                             path, strerror(errno));
437                         return (NULL);
438                 }
439
440                 /* After is_whole_disk() check restore original passed path */
441                 strlcpy(path, arg, MAXPATHLEN);
442         } else {
443                 err = is_shorthand_path(arg, path, &statbuf, &wholedisk);
444                 if (err != 0) {
445                         /*
446                          * If we got ENOENT, then the user gave us
447                          * gibberish, so try to direct them with a
448                          * reasonable error message.  Otherwise,
449                          * regurgitate strerror() since it's the best we
450                          * can do.
451                          */
452                         if (err == ENOENT) {
453                                 (void) fprintf(stderr,
454                                     gettext("cannot open '%s': no such "
455                                     "device in %s\n"), arg, DISK_ROOT);
456                                 (void) fprintf(stderr,
457                                     gettext("must be a full path or "
458                                     "shorthand device name\n"));
459                                 return (NULL);
460                         } else {
461                                 (void) fprintf(stderr,
462                                     gettext("cannot open '%s': %s\n"),
463                                     path, strerror(errno));
464                                 return (NULL);
465                         }
466                 }
467         }
468
469         /*
470          * Determine whether this is a device or a file.
471          */
472         if (wholedisk || S_ISBLK(statbuf.st_mode)) {
473                 type = VDEV_TYPE_DISK;
474         } else if (S_ISREG(statbuf.st_mode)) {
475                 type = VDEV_TYPE_FILE;
476         } else {
477                 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
478                     "block device or regular file\n"), path);
479                 return (NULL);
480         }
481
482         /*
483          * Finally, we have the complete device or file, and we know that it is
484          * acceptable to use.  Construct the nvlist to describe this vdev.  All
485          * vdevs have a 'path' element, and devices also have a 'devid' element.
486          */
487         verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
488         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
489         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
490         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
491         if (strcmp(type, VDEV_TYPE_DISK) == 0)
492                 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
493                     (uint64_t)wholedisk) == 0);
494
495         if (props != NULL) {
496                 uint64_t ashift = 0;
497                 char *value = NULL;
498
499                 if (nvlist_lookup_string(props,
500                     zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0)
501                         zfs_nicestrtonum(NULL, value, &ashift);
502
503                 if (ashift > 0)
504                         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT,
505                             ashift) == 0);
506         }
507
508         return (vdev);
509 }
510
511 /*
512  * Go through and verify the replication level of the pool is consistent.
513  * Performs the following checks:
514  *
515  *      For the new spec, verifies that devices in mirrors and raidz are the
516  *      same size.
517  *
518  *      If the current configuration already has inconsistent replication
519  *      levels, ignore any other potential problems in the new spec.
520  *
521  *      Otherwise, make sure that the current spec (if there is one) and the new
522  *      spec have consistent replication levels.
523  */
/*
 * Replication summary of one toplevel vdev, as filled in by
 * get_replication().
 */
typedef struct replication_level {
        /* ZPOOL_CONFIG_TYPE string of the toplevel vdev */
        char *zprl_type;
        /* number of children; 1 for a plain file or disk vdev */
        uint64_t zprl_children;
        /* ZPOOL_CONFIG_NPARITY for raidz vdevs, 0 for everything else */
        uint64_t zprl_parity;
} replication_level_t;

/*
 * Largest size difference in bytes (16MB) that get_replication() tolerates
 * between children of the same vdev before reporting mismatched sizes.
 */
#define ZPOOL_FUZZ      (16 * 1024 * 1024)
531
532 /*
533  * Given a list of toplevel vdevs, return the current replication level.  If
534  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
535  * an error message will be displayed for each self-inconsistent vdev.
536  */
537 static replication_level_t *
538 get_replication(nvlist_t *nvroot, boolean_t fatal)
539 {
540         nvlist_t **top;
541         uint_t t, toplevels;
542         nvlist_t **child;
543         uint_t c, children;
544         nvlist_t *nv;
545         char *type;
546         replication_level_t lastrep = { 0 }, rep, *ret;
547         boolean_t dontreport;
548
549         ret = safe_malloc(sizeof (replication_level_t));
550
551         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
552             &top, &toplevels) == 0);
553
554         lastrep.zprl_type = NULL;
555         for (t = 0; t < toplevels; t++) {
556                 uint64_t is_log = B_FALSE;
557
558                 nv = top[t];
559
560                 /*
561                  * For separate logs we ignore the top level vdev replication
562                  * constraints.
563                  */
564                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
565                 if (is_log)
566                         continue;
567
568                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
569                     &type) == 0);
570                 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
571                     &child, &children) != 0) {
572                         /*
573                          * This is a 'file' or 'disk' vdev.
574                          */
575                         rep.zprl_type = type;
576                         rep.zprl_children = 1;
577                         rep.zprl_parity = 0;
578                 } else {
579                         uint64_t vdev_size;
580
581                         /*
582                          * This is a mirror or RAID-Z vdev.  Go through and make
583                          * sure the contents are all the same (files vs. disks),
584                          * keeping track of the number of elements in the
585                          * process.
586                          *
587                          * We also check that the size of each vdev (if it can
588                          * be determined) is the same.
589                          */
590                         rep.zprl_type = type;
591                         rep.zprl_children = 0;
592
593                         if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
594                                 verify(nvlist_lookup_uint64(nv,
595                                     ZPOOL_CONFIG_NPARITY,
596                                     &rep.zprl_parity) == 0);
597                                 assert(rep.zprl_parity != 0);
598                         } else {
599                                 rep.zprl_parity = 0;
600                         }
601
602                         /*
603                          * The 'dontreport' variable indicates that we've
604                          * already reported an error for this spec, so don't
605                          * bother doing it again.
606                          */
607                         type = NULL;
608                         dontreport = 0;
609                         vdev_size = -1ULL;
610                         for (c = 0; c < children; c++) {
611                                 nvlist_t *cnv = child[c];
612                                 char *path;
613                                 struct stat64 statbuf;
614                                 uint64_t size = -1ULL;
615                                 char *childtype;
616                                 int fd, err;
617
618                                 rep.zprl_children++;
619
620                                 verify(nvlist_lookup_string(cnv,
621                                     ZPOOL_CONFIG_TYPE, &childtype) == 0);
622
623                                 /*
624                                  * If this is a replacing or spare vdev, then
625                                  * get the real first child of the vdev.
626                                  */
627                                 if (strcmp(childtype,
628                                     VDEV_TYPE_REPLACING) == 0 ||
629                                     strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
630                                         nvlist_t **rchild;
631                                         uint_t rchildren;
632
633                                         verify(nvlist_lookup_nvlist_array(cnv,
634                                             ZPOOL_CONFIG_CHILDREN, &rchild,
635                                             &rchildren) == 0);
636                                         assert(rchildren == 2);
637                                         cnv = rchild[0];
638
639                                         verify(nvlist_lookup_string(cnv,
640                                             ZPOOL_CONFIG_TYPE,
641                                             &childtype) == 0);
642                                 }
643
644                                 verify(nvlist_lookup_string(cnv,
645                                     ZPOOL_CONFIG_PATH, &path) == 0);
646
647                                 /*
648                                  * If we have a raidz/mirror that combines disks
649                                  * with files, report it as an error.
650                                  */
651                                 if (!dontreport && type != NULL &&
652                                     strcmp(type, childtype) != 0) {
653                                         if (ret != NULL)
654                                                 free(ret);
655                                         ret = NULL;
656                                         if (fatal)
657                                                 vdev_error(gettext(
658                                                     "mismatched replication "
659                                                     "level: %s contains both "
660                                                     "files and devices\n"),
661                                                     rep.zprl_type);
662                                         else
663                                                 return (NULL);
664                                         dontreport = B_TRUE;
665                                 }
666
667                                 /*
668                                  * According to stat(2), the value of 'st_size'
669                                  * is undefined for block devices and character
670                                  * devices.  But there is no effective way to
671                                  * determine the real size in userland.
672                                  *
673                                  * Instead, we'll take advantage of an
674                                  * implementation detail of spec_size().  If the
675                                  * device is currently open, then we (should)
676                                  * return a valid size.
677                                  *
678                                  * If we still don't get a valid size (indicated
679                                  * by a size of 0 or MAXOFFSET_T), then ignore
680                                  * this device altogether.
681                                  */
682                                 if ((fd = open(path, O_RDONLY)) >= 0) {
683                                         err = fstat64(fd, &statbuf);
684                                         (void) close(fd);
685                                 } else {
686                                         err = stat64(path, &statbuf);
687                                 }
688
689                                 if (err != 0 ||
690                                     statbuf.st_size == 0 ||
691                                     statbuf.st_size == MAXOFFSET_T)
692                                         continue;
693
694                                 size = statbuf.st_size;
695
696                                 /*
697                                  * Also make sure that devices and
698                                  * slices have a consistent size.  If
699                                  * they differ by a significant amount
700                                  * (~16MB) then report an error.
701                                  */
702                                 if (!dontreport &&
703                                     (vdev_size != -1ULL &&
704                                     (labs(size - vdev_size) >
705                                     ZPOOL_FUZZ))) {
706                                         if (ret != NULL)
707                                                 free(ret);
708                                         ret = NULL;
709                                         if (fatal)
710                                                 vdev_error(gettext(
711                                                     "%s contains devices of "
712                                                     "different sizes\n"),
713                                                     rep.zprl_type);
714                                         else
715                                                 return (NULL);
716                                         dontreport = B_TRUE;
717                                 }
718
719                                 type = childtype;
720                                 vdev_size = size;
721                         }
722                 }
723
724                 /*
725                  * At this point, we have the replication of the last toplevel
726                  * vdev in 'rep'.  Compare it to 'lastrep' to see if its
727                  * different.
728                  */
729                 if (lastrep.zprl_type != NULL) {
730                         if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
731                                 if (ret != NULL)
732                                         free(ret);
733                                 ret = NULL;
734                                 if (fatal)
735                                         vdev_error(gettext(
736                                             "mismatched replication level: "
737                                             "both %s and %s vdevs are "
738                                             "present\n"),
739                                             lastrep.zprl_type, rep.zprl_type);
740                                 else
741                                         return (NULL);
742                         } else if (lastrep.zprl_parity != rep.zprl_parity) {
743                                 if (ret)
744                                         free(ret);
745                                 ret = NULL;
746                                 if (fatal)
747                                         vdev_error(gettext(
748                                             "mismatched replication level: "
749                                             "both %llu and %llu device parity "
750                                             "%s vdevs are present\n"),
751                                             lastrep.zprl_parity,
752                                             rep.zprl_parity,
753                                             rep.zprl_type);
754                                 else
755                                         return (NULL);
756                         } else if (lastrep.zprl_children != rep.zprl_children) {
757                                 if (ret)
758                                         free(ret);
759                                 ret = NULL;
760                                 if (fatal)
761                                         vdev_error(gettext(
762                                             "mismatched replication level: "
763                                             "both %llu-way and %llu-way %s "
764                                             "vdevs are present\n"),
765                                             lastrep.zprl_children,
766                                             rep.zprl_children,
767                                             rep.zprl_type);
768                                 else
769                                         return (NULL);
770                         }
771                 }
772                 lastrep = rep;
773         }
774
775         if (ret != NULL)
776                 *ret = rep;
777
778         return (ret);
779 }
780
781 /*
782  * Check the replication level of the vdev spec against the current pool.  Calls
783  * get_replication() to make sure the new spec is self-consistent.  If the pool
784  * has a consistent replication level, then we ignore any errors.  Otherwise,
785  * report any difference between the two.
786  */
787 static int
788 check_replication(nvlist_t *config, nvlist_t *newroot)
789 {
790         nvlist_t **child;
791         uint_t  children;
792         replication_level_t *current = NULL, *new;
793         int ret;
794
795         /*
796          * If we have a current pool configuration, check to see if it's
797          * self-consistent.  If not, simply return success.
798          */
799         if (config != NULL) {
800                 nvlist_t *nvroot;
801
802                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
803                     &nvroot) == 0);
804                 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
805                         return (0);
806         }
807         /*
808          * for spares there may be no children, and therefore no
809          * replication level to check
810          */
811         if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
812             &child, &children) != 0) || (children == 0)) {
813                 free(current);
814                 return (0);
815         }
816
817         /*
818          * If all we have is logs then there's no replication level to check.
819          */
820         if (num_logs(newroot) == children) {
821                 free(current);
822                 return (0);
823         }
824
825         /*
826          * Get the replication level of the new vdev spec, reporting any
827          * inconsistencies found.
828          */
829         if ((new = get_replication(newroot, B_TRUE)) == NULL) {
830                 free(current);
831                 return (-1);
832         }
833
834         /*
835          * Check to see if the new vdev spec matches the replication level of
836          * the current pool.
837          */
838         ret = 0;
839         if (current != NULL) {
840                 if (strcmp(current->zprl_type, new->zprl_type) != 0) {
841                         vdev_error(gettext(
842                             "mismatched replication level: pool uses %s "
843                             "and new vdev is %s\n"),
844                             current->zprl_type, new->zprl_type);
845                         ret = -1;
846                 } else if (current->zprl_parity != new->zprl_parity) {
847                         vdev_error(gettext(
848                             "mismatched replication level: pool uses %llu "
849                             "device parity and new vdev uses %llu\n"),
850                             current->zprl_parity, new->zprl_parity);
851                         ret = -1;
852                 } else if (current->zprl_children != new->zprl_children) {
853                         vdev_error(gettext(
854                             "mismatched replication level: pool uses %llu-way "
855                             "%s and new vdev uses %llu-way %s\n"),
856                             current->zprl_children, current->zprl_type,
857                             new->zprl_children, new->zprl_type);
858                         ret = -1;
859                 }
860         }
861
862         free(new);
863         if (current != NULL)
864                 free(current);
865
866         return (ret);
867 }
868
/*
 * Overwrite the first 4096 bytes of the device or file at 'path' with zeros
 * and flush them with fdatasync().
 *
 * Returns 0 when all 4096 bytes were written.  Returns -1, after printing a
 * message to stderr, when the path cannot be opened, when write() fails, or
 * when write() stores fewer bytes than requested.
 */
static int
zero_label(char *path)
{
	enum { ZERO_LABEL_SIZE = 4096 };
	static const char zeros[ZERO_LABEL_SIZE];
	ssize_t written;
	int fd, write_errno;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	written = write(fd, zeros, sizeof (zeros));
	/*
	 * Remember why write() failed before fdatasync() and close() get a
	 * chance to overwrite errno with an unrelated value.
	 */
	write_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), ZERO_LABEL_SIZE, path,
		    strerror(write_errno));
		return (-1);
	}

	if (written != ZERO_LABEL_SIZE) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, ZERO_LABEL_SIZE, path);
		return (-1);
	}

	return (0);
}
901
902 /*
903  * Go through and find any whole disks in the vdev specification, labelling them
904  * as appropriate.  When constructing the vdev spec, we were unable to open this
905  * device in order to provide a devid.  Now that we have labelled the disk and
906  * know that slice 0 is valid, we can construct the devid now.
907  *
908  * If the disk was already labeled with an EFI label, we will have gotten the
909  * devid already (because we were able to open the whole disk).  Otherwise, we
910  * need to get the devid after we label the disk.
911  */
912 static int
913 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
914 {
915         nvlist_t **child;
916         uint_t c, children;
917         char *type, *path, *diskname;
918         char devpath[MAXPATHLEN];
919         char udevpath[MAXPATHLEN];
920         uint64_t wholedisk;
921         struct stat64 statbuf;
922         int ret;
923
924         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
925
926         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
927             &child, &children) != 0) {
928
929                 if (strcmp(type, VDEV_TYPE_DISK) != 0)
930                         return (0);
931
932                 /*
933                  * We have a disk device.  If this is a whole disk write
934                  * out the efi partition table, otherwise write zero's to
935                  * the first 4k of the partition.  This is to ensure that
936                  * libblkid will not misidentify the partition due to a
937                  * magic value left by the previous filesystem.
938                  */
939                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
940                 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
941                     &wholedisk));
942
943                 if (!wholedisk) {
944                         ret = zero_label(path);
945                         return (ret);
946                 }
947
948                 if (realpath(path, devpath) == NULL) {
949                         ret = errno;
950                         (void) fprintf(stderr,
951                             gettext("cannot resolve path '%s'\n"), path);
952                         return (ret);
953                 }
954
955                 /*
956                  * Remove any previously existing symlink from a udev path to
957                  * the device before labeling the disk.  This makes
958                  * zpool_label_disk_wait() truly wait for the new link to show
959                  * up instead of returning if it finds an old link still in
960                  * place.  Otherwise there is a window between when udev
961                  * deletes and recreates the link during which access attempts
962                  * will fail with ENOENT.
963                  */
964                 strncpy(udevpath, path, MAXPATHLEN);
965                 (void) zfs_append_partition(udevpath, MAXPATHLEN);
966
967                 if ((strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) &&
968                     (lstat64(udevpath, &statbuf) == 0) &&
969                     S_ISLNK(statbuf.st_mode))
970                         (void) unlink(udevpath);
971
972                 diskname = strrchr(devpath, '/');
973                 assert(diskname != NULL);
974                 diskname++;
975                 if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
976                         return (-1);
977
978                 /*
979                  * Now we've labeled the disk and the partitions have been
980                  * created.  We still need to wait for udev to create the
981                  * symlinks to those partitions.
982                  */
983                 if ((ret = zpool_label_disk_wait(udevpath, 1000)) != 0) {
984                         (void) fprintf(stderr,
985                             gettext( "cannot resolve path '%s'\n"), udevpath);
986                         return (-1);
987                 }
988
989                 /*
990                  * Update the path to refer to the partition.  The presence of
991                  * the 'whole_disk' field indicates to the CLI that we should
992                  * chop off the partition number when displaying the device in
993                  * future output.
994                  */
995                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
996
997                 /* Just in case this partition already existed. */
998                 (void) zero_label(udevpath);
999
1000                 return (0);
1001         }
1002
1003         for (c = 0; c < children; c++)
1004                 if ((ret = make_disks(zhp, child[c])) != 0)
1005                         return (ret);
1006
1007         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1008             &child, &children) == 0)
1009                 for (c = 0; c < children; c++)
1010                         if ((ret = make_disks(zhp, child[c])) != 0)
1011                                 return (ret);
1012
1013         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1014             &child, &children) == 0)
1015                 for (c = 0; c < children; c++)
1016                         if ((ret = make_disks(zhp, child[c])) != 0)
1017                                 return (ret);
1018
1019         return (0);
1020 }
1021
1022 /*
1023  * Determine if the given path is a hot spare within the given configuration.
1024  */
1025 static boolean_t
1026 is_spare(nvlist_t *config, const char *path)
1027 {
1028         int fd;
1029         pool_state_t state;
1030         char *name = NULL;
1031         nvlist_t *label;
1032         uint64_t guid, spareguid;
1033         nvlist_t *nvroot;
1034         nvlist_t **spares;
1035         uint_t i, nspares;
1036         boolean_t inuse;
1037
1038         if ((fd = open(path, O_RDONLY|O_EXCL)) < 0)
1039                 return (B_FALSE);
1040
1041         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1042             !inuse ||
1043             state != POOL_STATE_SPARE ||
1044             zpool_read_label(fd, &label) != 0) {
1045                 free(name);
1046                 (void) close(fd);
1047                 return (B_FALSE);
1048         }
1049         free(name);
1050         (void) close(fd);
1051
1052         verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1053         nvlist_free(label);
1054
1055         verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1056             &nvroot) == 0);
1057         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1058             &spares, &nspares) == 0) {
1059                 for (i = 0; i < nspares; i++) {
1060                         verify(nvlist_lookup_uint64(spares[i],
1061                             ZPOOL_CONFIG_GUID, &spareguid) == 0);
1062                         if (spareguid == guid)
1063                                 return (B_TRUE);
1064                 }
1065         }
1066
1067         return (B_FALSE);
1068 }
1069
1070 /*
1071  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1072  * the majority of this task.
1073  */
1074 static int
1075 check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1076     boolean_t replacing, boolean_t isspare)
1077 {
1078         nvlist_t **child;
1079         uint_t c, children;
1080         char *type, *path;
1081         int ret = 0;
1082         char buf[MAXPATHLEN];
1083         uint64_t wholedisk = B_FALSE;
1084
1085         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1086
1087         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1088             &child, &children) != 0) {
1089
1090                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1091                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1092                         verify(!nvlist_lookup_uint64(nv,
1093                                ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1094
1095                 /*
1096                  * As a generic check, we look to see if this is a replace of a
1097                  * hot spare within the same pool.  If so, we allow it
1098                  * regardless of what libblkid or zpool_in_use() says.
1099                  */
1100                 if (replacing) {
1101                         if (wholedisk)
1102                                 (void) snprintf(buf, sizeof (buf), "%ss0",
1103                                     path);
1104                         else
1105                                 (void) strlcpy(buf, path, sizeof (buf));
1106
1107                         if (is_spare(config, buf))
1108                                 return (0);
1109                 }
1110
1111                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1112                         ret = check_device(path, force, isspare, wholedisk);
1113
1114                 if (strcmp(type, VDEV_TYPE_FILE) == 0)
1115                         ret = check_file(path, force, isspare);
1116
1117                 return (ret);
1118         }
1119
1120         for (c = 0; c < children; c++)
1121                 if ((ret = check_in_use(config, child[c], force,
1122                     replacing, B_FALSE)) != 0)
1123                         return (ret);
1124
1125         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1126             &child, &children) == 0)
1127                 for (c = 0; c < children; c++)
1128                         if ((ret = check_in_use(config, child[c], force,
1129                             replacing, B_TRUE)) != 0)
1130                                 return (ret);
1131
1132         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1133             &child, &children) == 0)
1134                 for (c = 0; c < children; c++)
1135                         if ((ret = check_in_use(config, child[c], force,
1136                             replacing, B_FALSE)) != 0)
1137                                 return (ret);
1138
1139         return (0);
1140 }
1141
/*
 * Decide whether 'type' is one of the grouping keywords accepted in a vdev
 * specification and, if so, return the matching VDEV_TYPE_* string.
 *
 * Recognised keywords are "raidz" (parity 1), "raidzN" with N a decimal
 * number from 1 to 254 written without leading zeros, "mirror", "spare",
 * "log" and "cache".  NULL is returned for anything else.
 *
 * 'mindev' and 'maxdev' may each be NULL.  When non-NULL and a keyword is
 * recognised they receive the smallest and largest number of member devices
 * allowed: parity + 1 and 255 for raidz, 2 and INT_MAX for mirror, 1 and
 * INT_MAX for the others.  For a word that does not start with "raidz" and
 * is not a keyword, *maxdev is still set to INT_MAX.
 */
static const char *
is_grouping(const char *type, int *mindev, int *maxdev)
{
	if (strncmp(type, "raidz", 5) == 0) {
		const char *p = type + 5;
		char *end;
		long nparity;

		if (*p == '\0') {
			nparity = 1;
		} else if (*p < '1' || *p > '9') {
			/*
			 * The parity must start with a non-zero digit.  This
			 * rejects zero prefixes and also the leading
			 * whitespace or sign that strtol() would otherwise
			 * silently accept (e.g. "raidz+2").
			 */
			return (NULL);
		} else {
			errno = 0;
			nparity = strtol(p, &end, 10);
			if (errno != 0 || nparity < 1 || nparity >= 255 ||
			    *end != '\0')
				return (NULL);
		}

		if (mindev != NULL)
			*mindev = nparity + 1;
		if (maxdev != NULL)
			*maxdev = 255;
		return (VDEV_TYPE_RAIDZ);
	}

	if (maxdev != NULL)
		*maxdev = INT_MAX;

	if (strcmp(type, "mirror") == 0) {
		if (mindev != NULL)
			*mindev = 2;
		return (VDEV_TYPE_MIRROR);
	}

	if (strcmp(type, "spare") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_SPARE);
	}

	if (strcmp(type, "log") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_LOG);
	}

	if (strcmp(type, "cache") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_L2CACHE);
	}

	return (NULL);
}
1198
1199 /*
1200  * Construct a syntactically valid vdev specification,
1201  * and ensure that all devices and files exist and can be opened.
1202  * Note: we don't bother freeing anything in the error paths
1203  * because the program is just going to exit anyway.
1204  */
1205 nvlist_t *
1206 construct_spec(nvlist_t *props, int argc, char **argv)
1207 {
1208         nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1209         int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1210         const char *type;
1211         uint64_t is_log;
1212         boolean_t seen_logs;
1213
1214         top = NULL;
1215         toplevels = 0;
1216         spares = NULL;
1217         l2cache = NULL;
1218         nspares = 0;
1219         nlogs = 0;
1220         nl2cache = 0;
1221         is_log = B_FALSE;
1222         seen_logs = B_FALSE;
1223
1224         while (argc > 0) {
1225                 nv = NULL;
1226
1227                 /*
1228                  * If it's a mirror or raidz, the subsequent arguments are
1229                  * its leaves -- until we encounter the next mirror or raidz.
1230                  */
1231                 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1232                         nvlist_t **child = NULL;
1233                         int c, children = 0;
1234
1235                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1236                                 if (spares != NULL) {
1237                                         (void) fprintf(stderr,
1238                                             gettext("invalid vdev "
1239                                             "specification: 'spare' can be "
1240                                             "specified only once\n"));
1241                                         return (NULL);
1242                                 }
1243                                 is_log = B_FALSE;
1244                         }
1245
1246                         if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1247                                 if (seen_logs) {
1248                                         (void) fprintf(stderr,
1249                                             gettext("invalid vdev "
1250                                             "specification: 'log' can be "
1251                                             "specified only once\n"));
1252                                         return (NULL);
1253                                 }
1254                                 seen_logs = B_TRUE;
1255                                 is_log = B_TRUE;
1256                                 argc--;
1257                                 argv++;
1258                                 /*
1259                                  * A log is not a real grouping device.
1260                                  * We just set is_log and continue.
1261                                  */
1262                                 continue;
1263                         }
1264
1265                         if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1266                                 if (l2cache != NULL) {
1267                                         (void) fprintf(stderr,
1268                                             gettext("invalid vdev "
1269                                             "specification: 'cache' can be "
1270                                             "specified only once\n"));
1271                                         return (NULL);
1272                                 }
1273                                 is_log = B_FALSE;
1274                         }
1275
1276                         if (is_log) {
1277                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1278                                         (void) fprintf(stderr,
1279                                             gettext("invalid vdev "
1280                                             "specification: unsupported 'log' "
1281                                             "device: %s\n"), type);
1282                                         return (NULL);
1283                                 }
1284                                 nlogs++;
1285                         }
1286
1287                         for (c = 1; c < argc; c++) {
1288                                 if (is_grouping(argv[c], NULL, NULL) != NULL)
1289                                         break;
1290                                 children++;
1291                                 child = realloc(child,
1292                                     children * sizeof (nvlist_t *));
1293                                 if (child == NULL)
1294                                         zpool_no_memory();
1295                                 if ((nv = make_leaf_vdev(props, argv[c], B_FALSE))
1296                                     == NULL)
1297                                         return (NULL);
1298                                 child[children - 1] = nv;
1299                         }
1300
1301                         if (children < mindev) {
1302                                 (void) fprintf(stderr, gettext("invalid vdev "
1303                                     "specification: %s requires at least %d "
1304                                     "devices\n"), argv[0], mindev);
1305                                 return (NULL);
1306                         }
1307
1308                         if (children > maxdev) {
1309                                 (void) fprintf(stderr, gettext("invalid vdev "
1310                                     "specification: %s supports no more than "
1311                                     "%d devices\n"), argv[0], maxdev);
1312                                 return (NULL);
1313                         }
1314
1315                         argc -= c;
1316                         argv += c;
1317
1318                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1319                                 spares = child;
1320                                 nspares = children;
1321                                 continue;
1322                         } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1323                                 l2cache = child;
1324                                 nl2cache = children;
1325                                 continue;
1326                         } else {
1327                                 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1328                                     0) == 0);
1329                                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1330                                     type) == 0);
1331                                 verify(nvlist_add_uint64(nv,
1332                                     ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1333                                 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1334                                         verify(nvlist_add_uint64(nv,
1335                                             ZPOOL_CONFIG_NPARITY,
1336                                             mindev - 1) == 0);
1337                                 }
1338                                 verify(nvlist_add_nvlist_array(nv,
1339                                     ZPOOL_CONFIG_CHILDREN, child,
1340                                     children) == 0);
1341
1342                                 for (c = 0; c < children; c++)
1343                                         nvlist_free(child[c]);
1344                                 free(child);
1345                         }
1346                 } else {
1347                         /*
1348                          * We have a device.  Pass off to make_leaf_vdev() to
1349                          * construct the appropriate nvlist describing the vdev.
1350                          */
1351                         if ((nv = make_leaf_vdev(props, argv[0], is_log)) == NULL)
1352                                 return (NULL);
1353                         if (is_log)
1354                                 nlogs++;
1355                         argc--;
1356                         argv++;
1357                 }
1358
1359                 toplevels++;
1360                 top = realloc(top, toplevels * sizeof (nvlist_t *));
1361                 if (top == NULL)
1362                         zpool_no_memory();
1363                 top[toplevels - 1] = nv;
1364         }
1365
1366         if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1367                 (void) fprintf(stderr, gettext("invalid vdev "
1368                     "specification: at least one toplevel vdev must be "
1369                     "specified\n"));
1370                 return (NULL);
1371         }
1372
1373         if (seen_logs && nlogs == 0) {
1374                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1375                     "log requires at least 1 device\n"));
1376                 return (NULL);
1377         }
1378
1379         /*
1380          * Finally, create nvroot and add all top-level vdevs to it.
1381          */
1382         verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1383         verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1384             VDEV_TYPE_ROOT) == 0);
1385         verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1386             top, toplevels) == 0);
1387         if (nspares != 0)
1388                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1389                     spares, nspares) == 0);
1390         if (nl2cache != 0)
1391                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1392                     l2cache, nl2cache) == 0);
1393
1394         for (t = 0; t < toplevels; t++)
1395                 nvlist_free(top[t]);
1396         for (t = 0; t < nspares; t++)
1397                 nvlist_free(spares[t]);
1398         for (t = 0; t < nl2cache; t++)
1399                 nvlist_free(l2cache[t]);
1400         if (spares)
1401                 free(spares);
1402         if (l2cache)
1403                 free(l2cache);
1404         free(top);
1405
1406         return (nvroot);
1407 }
1408
1409 nvlist_t *
1410 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1411     splitflags_t flags, int argc, char **argv)
1412 {
1413         nvlist_t *newroot = NULL, **child;
1414         uint_t c, children;
1415
1416         if (argc > 0) {
1417                 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1418                         (void) fprintf(stderr, gettext("Unable to build a "
1419                             "pool from the specified devices\n"));
1420                         return (NULL);
1421                 }
1422
1423                 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1424                         nvlist_free(newroot);
1425                         return (NULL);
1426                 }
1427
1428                 /* avoid any tricks in the spec */
1429                 verify(nvlist_lookup_nvlist_array(newroot,
1430                     ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1431                 for (c = 0; c < children; c++) {
1432                         char *path;
1433                         const char *type;
1434                         int min, max;
1435
1436                         verify(nvlist_lookup_string(child[c],
1437                             ZPOOL_CONFIG_PATH, &path) == 0);
1438                         if ((type = is_grouping(path, &min, &max)) != NULL) {
1439                                 (void) fprintf(stderr, gettext("Cannot use "
1440                                     "'%s' as a device for splitting\n"), type);
1441                                 nvlist_free(newroot);
1442                                 return (NULL);
1443                         }
1444                 }
1445         }
1446
1447         if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1448                 if (newroot != NULL)
1449                         nvlist_free(newroot);
1450                 return (NULL);
1451         }
1452
1453         return (newroot);
1454 }
1455
1456 /*
1457  * Get and validate the contents of the given vdev specification.  This ensures
1458  * that the nvlist returned is well-formed, that all the devices exist, and that
1459  * they are not currently in use by any other known consumer.  The 'poolconfig'
1460  * parameter is the current configuration of the pool when adding devices
1461  * existing pool, and is used to perform additional checks, such as changing the
1462  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1463  * new pool.  The 'force' flag controls whether devices should be forcefully
1464  * added, even if they appear in use.
1465  */
1466 nvlist_t *
1467 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1468     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1469 {
1470         nvlist_t *newroot;
1471         nvlist_t *poolconfig = NULL;
1472         is_force = force;
1473
1474         /*
1475          * Construct the vdev specification.  If this is successful, we know
1476          * that we have a valid specification, and that all devices can be
1477          * opened.
1478          */
1479         if ((newroot = construct_spec(props, argc, argv)) == NULL)
1480                 return (NULL);
1481
1482         if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1483                 return (NULL);
1484
1485         /*
1486          * Validate each device to make sure that its not shared with another
1487          * subsystem.  We do this even if 'force' is set, because there are some
1488          * uses (such as a dedicated dump device) that even '-f' cannot
1489          * override.
1490          */
1491         if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
1492                 nvlist_free(newroot);
1493                 return (NULL);
1494         }
1495
1496         /*
1497          * Check the replication level of the given vdevs and report any errors
1498          * found.  We include the existing pool spec, if any, as we need to
1499          * catch changes against the existing replication level.
1500          */
1501         if (check_rep && check_replication(poolconfig, newroot) != 0) {
1502                 nvlist_free(newroot);
1503                 return (NULL);
1504         }
1505
1506         /*
1507          * Run through the vdev specification and label any whole disks found.
1508          */
1509         if (!dryrun && make_disks(zhp, newroot) != 0) {
1510                 nvlist_free(newroot);
1511                 return (NULL);
1512         }
1513
1514         return (newroot);
1515 }