Add vdev_id for JBOD-friendly udev aliases
[zfs.git] / cmd / zpool / zpool_vdev.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25
26 /*
27  * Functions to convert between a list of vdevs and an nvlist representing the
28  * configuration.  Each entry in the list can be one of:
29  *
30  *      Device vdevs
31  *              disk=(path=..., devid=...)
32  *              file=(path=...)
33  *
34  *      Group vdevs
35  *              raidz[1|2]=(...)
36  *              mirror=(...)
37  *
38  *      Hot spares
39  *
40  * While the underlying implementation supports it, group vdevs cannot contain
41  * other group vdevs.  All userland verification of devices is contained within
42  * this file.  If successful, the nvlist returned can be passed directly to the
43  * kernel; we've done as much verification as possible in userland.
44  *
45  * Hot spares are a special case, and passed down as an array of disk vdevs, at
46  * the same level as the root of the vdev tree.
47  *
48  * The only function exported by this file is 'make_root_vdev'.  The
49  * function performs several passes:
50  *
51  *      1. Construct the vdev specification.  Performs syntax validation and
52  *         makes sure each device is valid.
53  *      2. Check for devices in use.  Uses libblkid to make sure that none
54  *         of the devices is already in use.  Some conflicts can be
55  *         overridden using the 'force' flag, others cannot.
56  *      3. Check for replication errors if the 'force' flag is not specified.
57  *         Validates that the replication level is consistent across the
58  *         entire pool.
59  *      4. Call libzfs to label any whole disks with an EFI label.
60  */
61
62 #include <assert.h>
63 #include <ctype.h>
64 #include <devid.h>
65 #include <errno.h>
66 #include <fcntl.h>
67 #include <libintl.h>
68 #include <libnvpair.h>
69 #include <limits.h>
70 #include <stdio.h>
71 #include <string.h>
72 #include <unistd.h>
73 #include <sys/efi_partition.h>
74 #include <sys/stat.h>
75 #include <sys/vtoc.h>
76 #include <sys/mntent.h>
77 #include <uuid/uuid.h>
78 #ifdef HAVE_LIBBLKID
79 #include <blkid/blkid.h>
80 #else
81 #define blkid_cache void *
82 #endif /* HAVE_LIBBLKID */
83
84 #include "zpool_util.h"
85
86 /*
87  * For any given vdev specification, we can have multiple errors.  The
88  * vdev_error() function keeps track of whether we have seen an error yet, and
89  * prints out a header if it's the first error we've seen.
90  */
/*
 * Set to B_TRUE by vdev_error() once it has printed its header, so that the
 * header is emitted only for the first error.
 */
boolean_t error_seen;
/*
 * Selects the wording of the vdev_error() header: when set, the errors are
 * described as needing manual repair; otherwise the user is told to use '-f'.
 */
boolean_t is_force;
93
94 /*PRINTFLIKE1*/
95 static void
96 vdev_error(const char *fmt, ...)
97 {
98         va_list ap;
99
100         if (!error_seen) {
101                 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
102                 if (!is_force)
103                         (void) fprintf(stderr, gettext("use '-f' to override "
104                             "the following errors:\n"));
105                 else
106                         (void) fprintf(stderr, gettext("the following errors "
107                             "must be manually repaired:\n"));
108                 error_seen = B_TRUE;
109         }
110
111         va_start(ap, fmt);
112         (void) vfprintf(stderr, fmt, ap);
113         va_end(ap);
114 }
115
116 /*
117  * Check that a file is valid.  All we can do in this case is check that it's
118  * not in use by another pool, and not in use by swap.
119  */
120 static int
121 check_file(const char *file, boolean_t force, boolean_t isspare)
122 {
123         char  *name;
124         int fd;
125         int ret = 0;
126         pool_state_t state;
127         boolean_t inuse;
128
129         if ((fd = open(file, O_RDONLY)) < 0)
130                 return (0);
131
132         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
133                 const char *desc;
134
135                 switch (state) {
136                 case POOL_STATE_ACTIVE:
137                         desc = gettext("active");
138                         break;
139
140                 case POOL_STATE_EXPORTED:
141                         desc = gettext("exported");
142                         break;
143
144                 case POOL_STATE_POTENTIALLY_ACTIVE:
145                         desc = gettext("potentially active");
146                         break;
147
148                 default:
149                         desc = gettext("unknown");
150                         break;
151                 }
152
153                 /*
154                  * Allow hot spares to be shared between pools.
155                  */
156                 if (state == POOL_STATE_SPARE && isspare)
157                         return (0);
158
159                 if (state == POOL_STATE_ACTIVE ||
160                     state == POOL_STATE_SPARE || !force) {
161                         switch (state) {
162                         case POOL_STATE_SPARE:
163                                 vdev_error(gettext("%s is reserved as a hot "
164                                     "spare for pool %s\n"), file, name);
165                                 break;
166                         default:
167                                 vdev_error(gettext("%s is part of %s pool "
168                                     "'%s'\n"), file, desc, name);
169                                 break;
170                         }
171                         ret = -1;
172                 }
173
174                 free(name);
175         }
176
177         (void) close(fd);
178         return (ret);
179 }
180
/*
 * Warn on stderr that the in-use check itself failed; 'err' is decoded with
 * strerror().
 */
static void
check_error(int err)
{
	const char *reason = strerror(err);

	(void) fprintf(stderr,
	    gettext("warning: device in use checking failed: %s\n"), reason);
}
187
188 static int
189 check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
190 {
191         int err;
192 #ifdef HAVE_LIBBLKID
193         char *value;
194
195         /* No valid type detected device is safe to use */
196         value = blkid_get_tag_value(cache, "TYPE", path);
197         if (value == NULL)
198                 return (0);
199
200         /*
201          * If libblkid detects a ZFS device, we check the device
202          * using check_file() to see if it's safe.  The one safe
203          * case is a spare device shared between multiple pools.
204          */
205         if (strcmp(value, "zfs") == 0) {
206                 err = check_file(path, force, isspare);
207         } else {
208                 if (force) {
209                         err = 0;
210                 } else {
211                         err = -1;
212                         vdev_error(gettext("%s contains a filesystem of "
213                                    "type '%s'\n"), path, value);
214                 }
215         }
216
217         free(value);
218 #else
219         err = check_file(path, force, isspare);
220 #endif /* HAVE_LIBBLKID */
221
222         return (err);
223 }
224
225 /*
226  * Validate a whole disk.  Iterate over all slices on the disk and make sure
227  * that none is in use by calling check_slice().
228  */
229 static int
230 check_disk(const char *path, blkid_cache cache, int force,
231            boolean_t isspare, boolean_t iswholedisk)
232 {
233         struct dk_gpt *vtoc;
234         char slice_path[MAXPATHLEN];
235         int err = 0;
236         int fd, i;
237
238         /* This is not a wholedisk we only check the given partition */
239         if (!iswholedisk)
240                 return check_slice(path, cache, force, isspare);
241
242         /*
243          * When the device is a whole disk try to read the efi partition
244          * label.  If this is successful we safely check the all of the
245          * partitions.  However, when it fails it may simply be because
246          * the disk is partitioned via the MBR.  Since we currently can
247          * not easily decode the MBR return a failure and prompt to the
248          * user to use force option since we cannot check the partitions.
249          */
250         if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) {
251                 check_error(errno);
252                 return -1;
253         }
254
255         if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
256                 (void) close(fd);
257
258                 if (force) {
259                         return 0;
260                 } else {
261                         vdev_error(gettext("%s does not contain an EFI "
262                             "label but it may contain partition\n"
263                             "information in the MBR.\n"), path);
264                         return -1;
265                 }
266         }
267
268         /*
269          * The primary efi partition label is damaged however the secondary
270          * label at the end of the device is intact.  Rather than use this
271          * label we should play it safe and treat this as a non efi device.
272          */
273         if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
274                 efi_free(vtoc);
275                 (void) close(fd);
276
277                 if (force) {
278                         /* Partitions will no be created using the backup */
279                         return 0;
280                 } else {
281                         vdev_error(gettext("%s contains a corrupt primary "
282                             "EFI label.\n"), path);
283                         return -1;
284                 }
285         }
286
287         for (i = 0; i < vtoc->efi_nparts; i++) {
288
289                 if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
290                     uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
291                         continue;
292
293                 if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
294                         (void) snprintf(slice_path, sizeof (slice_path),
295                             "%s%s%d", path, "-part", i+1);
296                 else
297                         (void) snprintf(slice_path, sizeof (slice_path),
298                             "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
299                             "p" : "", i+1);
300
301                 err = check_slice(slice_path, cache, force, isspare);
302                 if (err)
303                         break;
304         }
305
306         efi_free(vtoc);
307         (void) close(fd);
308
309         return (err);
310 }
311
312 static int
313 check_device(const char *path, boolean_t force,
314              boolean_t isspare, boolean_t iswholedisk)
315 {
316         static blkid_cache cache = NULL;
317
318 #ifdef HAVE_LIBBLKID
319         /*
320          * There is no easy way to add a correct blkid_put_cache() call,
321          * memory will be reclaimed when the command exits.
322          */
323         if (cache == NULL) {
324                 int err;
325
326                 if ((err = blkid_get_cache(&cache, NULL)) != 0) {
327                         check_error(err);
328                         return -1;
329                 }
330
331                 if ((err = blkid_probe_all(cache)) != 0) {
332                         blkid_put_cache(cache);
333                         check_error(err);
334                         return -1;
335                 }
336         }
337 #endif /* HAVE_LIBBLKID */
338
339         return check_disk(path, cache, force, isspare, iswholedisk);
340 }
341
342 /*
343  * By "whole disk" we mean an entire physical disk (something we can
344  * label, toggle the write cache on, etc.) as opposed to the full
345  * capacity of a pseudo-device such as lofi or did.  We act as if we
346  * are labeling the disk, which should be a pretty good test of whether
347  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
348  * it isn't.
349  */
350 static boolean_t
351 is_whole_disk(const char *path)
352 {
353         struct dk_gpt *label;
354         int     fd;
355
356         if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0)
357                 return (B_FALSE);
358         if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
359                 (void) close(fd);
360                 return (B_FALSE);
361         }
362         efi_free(label);
363         (void) close(fd);
364         return (B_TRUE);
365 }
366
367 /*
368  * This may be a shorthand device path or it could be total gibberish.
369  * Check to see if it's a known device in /dev/, /dev/disk/by-id,
370  * /dev/disk/by-label, /dev/disk/by-path, /dev/disk/by-uuid,
371  * /dev/disk/by-vdev, or /dev/disk/zpool/.  As part of this check, see
372  * if we've been given an entire disk (minus the slice number).
373  */
374 static int
375 is_shorthand_path(const char *arg, char *path,
376                   struct stat64 *statbuf, boolean_t *wholedisk)
377 {
378         if (zfs_resolve_shortname(arg, path, MAXPATHLEN) == 0) {
379                 *wholedisk = is_whole_disk(path);
380                 if (*wholedisk || (stat64(path, statbuf) == 0))
381                         return (0);
382         }
383
384         strlcpy(path, arg, sizeof(path));
385         memset(statbuf, 0, sizeof(*statbuf));
386         *wholedisk = B_FALSE;
387
388         return (ENOENT);
389 }
390
391 /*
392  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
393  * device, fill in the device id to make a complete nvlist.  Valid forms for a
394  * leaf vdev are:
395  *
396  *      /dev/xxx        Complete disk path
397  *      /xxx            Full path to file
398  *      xxx             Shorthand for /dev/disk/yyy/xxx
399  */
400 static nvlist_t *
401 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
402 {
403         char path[MAXPATHLEN];
404         struct stat64 statbuf;
405         nvlist_t *vdev = NULL;
406         char *type = NULL;
407         boolean_t wholedisk = B_FALSE;
408         int err;
409
410         /*
411          * Determine what type of vdev this is, and put the full path into
412          * 'path'.  We detect whether this is a device of file afterwards by
413          * checking the st_mode of the file.
414          */
415         if (arg[0] == '/') {
416                 /*
417                  * Complete device or file path.  Exact type is determined by
418                  * examining the file descriptor afterwards.  Symbolic links
419                  * are resolved to their real paths for the is_whole_disk()
420                  * and S_ISBLK/S_ISREG type checks.  However, we are careful
421                  * to store the given path as ZPOOL_CONFIG_PATH to ensure we
422                  * can leverage udev's persistent device labels.
423                  */
424                 if (realpath(arg, path) == NULL) {
425                         (void) fprintf(stderr,
426                             gettext("cannot resolve path '%s'\n"), arg);
427                         return (NULL);
428                 }
429
430                 wholedisk = is_whole_disk(path);
431                 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
432                         (void) fprintf(stderr,
433                             gettext("cannot open '%s': %s\n"),
434                             path, strerror(errno));
435                         return (NULL);
436                 }
437
438                 /* After is_whole_disk() check restore original passed path */
439                 strlcpy(path, arg, MAXPATHLEN);
440         } else {
441                 err = is_shorthand_path(arg, path, &statbuf, &wholedisk);
442                 if (err != 0) {
443                         /*
444                          * If we got ENOENT, then the user gave us
445                          * gibberish, so try to direct them with a
446                          * reasonable error message.  Otherwise,
447                          * regurgitate strerror() since it's the best we
448                          * can do.
449                          */
450                         if (err == ENOENT) {
451                                 (void) fprintf(stderr,
452                                     gettext("cannot open '%s': no such "
453                                     "device in %s\n"), arg, DISK_ROOT);
454                                 (void) fprintf(stderr,
455                                     gettext("must be a full path or "
456                                     "shorthand device name\n"));
457                                 return (NULL);
458                         } else {
459                                 (void) fprintf(stderr,
460                                     gettext("cannot open '%s': %s\n"),
461                                     path, strerror(errno));
462                                 return (NULL);
463                         }
464                 }
465         }
466
467         /*
468          * Determine whether this is a device or a file.
469          */
470         if (wholedisk || S_ISBLK(statbuf.st_mode)) {
471                 type = VDEV_TYPE_DISK;
472         } else if (S_ISREG(statbuf.st_mode)) {
473                 type = VDEV_TYPE_FILE;
474         } else {
475                 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
476                     "block device or regular file\n"), path);
477                 return (NULL);
478         }
479
480         /*
481          * Finally, we have the complete device or file, and we know that it is
482          * acceptable to use.  Construct the nvlist to describe this vdev.  All
483          * vdevs have a 'path' element, and devices also have a 'devid' element.
484          */
485         verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
486         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
487         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
488         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
489         if (strcmp(type, VDEV_TYPE_DISK) == 0)
490                 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
491                     (uint64_t)wholedisk) == 0);
492
493         if (props != NULL) {
494                 uint64_t ashift = 0;
495                 char *value = NULL;
496
497                 if (nvlist_lookup_string(props,
498                     zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0)
499                         zfs_nicestrtonum(NULL, value, &ashift);
500
501                 if (ashift > 0)
502                         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT,
503                             ashift) == 0);
504         }
505
506         return (vdev);
507 }
508
509 /*
510  * Go through and verify the replication level of the pool is consistent.
511  * Performs the following checks:
512  *
513  *      For the new spec, verifies that devices in mirrors and raidz are the
514  *      same size.
515  *
516  *      If the current configuration already has inconsistent replication
517  *      levels, ignore any other potential problems in the new spec.
518  *
519  *      Otherwise, make sure that the current spec (if there is one) and the new
520  *      spec have consistent replication levels.
521  */
/*
 * Replication summary of a single toplevel vdev, as filled in by
 * get_replication().
 */
typedef struct replication_level {
	/* vdev type string, as read from ZPOOL_CONFIG_TYPE */
	char *zprl_type;
	/* number of children counted; 1 for a plain file or disk vdev */
	uint64_t zprl_children;
	/* ZPOOL_CONFIG_NPARITY for raidz vdevs, 0 for everything else */
	uint64_t zprl_parity;
} replication_level_t;

/*
 * Largest size difference, in bytes (16MB), tolerated between the children
 * of one vdev before get_replication() reports them as differently sized.
 */
#define ZPOOL_FUZZ      (16 * 1024 * 1024)
529
530 /*
531  * Given a list of toplevel vdevs, return the current replication level.  If
532  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
533  * an error message will be displayed for each self-inconsistent vdev.
534  */
535 static replication_level_t *
536 get_replication(nvlist_t *nvroot, boolean_t fatal)
537 {
538         nvlist_t **top;
539         uint_t t, toplevels;
540         nvlist_t **child;
541         uint_t c, children;
542         nvlist_t *nv;
543         char *type;
544         replication_level_t lastrep = { 0 }, rep, *ret;
545         boolean_t dontreport;
546
547         ret = safe_malloc(sizeof (replication_level_t));
548
549         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
550             &top, &toplevels) == 0);
551
552         lastrep.zprl_type = NULL;
553         for (t = 0; t < toplevels; t++) {
554                 uint64_t is_log = B_FALSE;
555
556                 nv = top[t];
557
558                 /*
559                  * For separate logs we ignore the top level vdev replication
560                  * constraints.
561                  */
562                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
563                 if (is_log)
564                         continue;
565
566                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
567                     &type) == 0);
568                 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
569                     &child, &children) != 0) {
570                         /*
571                          * This is a 'file' or 'disk' vdev.
572                          */
573                         rep.zprl_type = type;
574                         rep.zprl_children = 1;
575                         rep.zprl_parity = 0;
576                 } else {
577                         uint64_t vdev_size;
578
579                         /*
580                          * This is a mirror or RAID-Z vdev.  Go through and make
581                          * sure the contents are all the same (files vs. disks),
582                          * keeping track of the number of elements in the
583                          * process.
584                          *
585                          * We also check that the size of each vdev (if it can
586                          * be determined) is the same.
587                          */
588                         rep.zprl_type = type;
589                         rep.zprl_children = 0;
590
591                         if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
592                                 verify(nvlist_lookup_uint64(nv,
593                                     ZPOOL_CONFIG_NPARITY,
594                                     &rep.zprl_parity) == 0);
595                                 assert(rep.zprl_parity != 0);
596                         } else {
597                                 rep.zprl_parity = 0;
598                         }
599
600                         /*
601                          * The 'dontreport' variable indicates that we've
602                          * already reported an error for this spec, so don't
603                          * bother doing it again.
604                          */
605                         type = NULL;
606                         dontreport = 0;
607                         vdev_size = -1ULL;
608                         for (c = 0; c < children; c++) {
609                                 nvlist_t *cnv = child[c];
610                                 char *path;
611                                 struct stat64 statbuf;
612                                 uint64_t size = -1ULL;
613                                 char *childtype;
614                                 int fd, err;
615
616                                 rep.zprl_children++;
617
618                                 verify(nvlist_lookup_string(cnv,
619                                     ZPOOL_CONFIG_TYPE, &childtype) == 0);
620
621                                 /*
622                                  * If this is a replacing or spare vdev, then
623                                  * get the real first child of the vdev.
624                                  */
625                                 if (strcmp(childtype,
626                                     VDEV_TYPE_REPLACING) == 0 ||
627                                     strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
628                                         nvlist_t **rchild;
629                                         uint_t rchildren;
630
631                                         verify(nvlist_lookup_nvlist_array(cnv,
632                                             ZPOOL_CONFIG_CHILDREN, &rchild,
633                                             &rchildren) == 0);
634                                         assert(rchildren == 2);
635                                         cnv = rchild[0];
636
637                                         verify(nvlist_lookup_string(cnv,
638                                             ZPOOL_CONFIG_TYPE,
639                                             &childtype) == 0);
640                                 }
641
642                                 verify(nvlist_lookup_string(cnv,
643                                     ZPOOL_CONFIG_PATH, &path) == 0);
644
645                                 /*
646                                  * If we have a raidz/mirror that combines disks
647                                  * with files, report it as an error.
648                                  */
649                                 if (!dontreport && type != NULL &&
650                                     strcmp(type, childtype) != 0) {
651                                         if (ret != NULL)
652                                                 free(ret);
653                                         ret = NULL;
654                                         if (fatal)
655                                                 vdev_error(gettext(
656                                                     "mismatched replication "
657                                                     "level: %s contains both "
658                                                     "files and devices\n"),
659                                                     rep.zprl_type);
660                                         else
661                                                 return (NULL);
662                                         dontreport = B_TRUE;
663                                 }
664
665                                 /*
666                                  * According to stat(2), the value of 'st_size'
667                                  * is undefined for block devices and character
668                                  * devices.  But there is no effective way to
669                                  * determine the real size in userland.
670                                  *
671                                  * Instead, we'll take advantage of an
672                                  * implementation detail of spec_size().  If the
673                                  * device is currently open, then we (should)
674                                  * return a valid size.
675                                  *
676                                  * If we still don't get a valid size (indicated
677                                  * by a size of 0 or MAXOFFSET_T), then ignore
678                                  * this device altogether.
679                                  */
680                                 if ((fd = open(path, O_RDONLY)) >= 0) {
681                                         err = fstat64(fd, &statbuf);
682                                         (void) close(fd);
683                                 } else {
684                                         err = stat64(path, &statbuf);
685                                 }
686
687                                 if (err != 0 ||
688                                     statbuf.st_size == 0 ||
689                                     statbuf.st_size == MAXOFFSET_T)
690                                         continue;
691
692                                 size = statbuf.st_size;
693
694                                 /*
695                                  * Also make sure that devices and
696                                  * slices have a consistent size.  If
697                                  * they differ by a significant amount
698                                  * (~16MB) then report an error.
699                                  */
700                                 if (!dontreport &&
701                                     (vdev_size != -1ULL &&
702                                     (labs(size - vdev_size) >
703                                     ZPOOL_FUZZ))) {
704                                         if (ret != NULL)
705                                                 free(ret);
706                                         ret = NULL;
707                                         if (fatal)
708                                                 vdev_error(gettext(
709                                                     "%s contains devices of "
710                                                     "different sizes\n"),
711                                                     rep.zprl_type);
712                                         else
713                                                 return (NULL);
714                                         dontreport = B_TRUE;
715                                 }
716
717                                 type = childtype;
718                                 vdev_size = size;
719                         }
720                 }
721
722                 /*
723                  * At this point, we have the replication of the last toplevel
724                  * vdev in 'rep'.  Compare it to 'lastrep' to see if its
725                  * different.
726                  */
727                 if (lastrep.zprl_type != NULL) {
728                         if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
729                                 if (ret != NULL)
730                                         free(ret);
731                                 ret = NULL;
732                                 if (fatal)
733                                         vdev_error(gettext(
734                                             "mismatched replication level: "
735                                             "both %s and %s vdevs are "
736                                             "present\n"),
737                                             lastrep.zprl_type, rep.zprl_type);
738                                 else
739                                         return (NULL);
740                         } else if (lastrep.zprl_parity != rep.zprl_parity) {
741                                 if (ret)
742                                         free(ret);
743                                 ret = NULL;
744                                 if (fatal)
745                                         vdev_error(gettext(
746                                             "mismatched replication level: "
747                                             "both %llu and %llu device parity "
748                                             "%s vdevs are present\n"),
749                                             lastrep.zprl_parity,
750                                             rep.zprl_parity,
751                                             rep.zprl_type);
752                                 else
753                                         return (NULL);
754                         } else if (lastrep.zprl_children != rep.zprl_children) {
755                                 if (ret)
756                                         free(ret);
757                                 ret = NULL;
758                                 if (fatal)
759                                         vdev_error(gettext(
760                                             "mismatched replication level: "
761                                             "both %llu-way and %llu-way %s "
762                                             "vdevs are present\n"),
763                                             lastrep.zprl_children,
764                                             rep.zprl_children,
765                                             rep.zprl_type);
766                                 else
767                                         return (NULL);
768                         }
769                 }
770                 lastrep = rep;
771         }
772
773         if (ret != NULL)
774                 *ret = rep;
775
776         return (ret);
777 }
778
779 /*
780  * Check the replication level of the vdev spec against the current pool.  Calls
781  * get_replication() to make sure the new spec is self-consistent.  If the pool
782  * has a consistent replication level, then we ignore any errors.  Otherwise,
783  * report any difference between the two.
784  */
785 static int
786 check_replication(nvlist_t *config, nvlist_t *newroot)
787 {
788         nvlist_t **child;
789         uint_t  children;
790         replication_level_t *current = NULL, *new;
791         int ret;
792
793         /*
794          * If we have a current pool configuration, check to see if it's
795          * self-consistent.  If not, simply return success.
796          */
797         if (config != NULL) {
798                 nvlist_t *nvroot;
799
800                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
801                     &nvroot) == 0);
802                 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
803                         return (0);
804         }
805         /*
806          * for spares there may be no children, and therefore no
807          * replication level to check
808          */
809         if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
810             &child, &children) != 0) || (children == 0)) {
811                 free(current);
812                 return (0);
813         }
814
815         /*
816          * If all we have is logs then there's no replication level to check.
817          */
818         if (num_logs(newroot) == children) {
819                 free(current);
820                 return (0);
821         }
822
823         /*
824          * Get the replication level of the new vdev spec, reporting any
825          * inconsistencies found.
826          */
827         if ((new = get_replication(newroot, B_TRUE)) == NULL) {
828                 free(current);
829                 return (-1);
830         }
831
832         /*
833          * Check to see if the new vdev spec matches the replication level of
834          * the current pool.
835          */
836         ret = 0;
837         if (current != NULL) {
838                 if (strcmp(current->zprl_type, new->zprl_type) != 0) {
839                         vdev_error(gettext(
840                             "mismatched replication level: pool uses %s "
841                             "and new vdev is %s\n"),
842                             current->zprl_type, new->zprl_type);
843                         ret = -1;
844                 } else if (current->zprl_parity != new->zprl_parity) {
845                         vdev_error(gettext(
846                             "mismatched replication level: pool uses %llu "
847                             "device parity and new vdev uses %llu\n"),
848                             current->zprl_parity, new->zprl_parity);
849                         ret = -1;
850                 } else if (current->zprl_children != new->zprl_children) {
851                         vdev_error(gettext(
852                             "mismatched replication level: pool uses %llu-way "
853                             "%s and new vdev uses %llu-way %s\n"),
854                             current->zprl_children, current->zprl_type,
855                             new->zprl_children, new->zprl_type);
856                         ret = -1;
857                 }
858         }
859
860         free(new);
861         if (current != NULL)
862                 free(current);
863
864         return (ret);
865 }
866
/*
 * Overwrite the first 4096 bytes of 'path' with zeros.
 *
 * The target is opened write-only with O_EXCL, zeroed with a single
 * write(), flushed with fdatasync() and closed.
 *
 * Returns 0 when all 4096 bytes were written.  Returns -1 after printing a
 * message to stderr when the open fails, the write fails, or the write is
 * short.
 */
static int
zero_label(char *path)
{
	/* A true compile-time constant, so 'buf' is not a VLA. */
	enum { ZERO_LABEL_SIZE = 4096 };
	char buf[ZERO_LABEL_SIZE];
	ssize_t written;
	int fd, write_errno;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	(void) memset(buf, 0, sizeof (buf));
	written = write(fd, buf, sizeof (buf));
	/*
	 * Capture errno now: fdatasync() and close() below may overwrite it
	 * before the failure is reported.
	 */
	write_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), (int)ZERO_LABEL_SIZE, path,
		    strerror(write_errno));
		return (-1);
	}

	if (written != ZERO_LABEL_SIZE) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, (int)ZERO_LABEL_SIZE, path);
		return (-1);
	}

	return (0);
}
899
900 /*
901  * Go through and find any whole disks in the vdev specification, labelling them
902  * as appropriate.  When constructing the vdev spec, we were unable to open this
903  * device in order to provide a devid.  Now that we have labelled the disk and
904  * know that slice 0 is valid, we can construct the devid now.
905  *
906  * If the disk was already labeled with an EFI label, we will have gotten the
907  * devid already (because we were able to open the whole disk).  Otherwise, we
908  * need to get the devid after we label the disk.
909  */
910 static int
911 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
912 {
913         nvlist_t **child;
914         uint_t c, children;
915         char *type, *path, *diskname;
916         char devpath[MAXPATHLEN];
917         char udevpath[MAXPATHLEN];
918         uint64_t wholedisk;
919         struct stat64 statbuf;
920         int ret;
921
922         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
923
924         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
925             &child, &children) != 0) {
926
927                 if (strcmp(type, VDEV_TYPE_DISK) != 0)
928                         return (0);
929
930                 /*
931                  * We have a disk device.  If this is a whole disk write
932                  * out the efi partition table, otherwise write zero's to
933                  * the first 4k of the partition.  This is to ensure that
934                  * libblkid will not misidentify the partition due to a
935                  * magic value left by the previous filesystem.
936                  */
937                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
938                 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
939                     &wholedisk));
940
941                 if (!wholedisk) {
942                         ret = zero_label(path);
943                         return (ret);
944                 }
945
946                 if (realpath(path, devpath) == NULL) {
947                         ret = errno;
948                         (void) fprintf(stderr,
949                             gettext("cannot resolve path '%s'\n"), path);
950                         return (ret);
951                 }
952
953                 /*
954                  * Remove any previously existing symlink from a udev path to
955                  * the device before labeling the disk.  This makes
956                  * zpool_label_disk_wait() truly wait for the new link to show
957                  * up instead of returning if it finds an old link still in
958                  * place.  Otherwise there is a window between when udev
959                  * deletes and recreates the link during which access attempts
960                  * will fail with ENOENT.
961                  */
962                 zfs_append_partition(path, udevpath, sizeof (udevpath));
963                 if ((strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) &&
964                     (lstat64(udevpath, &statbuf) == 0) &&
965                     S_ISLNK(statbuf.st_mode))
966                         (void) unlink(udevpath);
967
968                 diskname = strrchr(devpath, '/');
969                 assert(diskname != NULL);
970                 diskname++;
971                 if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
972                         return (-1);
973
974                 /*
975                  * Now we've labeled the disk and the partitions have been
976                  * created.  We still need to wait for udev to create the
977                  * symlinks to those partitions.
978                  */
979                 if ((ret = zpool_label_disk_wait(udevpath, 1000)) != 0) {
980                         (void) fprintf(stderr,
981                             gettext( "cannot resolve path '%s'\n"), udevpath);
982                         return (-1);
983                 }
984
985                 /*
986                  * Update the path to refer to FIRST_SLICE.  The presence of
987                  * the 'whole_disk' field indicates to the CLI that we should
988                  * chop off the slice number when displaying the device in
989                  * future output.
990                  */
991                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
992
993                 /* Just in case this partition already existed. */
994                 (void) zero_label(udevpath);
995
996                 return (0);
997         }
998
999         for (c = 0; c < children; c++)
1000                 if ((ret = make_disks(zhp, child[c])) != 0)
1001                         return (ret);
1002
1003         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1004             &child, &children) == 0)
1005                 for (c = 0; c < children; c++)
1006                         if ((ret = make_disks(zhp, child[c])) != 0)
1007                                 return (ret);
1008
1009         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1010             &child, &children) == 0)
1011                 for (c = 0; c < children; c++)
1012                         if ((ret = make_disks(zhp, child[c])) != 0)
1013                                 return (ret);
1014
1015         return (0);
1016 }
1017
1018 /*
1019  * Determine if the given path is a hot spare within the given configuration.
1020  */
1021 static boolean_t
1022 is_spare(nvlist_t *config, const char *path)
1023 {
1024         int fd;
1025         pool_state_t state;
1026         char *name = NULL;
1027         nvlist_t *label;
1028         uint64_t guid, spareguid;
1029         nvlist_t *nvroot;
1030         nvlist_t **spares;
1031         uint_t i, nspares;
1032         boolean_t inuse;
1033
1034         if ((fd = open(path, O_RDONLY|O_EXCL)) < 0)
1035                 return (B_FALSE);
1036
1037         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1038             !inuse ||
1039             state != POOL_STATE_SPARE ||
1040             zpool_read_label(fd, &label) != 0) {
1041                 free(name);
1042                 (void) close(fd);
1043                 return (B_FALSE);
1044         }
1045         free(name);
1046         (void) close(fd);
1047
1048         verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1049         nvlist_free(label);
1050
1051         verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1052             &nvroot) == 0);
1053         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1054             &spares, &nspares) == 0) {
1055                 for (i = 0; i < nspares; i++) {
1056                         verify(nvlist_lookup_uint64(spares[i],
1057                             ZPOOL_CONFIG_GUID, &spareguid) == 0);
1058                         if (spareguid == guid)
1059                                 return (B_TRUE);
1060                 }
1061         }
1062
1063         return (B_FALSE);
1064 }
1065
1066 /*
1067  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1068  * the majority of this task.
1069  */
1070 static int
1071 check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1072     boolean_t replacing, boolean_t isspare)
1073 {
1074         nvlist_t **child;
1075         uint_t c, children;
1076         char *type, *path;
1077         int ret = 0;
1078         char buf[MAXPATHLEN];
1079         uint64_t wholedisk = B_FALSE;
1080
1081         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1082
1083         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1084             &child, &children) != 0) {
1085
1086                 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1087                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1088                         verify(!nvlist_lookup_uint64(nv,
1089                                ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1090
1091                 /*
1092                  * As a generic check, we look to see if this is a replace of a
1093                  * hot spare within the same pool.  If so, we allow it
1094                  * regardless of what libblkid or zpool_in_use() says.
1095                  */
1096                 if (replacing) {
1097                         if (wholedisk)
1098                                 (void) snprintf(buf, sizeof (buf), "%ss0",
1099                                     path);
1100                         else
1101                                 (void) strlcpy(buf, path, sizeof (buf));
1102
1103                         if (is_spare(config, buf))
1104                                 return (0);
1105                 }
1106
1107                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1108                         ret = check_device(path, force, isspare, wholedisk);
1109
1110                 if (strcmp(type, VDEV_TYPE_FILE) == 0)
1111                         ret = check_file(path, force, isspare);
1112
1113                 return (ret);
1114         }
1115
1116         for (c = 0; c < children; c++)
1117                 if ((ret = check_in_use(config, child[c], force,
1118                     replacing, B_FALSE)) != 0)
1119                         return (ret);
1120
1121         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1122             &child, &children) == 0)
1123                 for (c = 0; c < children; c++)
1124                         if ((ret = check_in_use(config, child[c], force,
1125                             replacing, B_TRUE)) != 0)
1126                                 return (ret);
1127
1128         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1129             &child, &children) == 0)
1130                 for (c = 0; c < children; c++)
1131                         if ((ret = check_in_use(config, child[c], force,
1132                             replacing, B_FALSE)) != 0)
1133                                 return (ret);
1134
1135         return (0);
1136 }
1137
/*
 * Decide whether 'type' is one of the grouping keywords accepted on the
 * command line: "raidz" or "raidz<N>", "mirror", "spare", "log" or "cache".
 *
 *	type	keyword candidate (typically a command line argument)
 *	mindev	if not NULL, receives the minimum number of devices for the
 *		grouping (only written when a grouping is recognized)
 *	maxdev	if not NULL, receives the maximum number of devices: 255 for
 *		raidz, INT_MAX otherwise.  For any string that does not start
 *		with "raidz" this is written even when NULL is returned.
 *
 * Returns the matching VDEV_TYPE_* string, or NULL when 'type' is not a
 * grouping keyword.
 *
 * The parity suffix of "raidz<N>" must be a plain decimal number from 1 to
 * 254 without a leading zero.  strtol() on its own would also accept
 * leading whitespace and an explicit sign, so the first character is
 * required to be a digit between '1' and '9' before it is called.
 */
static const char *
is_grouping(const char *type, int *mindev, int *maxdev)
{
	if (strncmp(type, "raidz", 5) == 0) {
		const char *p = type + 5;
		long nparity;

		if (*p == '\0') {
			/* Bare "raidz" means single parity. */
			nparity = 1;
		} else if (*p < '1' || *p > '9') {
			/* no zero prefixes, signs or whitespace allowed */
			return (NULL);
		} else {
			char *end;

			errno = 0;
			nparity = strtol(p, &end, 10);
			if (errno != 0 || nparity < 1 || nparity >= 255 ||
			    *end != '\0')
				return (NULL);
		}

		if (mindev != NULL)
			*mindev = (int)nparity + 1;
		if (maxdev != NULL)
			*maxdev = 255;
		return (VDEV_TYPE_RAIDZ);
	}

	if (maxdev != NULL)
		*maxdev = INT_MAX;

	if (strcmp(type, "mirror") == 0) {
		if (mindev != NULL)
			*mindev = 2;
		return (VDEV_TYPE_MIRROR);
	}

	if (strcmp(type, "spare") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_SPARE);
	}

	if (strcmp(type, "log") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_LOG);
	}

	if (strcmp(type, "cache") == 0) {
		if (mindev != NULL)
			*mindev = 1;
		return (VDEV_TYPE_L2CACHE);
	}

	return (NULL);
}
1194
1195 /*
1196  * Construct a syntactically valid vdev specification,
1197  * and ensure that all devices and files exist and can be opened.
1198  * Note: we don't bother freeing anything in the error paths
1199  * because the program is just going to exit anyway.
1200  */
1201 nvlist_t *
1202 construct_spec(nvlist_t *props, int argc, char **argv)
1203 {
1204         nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1205         int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1206         const char *type;
1207         uint64_t is_log;
1208         boolean_t seen_logs;
1209
1210         top = NULL;
1211         toplevels = 0;
1212         spares = NULL;
1213         l2cache = NULL;
1214         nspares = 0;
1215         nlogs = 0;
1216         nl2cache = 0;
1217         is_log = B_FALSE;
1218         seen_logs = B_FALSE;
1219
1220         while (argc > 0) {
1221                 nv = NULL;
1222
1223                 /*
1224                  * If it's a mirror or raidz, the subsequent arguments are
1225                  * its leaves -- until we encounter the next mirror or raidz.
1226                  */
1227                 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1228                         nvlist_t **child = NULL;
1229                         int c, children = 0;
1230
1231                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1232                                 if (spares != NULL) {
1233                                         (void) fprintf(stderr,
1234                                             gettext("invalid vdev "
1235                                             "specification: 'spare' can be "
1236                                             "specified only once\n"));
1237                                         return (NULL);
1238                                 }
1239                                 is_log = B_FALSE;
1240                         }
1241
1242                         if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1243                                 if (seen_logs) {
1244                                         (void) fprintf(stderr,
1245                                             gettext("invalid vdev "
1246                                             "specification: 'log' can be "
1247                                             "specified only once\n"));
1248                                         return (NULL);
1249                                 }
1250                                 seen_logs = B_TRUE;
1251                                 is_log = B_TRUE;
1252                                 argc--;
1253                                 argv++;
1254                                 /*
1255                                  * A log is not a real grouping device.
1256                                  * We just set is_log and continue.
1257                                  */
1258                                 continue;
1259                         }
1260
1261                         if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1262                                 if (l2cache != NULL) {
1263                                         (void) fprintf(stderr,
1264                                             gettext("invalid vdev "
1265                                             "specification: 'cache' can be "
1266                                             "specified only once\n"));
1267                                         return (NULL);
1268                                 }
1269                                 is_log = B_FALSE;
1270                         }
1271
1272                         if (is_log) {
1273                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1274                                         (void) fprintf(stderr,
1275                                             gettext("invalid vdev "
1276                                             "specification: unsupported 'log' "
1277                                             "device: %s\n"), type);
1278                                         return (NULL);
1279                                 }
1280                                 nlogs++;
1281                         }
1282
1283                         for (c = 1; c < argc; c++) {
1284                                 if (is_grouping(argv[c], NULL, NULL) != NULL)
1285                                         break;
1286                                 children++;
1287                                 child = realloc(child,
1288                                     children * sizeof (nvlist_t *));
1289                                 if (child == NULL)
1290                                         zpool_no_memory();
1291                                 if ((nv = make_leaf_vdev(props, argv[c], B_FALSE))
1292                                     == NULL)
1293                                         return (NULL);
1294                                 child[children - 1] = nv;
1295                         }
1296
1297                         if (children < mindev) {
1298                                 (void) fprintf(stderr, gettext("invalid vdev "
1299                                     "specification: %s requires at least %d "
1300                                     "devices\n"), argv[0], mindev);
1301                                 return (NULL);
1302                         }
1303
1304                         if (children > maxdev) {
1305                                 (void) fprintf(stderr, gettext("invalid vdev "
1306                                     "specification: %s supports no more than "
1307                                     "%d devices\n"), argv[0], maxdev);
1308                                 return (NULL);
1309                         }
1310
1311                         argc -= c;
1312                         argv += c;
1313
1314                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1315                                 spares = child;
1316                                 nspares = children;
1317                                 continue;
1318                         } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1319                                 l2cache = child;
1320                                 nl2cache = children;
1321                                 continue;
1322                         } else {
1323                                 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1324                                     0) == 0);
1325                                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1326                                     type) == 0);
1327                                 verify(nvlist_add_uint64(nv,
1328                                     ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1329                                 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1330                                         verify(nvlist_add_uint64(nv,
1331                                             ZPOOL_CONFIG_NPARITY,
1332                                             mindev - 1) == 0);
1333                                 }
1334                                 verify(nvlist_add_nvlist_array(nv,
1335                                     ZPOOL_CONFIG_CHILDREN, child,
1336                                     children) == 0);
1337
1338                                 for (c = 0; c < children; c++)
1339                                         nvlist_free(child[c]);
1340                                 free(child);
1341                         }
1342                 } else {
1343                         /*
1344                          * We have a device.  Pass off to make_leaf_vdev() to
1345                          * construct the appropriate nvlist describing the vdev.
1346                          */
1347                         if ((nv = make_leaf_vdev(props, argv[0], is_log)) == NULL)
1348                                 return (NULL);
1349                         if (is_log)
1350                                 nlogs++;
1351                         argc--;
1352                         argv++;
1353                 }
1354
1355                 toplevels++;
1356                 top = realloc(top, toplevels * sizeof (nvlist_t *));
1357                 if (top == NULL)
1358                         zpool_no_memory();
1359                 top[toplevels - 1] = nv;
1360         }
1361
1362         if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1363                 (void) fprintf(stderr, gettext("invalid vdev "
1364                     "specification: at least one toplevel vdev must be "
1365                     "specified\n"));
1366                 return (NULL);
1367         }
1368
1369         if (seen_logs && nlogs == 0) {
1370                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1371                     "log requires at least 1 device\n"));
1372                 return (NULL);
1373         }
1374
1375         /*
1376          * Finally, create nvroot and add all top-level vdevs to it.
1377          */
1378         verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1379         verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1380             VDEV_TYPE_ROOT) == 0);
1381         verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1382             top, toplevels) == 0);
1383         if (nspares != 0)
1384                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1385                     spares, nspares) == 0);
1386         if (nl2cache != 0)
1387                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1388                     l2cache, nl2cache) == 0);
1389
1390         for (t = 0; t < toplevels; t++)
1391                 nvlist_free(top[t]);
1392         for (t = 0; t < nspares; t++)
1393                 nvlist_free(spares[t]);
1394         for (t = 0; t < nl2cache; t++)
1395                 nvlist_free(l2cache[t]);
1396         if (spares)
1397                 free(spares);
1398         if (l2cache)
1399                 free(l2cache);
1400         free(top);
1401
1402         return (nvroot);
1403 }
1404
1405 nvlist_t *
1406 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1407     splitflags_t flags, int argc, char **argv)
1408 {
1409         nvlist_t *newroot = NULL, **child;
1410         uint_t c, children;
1411
1412         if (argc > 0) {
1413                 if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1414                         (void) fprintf(stderr, gettext("Unable to build a "
1415                             "pool from the specified devices\n"));
1416                         return (NULL);
1417                 }
1418
1419                 if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1420                         nvlist_free(newroot);
1421                         return (NULL);
1422                 }
1423
1424                 /* avoid any tricks in the spec */
1425                 verify(nvlist_lookup_nvlist_array(newroot,
1426                     ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1427                 for (c = 0; c < children; c++) {
1428                         char *path;
1429                         const char *type;
1430                         int min, max;
1431
1432                         verify(nvlist_lookup_string(child[c],
1433                             ZPOOL_CONFIG_PATH, &path) == 0);
1434                         if ((type = is_grouping(path, &min, &max)) != NULL) {
1435                                 (void) fprintf(stderr, gettext("Cannot use "
1436                                     "'%s' as a device for splitting\n"), type);
1437                                 nvlist_free(newroot);
1438                                 return (NULL);
1439                         }
1440                 }
1441         }
1442
1443         if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1444                 if (newroot != NULL)
1445                         nvlist_free(newroot);
1446                 return (NULL);
1447         }
1448
1449         return (newroot);
1450 }
1451
1452 /*
1453  * Get and validate the contents of the given vdev specification.  This ensures
1454  * that the nvlist returned is well-formed, that all the devices exist, and that
1455  * they are not currently in use by any other known consumer.  The 'poolconfig'
1456  * parameter is the current configuration of the pool when adding devices
1457  * existing pool, and is used to perform additional checks, such as changing the
1458  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1459  * new pool.  The 'force' flag controls whether devices should be forcefully
1460  * added, even if they appear in use.
1461  */
1462 nvlist_t *
1463 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1464     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1465 {
1466         nvlist_t *newroot;
1467         nvlist_t *poolconfig = NULL;
1468         is_force = force;
1469
1470         /*
1471          * Construct the vdev specification.  If this is successful, we know
1472          * that we have a valid specification, and that all devices can be
1473          * opened.
1474          */
1475         if ((newroot = construct_spec(props, argc, argv)) == NULL)
1476                 return (NULL);
1477
1478         if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1479                 return (NULL);
1480
1481         /*
1482          * Validate each device to make sure that its not shared with another
1483          * subsystem.  We do this even if 'force' is set, because there are some
1484          * uses (such as a dedicated dump device) that even '-f' cannot
1485          * override.
1486          */
1487         if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
1488                 nvlist_free(newroot);
1489                 return (NULL);
1490         }
1491
1492         /*
1493          * Check the replication level of the given vdevs and report any errors
1494          * found.  We include the existing pool spec, if any, as we need to
1495          * catch changes against the existing replication level.
1496          */
1497         if (check_rep && check_replication(poolconfig, newroot) != 0) {
1498                 nvlist_free(newroot);
1499                 return (NULL);
1500         }
1501
1502         /*
1503          * Run through the vdev specification and label any whole disks found.
1504          */
1505         if (!dryrun && make_disks(zhp, newroot) != 0) {
1506                 nvlist_free(newroot);
1507                 return (NULL);
1508         }
1509
1510         return (newroot);
1511 }