Initial Linux ZFS git repo: zfs/lib/libzpool/spa.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26
27 #pragma ident   "@(#)spa.c      1.51    08/04/09 SMI"
28
29 /*
30  * This file contains all the routines used when modifying on-disk SPA state.
31  * This includes opening, importing, destroying, exporting a pool, and syncing a
32  * pool.
33  */
34
35 #include <sys/zfs_context.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/spa_impl.h>
38 #include <sys/zio.h>
39 #include <sys/zio_checksum.h>
40 #include <sys/zio_compress.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/zap.h>
44 #include <sys/zil.h>
45 #include <sys/vdev_impl.h>
46 #include <sys/metaslab.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/sunddi.h>
63 #include <sys/spa_boot.h>
64
65 #include "zfs_prop.h"
66 #include "zfs_comutil.h"
67
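/*
 * Number of threads in each of the per-zio-type issue and interrupt
 * taskqs created by spa_activate() below.
 */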
68 int zio_taskq_threads = 8;
69
70 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
71
72 /*
73  * ==========================================================================
74  * SPA properties routines
75  * ==========================================================================
76  */
77
78 /*
79  * Add a (source=src, propname=propval) list to an nvlist.
80  */
81 static void
82 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
83     uint64_t intval, zprop_source_t src)
84 {
85         const char *propname = zpool_prop_to_name(prop);
86         nvlist_t *propval;
87
88         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
89         VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
90
91         if (strval != NULL)
92                 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
93         else
94                 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
95
96         VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
97         nvlist_free(propval);
98 }
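
/*
 * Illustrative note (the values below are made up): the nvlist built by the
 * calls to spa_prop_add_list() ends up shaped like
 *
 *	"size"   -> { ZPROP_SOURCE = ZPROP_SRC_NONE,  ZPROP_VALUE = 68719476736 }
 *	"bootfs" -> { ZPROP_SOURCE = ZPROP_SRC_LOCAL, ZPROP_VALUE = "tank/root" }
 *
 * i.e. each pool property name maps to a nested nvlist holding its source
 * and its string or integer value.
 */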
99
100 /*
101  * Get property values from the spa configuration.
102  */
103 static void
104 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
105 {
106         uint64_t size = spa_get_space(spa);
107         uint64_t used = spa_get_alloc(spa);
108         uint64_t cap, version;
109         zprop_source_t src = ZPROP_SRC_NONE;
110         char *cachefile;
111         size_t len;
112
113         /*
114          * readonly properties
115          */
116         spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src);
117         spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
118         spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
119         spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);
120
121         cap = (size == 0) ? 0 : (used * 100 / size);
122         spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
123
124         spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
125         spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
126             spa->spa_root_vdev->vdev_state, src);
127
128         /*
129          * settable properties that are not stored in the pool property object.
130          */
131         version = spa_version(spa);
132         if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
133                 src = ZPROP_SRC_DEFAULT;
134         else
135                 src = ZPROP_SRC_LOCAL;
136         spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
137
138         if (spa->spa_root != NULL)
139                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
140                     0, ZPROP_SRC_LOCAL);
141
142         if (spa->spa_config_dir != NULL) {
143                 if (strcmp(spa->spa_config_dir, "none") == 0) {
144                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
145                             spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
146                 } else {
147                         len = strlen(spa->spa_config_dir) +
148                             strlen(spa->spa_config_file) + 2;
149                         cachefile = kmem_alloc(len, KM_SLEEP);
150                         (void) snprintf(cachefile, len, "%s/%s",
151                             spa->spa_config_dir, spa->spa_config_file);
152                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
153                             cachefile, 0, ZPROP_SRC_LOCAL);
154                         kmem_free(cachefile, len);
155                 }
156         }
157 }
158
159 /*
160  * Get zpool property values.
161  */
162 int
163 spa_prop_get(spa_t *spa, nvlist_t **nvp)
164 {
165         zap_cursor_t zc;
166         zap_attribute_t za;
167         objset_t *mos = spa->spa_meta_objset;
168         int err;
169
170         VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
171
172         /*
173          * Get properties from the spa config.
174          */
175         spa_prop_get_config(spa, nvp);
176
177         mutex_enter(&spa->spa_props_lock);
178         /* If no pool property object, no more prop to get. */
179         if (spa->spa_pool_props_object == 0) {
180                 mutex_exit(&spa->spa_props_lock);
181                 return (0);
182         }
183
184         /*
185          * Get properties from the MOS pool property object.
186          */
187         for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
188             (err = zap_cursor_retrieve(&zc, &za)) == 0;
189             zap_cursor_advance(&zc)) {
190                 uint64_t intval = 0;
191                 char *strval = NULL;
192                 zprop_source_t src = ZPROP_SRC_DEFAULT;
193                 zpool_prop_t prop;
194
195                 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
196                         continue;
197
198                 switch (za.za_integer_length) {
199                 case 8:
200                         /* integer property */
201                         if (za.za_first_integer !=
202                             zpool_prop_default_numeric(prop))
203                                 src = ZPROP_SRC_LOCAL;
204
205                         if (prop == ZPOOL_PROP_BOOTFS) {
206                                 dsl_pool_t *dp;
207                                 dsl_dataset_t *ds = NULL;
208
209                                 dp = spa_get_dsl(spa);
210                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
211                                 if ((err = dsl_dataset_open_obj(dp,
212                                     za.za_first_integer, NULL, DS_MODE_NONE,
213                                     FTAG, &ds)) != 0) {
214                                         rw_exit(&dp->dp_config_rwlock);
215                                         break;
216                                 }
217
218                                 strval = kmem_alloc(
219                                     MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
220                                     KM_SLEEP);
221                                 dsl_dataset_name(ds, strval);
222                                 dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
223                                 rw_exit(&dp->dp_config_rwlock);
224                         } else {
225                                 strval = NULL;
226                                 intval = za.za_first_integer;
227                         }
228
229                         spa_prop_add_list(*nvp, prop, strval, intval, src);
230
231                         if (strval != NULL)
232                                 kmem_free(strval,
233                                     MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
234
235                         break;
236
237                 case 1:
238                         /* string property */
239                         strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
240                         err = zap_lookup(mos, spa->spa_pool_props_object,
241                             za.za_name, 1, za.za_num_integers, strval);
242                         if (err) {
243                                 kmem_free(strval, za.za_num_integers);
244                                 break;
245                         }
246                         spa_prop_add_list(*nvp, prop, strval, 0, src);
247                         kmem_free(strval, za.za_num_integers);
248                         break;
249
250                 default:
251                         break;
252                 }
253         }
254         zap_cursor_fini(&zc);
255         mutex_exit(&spa->spa_props_lock);
256 out:
257         if (err && err != ENOENT) {
258                 nvlist_free(*nvp);
259                 *nvp = NULL;
260                 return (err);
261         }
262
263         return (0);
264 }
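
/*
 * Hedged usage sketch (not called from this file; variable names are
 * examples only): a consumer of spa_prop_get() unpacks one entry of the
 * returned list roughly as follows.
 *
 *	nvlist_t *nvl, *propval;
 *	uint64_t size, src;
 *
 *	VERIFY(spa_prop_get(spa, &nvl) == 0);
 *	VERIFY(nvlist_lookup_nvlist(nvl,
 *	    zpool_prop_to_name(ZPOOL_PROP_SIZE), &propval) == 0);
 *	VERIFY(nvlist_lookup_uint64(propval, ZPROP_VALUE, &size) == 0);
 *	VERIFY(nvlist_lookup_uint64(propval, ZPROP_SOURCE, &src) == 0);
 *	nvlist_free(nvl);
 */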
265
266 /*
267  * Validate the given pool properties nvlist and modify the list
268  * for the property values to be set.
269  */
270 static int
271 spa_prop_validate(spa_t *spa, nvlist_t *props)
272 {
273         nvpair_t *elem;
274         int error = 0, reset_bootfs = 0;
275         uint64_t objnum;
276
277         elem = NULL;
278         while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
279                 zpool_prop_t prop;
280                 char *propname, *strval;
281                 uint64_t intval;
282                 vdev_t *rvdev;
283                 char *vdev_type;
284                 objset_t *os;
285                 char *slash;
286
287                 propname = nvpair_name(elem);
288
289                 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
290                         return (EINVAL);
291
292                 switch (prop) {
293                 case ZPOOL_PROP_VERSION:
294                         error = nvpair_value_uint64(elem, &intval);
295                         if (!error &&
296                             (intval < spa_version(spa) || intval > SPA_VERSION))
297                                 error = EINVAL;
298                         break;
299
300                 case ZPOOL_PROP_DELEGATION:
301                 case ZPOOL_PROP_AUTOREPLACE:
302                         error = nvpair_value_uint64(elem, &intval);
303                         if (!error && intval > 1)
304                                 error = EINVAL;
305                         break;
306
307                 case ZPOOL_PROP_BOOTFS:
308                         if (spa_version(spa) < SPA_VERSION_BOOTFS) {
309                                 error = ENOTSUP;
310                                 break;
311                         }
312
313                         /*
314                          * A bootable filesystem cannot be on a RAIDZ pool
315                          * or on a striped pool with more than one device.
316                          */
317                         rvdev = spa->spa_root_vdev;
318                         vdev_type =
319                             rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
320                         if (rvdev->vdev_children > 1 ||
321                             strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
322                             strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
323                                 error = ENOTSUP;
324                                 break;
325                         }
326
327                         reset_bootfs = 1;
328
329                         error = nvpair_value_string(elem, &strval);
330
331                         if (!error) {
332                                 if (strval == NULL || strval[0] == '\0') {
333                                         objnum = zpool_prop_default_numeric(
334                                             ZPOOL_PROP_BOOTFS);
335                                         break;
336                                 }
337
338                                 if ((error = dmu_objset_open(strval, DMU_OST_ZFS,
339                                     DS_MODE_STANDARD | DS_MODE_READONLY, &os)) != 0)
340                                         break;
341                                 objnum = dmu_objset_id(os);
342                                 dmu_objset_close(os);
343                         }
344                         break;
345                 case ZPOOL_PROP_FAILUREMODE:
346                         error = nvpair_value_uint64(elem, &intval);
347                         if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
348                             intval > ZIO_FAILURE_MODE_PANIC))
349                                 error = EINVAL;
350
351                         /*
352                          * This is a special case which only occurs when
353                          * the pool has completely failed. This allows
354                          * the user to change the in-core failmode property
355                          * without syncing it out to disk (I/Os might
356                          * currently be blocked). We do this by returning
357                          * EIO to the caller (spa_prop_set) to trick it
358                          * into thinking we encountered a property validation
359                          * error.
360                          */
361                         if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
362                                 spa->spa_failmode = intval;
363                                 error = EIO;
364                         }
365                         break;
366
367                 case ZPOOL_PROP_CACHEFILE:
368                         if ((error = nvpair_value_string(elem, &strval)) != 0)
369                                 break;
370
371                         if (strval[0] == '\0')
372                                 break;
373
374                         if (strcmp(strval, "none") == 0)
375                                 break;
376
377                         if (strval[0] != '/') {
378                                 error = EINVAL;
379                                 break;
380                         }
381
382                         slash = strrchr(strval, '/');
383                         ASSERT(slash != NULL);
384
385                         if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
386                             strcmp(slash, "/..") == 0)
387                                 error = EINVAL;
388                         break;
389                 }
390
391                 if (error)
392                         break;
393         }
394
395         if (!error && reset_bootfs) {
396                 error = nvlist_remove(props,
397                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
398
399                 if (!error) {
400                         error = nvlist_add_uint64(props,
401                             zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
402                 }
403         }
404
405         return (error);
406 }
407
408 int
409 spa_prop_set(spa_t *spa, nvlist_t *nvp)
410 {
411         int error;
412
413         if ((error = spa_prop_validate(spa, nvp)) != 0)
414                 return (error);
415
416         return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
417             spa, nvp, 3));
418 }
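
/*
 * Illustrative in-kernel sketch (the property and value chosen here are
 * examples only): building a props nvlist and handing it to spa_prop_set()
 * would look roughly like
 *
 *	nvlist_t *props;
 *
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
 *	    ZIO_FAILURE_MODE_CONTINUE) == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 *
 * In practice the nvlist arrives already packed from userland ('zpool set'),
 * and spa_prop_validate() above decides whether it is acceptable.
 */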
419
420 /*
421  * If the bootfs property value is dsobj, clear it.
422  */
423 void
424 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
425 {
426         if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
427                 VERIFY(zap_remove(spa->spa_meta_objset,
428                     spa->spa_pool_props_object,
429                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
430                 spa->spa_bootfs = 0;
431         }
432 }
433
434 /*
435  * ==========================================================================
436  * SPA state manipulation (open/create/destroy/import/export)
437  * ==========================================================================
438  */
439
440 static int
441 spa_error_entry_compare(const void *a, const void *b)
442 {
443         spa_error_entry_t *sa = (spa_error_entry_t *)a;
444         spa_error_entry_t *sb = (spa_error_entry_t *)b;
445         int ret;
446
447         ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
448             sizeof (zbookmark_t));
449
450         if (ret < 0)
451                 return (-1);
452         else if (ret > 0)
453                 return (1);
454         else
455                 return (0);
456 }
457
458 /*
459  * Utility function which retrieves copies of the current logs and
460  * re-initializes them in the process.
461  */
462 void
463 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
464 {
465         ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
466
467         bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
468         bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
469
470         avl_create(&spa->spa_errlist_scrub,
471             spa_error_entry_compare, sizeof (spa_error_entry_t),
472             offsetof(spa_error_entry_t, se_avl));
473         avl_create(&spa->spa_errlist_last,
474             spa_error_entry_compare, sizeof (spa_error_entry_t),
475             offsetof(spa_error_entry_t, se_avl));
476 }
477
478 /*
479  * Activate an uninitialized pool.
480  */
481 static void
482 spa_activate(spa_t *spa)
483 {
484         int t;
485
486         ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
487
488         spa->spa_state = POOL_STATE_ACTIVE;
489
490         spa->spa_normal_class = metaslab_class_create();
491         spa->spa_log_class = metaslab_class_create();
492
493         for (t = 0; t < ZIO_TYPES; t++) {
494                 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
495                     zio_taskq_threads, maxclsyspri, 50, INT_MAX,
496                     TASKQ_PREPOPULATE);
497                 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
498                     zio_taskq_threads, maxclsyspri, 50, INT_MAX,
499                     TASKQ_PREPOPULATE);
500         }
501
502         list_create(&spa->spa_dirty_list, sizeof (vdev_t),
503             offsetof(vdev_t, vdev_dirty_node));
504         list_create(&spa->spa_zio_list, sizeof (zio_t),
505             offsetof(zio_t, zio_link_node));
506
507         txg_list_create(&spa->spa_vdev_txg_list,
508             offsetof(struct vdev, vdev_txg_node));
509
510         avl_create(&spa->spa_errlist_scrub,
511             spa_error_entry_compare, sizeof (spa_error_entry_t),
512             offsetof(spa_error_entry_t, se_avl));
513         avl_create(&spa->spa_errlist_last,
514             spa_error_entry_compare, sizeof (spa_error_entry_t),
515             offsetof(spa_error_entry_t, se_avl));
516 }
517
518 /*
519  * Opposite of spa_activate().
520  */
521 static void
522 spa_deactivate(spa_t *spa)
523 {
524         int t;
525
526         ASSERT(spa->spa_sync_on == B_FALSE);
527         ASSERT(spa->spa_dsl_pool == NULL);
528         ASSERT(spa->spa_root_vdev == NULL);
529
530         ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
531
532         txg_list_destroy(&spa->spa_vdev_txg_list);
533
534         list_destroy(&spa->spa_dirty_list);
535         list_destroy(&spa->spa_zio_list);
536
537         for (t = 0; t < ZIO_TYPES; t++) {
538                 taskq_destroy(spa->spa_zio_issue_taskq[t]);
539                 taskq_destroy(spa->spa_zio_intr_taskq[t]);
540                 spa->spa_zio_issue_taskq[t] = NULL;
541                 spa->spa_zio_intr_taskq[t] = NULL;
542         }
543
544         metaslab_class_destroy(spa->spa_normal_class);
545         spa->spa_normal_class = NULL;
546
547         metaslab_class_destroy(spa->spa_log_class);
548         spa->spa_log_class = NULL;
549
550         /*
551          * If this was part of an import or the open otherwise failed, we may
552          * still have errors left in the queues.  Empty them just in case.
553          */
554         spa_errlog_drain(spa);
555
556         avl_destroy(&spa->spa_errlist_scrub);
557         avl_destroy(&spa->spa_errlist_last);
558
559         spa->spa_state = POOL_STATE_UNINITIALIZED;
560 }
561
562 /*
563  * Verify a pool configuration, and construct the vdev tree appropriately.  This
564  * will create all the necessary vdevs in the appropriate layout, with each vdev
565  * in the CLOSED state.  This will prep the pool before open/creation/import.
566  * All vdev validation is done by the vdev_alloc() routine.
567  */
568 static int
569 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
570     uint_t id, int atype)
571 {
572         nvlist_t **child;
573         uint_t c, children;
574         int error;
575
576         if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
577                 return (error);
578
579         if ((*vdp)->vdev_ops->vdev_op_leaf)
580                 return (0);
581
582         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
583             &child, &children) != 0) {
584                 vdev_free(*vdp);
585                 *vdp = NULL;
586                 return (EINVAL);
587         }
588
589         for (c = 0; c < children; c++) {
590                 vdev_t *vd;
591                 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
592                     atype)) != 0) {
593                         vdev_free(*vdp);
594                         *vdp = NULL;
595                         return (error);
596                 }
597         }
598
599         ASSERT(*vdp != NULL);
600
601         return (0);
602 }
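
/*
 * Illustrative shape of the config nvlist this routine walks, for a
 * two-way mirror (device paths are examples only):
 *
 *	type = "root"
 *	children[0]:
 *		type = "mirror"
 *		children[0]: type = "disk", path = "/dev/dsk/c0t0d0s0"
 *		children[1]: type = "disk", path = "/dev/dsk/c0t1d0s0"
 *
 * Interior nodes recurse back into spa_config_parse(); the recursion stops
 * at leaf vdevs (vdev_op_leaf), which carry no ZPOOL_CONFIG_CHILDREN array.
 */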
603
604 /*
605  * Opposite of spa_load().
606  */
607 static void
608 spa_unload(spa_t *spa)
609 {
610         int i;
611
612         /*
613          * Stop async tasks.
614          */
615         spa_async_suspend(spa);
616
617         /*
618          * Stop syncing.
619          */
620         if (spa->spa_sync_on) {
621                 txg_sync_stop(spa->spa_dsl_pool);
622                 spa->spa_sync_on = B_FALSE;
623         }
624
625         /*
626          * Wait for any outstanding prefetch I/O to complete.
627          */
628         spa_config_enter(spa, RW_WRITER, FTAG);
629         spa_config_exit(spa, FTAG);
630
631         /*
632          * Drop and purge level 2 cache
633          */
634         spa_l2cache_drop(spa);
635
636         /*
637          * Close the dsl pool.
638          */
639         if (spa->spa_dsl_pool) {
640                 dsl_pool_close(spa->spa_dsl_pool);
641                 spa->spa_dsl_pool = NULL;
642         }
643
644         /*
645          * Close all vdevs.
646          */
647         if (spa->spa_root_vdev)
648                 vdev_free(spa->spa_root_vdev);
649         ASSERT(spa->spa_root_vdev == NULL);
650
651         for (i = 0; i < spa->spa_spares.sav_count; i++)
652                 vdev_free(spa->spa_spares.sav_vdevs[i]);
653         if (spa->spa_spares.sav_vdevs) {
654                 kmem_free(spa->spa_spares.sav_vdevs,
655                     spa->spa_spares.sav_count * sizeof (void *));
656                 spa->spa_spares.sav_vdevs = NULL;
657         }
658         if (spa->spa_spares.sav_config) {
659                 nvlist_free(spa->spa_spares.sav_config);
660                 spa->spa_spares.sav_config = NULL;
661         }
662
663         for (i = 0; i < spa->spa_l2cache.sav_count; i++)
664                 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
665         if (spa->spa_l2cache.sav_vdevs) {
666                 kmem_free(spa->spa_l2cache.sav_vdevs,
667                     spa->spa_l2cache.sav_count * sizeof (void *));
668                 spa->spa_l2cache.sav_vdevs = NULL;
669         }
670         if (spa->spa_l2cache.sav_config) {
671                 nvlist_free(spa->spa_l2cache.sav_config);
672                 spa->spa_l2cache.sav_config = NULL;
673         }
674
675         spa->spa_async_suspended = 0;
676 }
677
678 /*
679  * Load (or re-load) the current list of vdevs describing the active spares for
680  * this pool.  When this is called, we have some form of basic information in
681  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
682  * then re-generate a more complete list including status information.
683  */
684 static void
685 spa_load_spares(spa_t *spa)
686 {
687         nvlist_t **spares;
688         uint_t nspares;
689         int i;
690         vdev_t *vd, *tvd;
691
692         /*
693          * First, close and free any existing spare vdevs.
694          */
695         for (i = 0; i < spa->spa_spares.sav_count; i++) {
696                 vd = spa->spa_spares.sav_vdevs[i];
697
698                 /* Undo the call to spa_activate() below */
699                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
700                     tvd->vdev_isspare)
701                         spa_spare_remove(tvd);
702                 vdev_close(vd);
703                 vdev_free(vd);
704         }
705
706         if (spa->spa_spares.sav_vdevs)
707                 kmem_free(spa->spa_spares.sav_vdevs,
708                     spa->spa_spares.sav_count * sizeof (void *));
709
710         if (spa->spa_spares.sav_config == NULL)
711                 nspares = 0;
712         else
713                 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
714                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
715
716         spa->spa_spares.sav_count = (int)nspares;
717         spa->spa_spares.sav_vdevs = NULL;
718
719         if (nspares == 0)
720                 return;
721
722         /*
723          * Construct the array of vdevs, opening them to get status in the
724          * process.  For each spare, there are potentially two different vdev_t
725          * structures associated with it: one in the list of spares (used only
726          * for basic validation purposes) and one in the active vdev
727          * configuration (if it's spared in).  During this phase we open and
728          * validate each vdev on the spare list.  If the vdev also exists in the
729          * active configuration, then we also mark this vdev as an active spare.
730          */
731         spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
732             KM_SLEEP);
733         for (i = 0; i < spa->spa_spares.sav_count; i++) {
734                 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
735                     VDEV_ALLOC_SPARE) == 0);
736                 ASSERT(vd != NULL);
737
738                 spa->spa_spares.sav_vdevs[i] = vd;
739
740                 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
741                         if (!tvd->vdev_isspare)
742                                 spa_spare_add(tvd);
743
744                         /*
745                          * We only mark the spare active if we were successfully
746                          * able to load the vdev.  Otherwise, importing a pool
747                          * with a bad active spare would result in strange
748                          * behavior, because multiple pools would think the spare
749                          * is actively in use.
750                          *
751                          * There is a vulnerability here to an equally bizarre
752                          * circumstance, where a dead active spare is later
753                          * brought back to life (onlined or otherwise).  Given
754                          * the rarity of this scenario, and the extra complexity
755                          * it adds, we ignore the possibility.
756                          */
757                         if (!vdev_is_dead(tvd))
758                                 spa_spare_activate(tvd);
759                 }
760
761                 if (vdev_open(vd) != 0)
762                         continue;
763
764                 vd->vdev_top = vd;
765                 if (vdev_validate_aux(vd) == 0)
766                         spa_spare_add(vd);
767         }
768
769         /*
770          * Recompute the stashed list of spares, with status information
771          * this time.
772          */
773         VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
774             DATA_TYPE_NVLIST_ARRAY) == 0);
775
776         spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
777             KM_SLEEP);
778         for (i = 0; i < spa->spa_spares.sav_count; i++)
779                 spares[i] = vdev_config_generate(spa,
780                     spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
781         VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
782             ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
783         for (i = 0; i < spa->spa_spares.sav_count; i++)
784                 nvlist_free(spares[i]);
785         kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
786 }
787
788 /*
789  * Load (or re-load) the current list of vdevs describing the active l2cache for
790  * this pool.  When this is called, we have some form of basic information in
791  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
792  * then re-generate a more complete list including status information.
793  * Devices which are already active have their details maintained, and are
794  * not re-opened.
795  */
796 static void
797 spa_load_l2cache(spa_t *spa)
798 {
799         nvlist_t **l2cache;
800         uint_t nl2cache;
801         int i, j, oldnvdevs;
802         uint64_t guid;
803         vdev_t *vd, **oldvdevs, **newvdevs;
804         spa_aux_vdev_t *sav = &spa->spa_l2cache;
805
806         if (sav->sav_config != NULL) {
807                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
808                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
809                 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
810         } else {
811                 nl2cache = 0;
812         }
813
814         oldvdevs = sav->sav_vdevs;
815         oldnvdevs = sav->sav_count;
816         sav->sav_vdevs = NULL;
817         sav->sav_count = 0;
818
819         /*
820          * Process new nvlist of vdevs.
821          */
822         for (i = 0; i < nl2cache; i++) {
823                 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
824                     &guid) == 0);
825
826                 newvdevs[i] = NULL;
827                 for (j = 0; j < oldnvdevs; j++) {
828                         vd = oldvdevs[j];
829                         if (vd != NULL && guid == vd->vdev_guid) {
830                                 /*
831                                  * Retain previous vdev for add/remove ops.
832                                  */
833                                 newvdevs[i] = vd;
834                                 oldvdevs[j] = NULL;
835                                 break;
836                         }
837                 }
838
839                 if (newvdevs[i] == NULL) {
840                         /*
841                          * Create new vdev
842                          */
843                         VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
844                             VDEV_ALLOC_L2CACHE) == 0);
845                         ASSERT(vd != NULL);
846                         newvdevs[i] = vd;
847
848                         /*
849                          * Commit this vdev as an l2cache device,
850                          * even if it fails to open.
851                          */
852                         spa_l2cache_add(vd);
853
854                         if (vdev_open(vd) != 0)
855                                 continue;
856
857                         vd->vdev_top = vd;
858                         (void) vdev_validate_aux(vd);
859
860                         if (!vdev_is_dead(vd)) {
861                                 uint64_t size;
862                                 size = vdev_get_rsize(vd);
863                                 ASSERT3U(size, >, 0);
864                                 if (spa_mode & FWRITE) {
865                                         l2arc_add_vdev(spa, vd,
866                                             VDEV_LABEL_START_SIZE,
867                                             size - VDEV_LABEL_START_SIZE);
868                                 }
869                                 spa_l2cache_activate(vd);
870                         }
871                 }
872         }
873
874         /*
875          * Purge vdevs that were dropped
876          */
877         for (i = 0; i < oldnvdevs; i++) {
878                 uint64_t pool;
879
880                 vd = oldvdevs[i];
881                 if (vd != NULL) {
882                         if (spa_mode & FWRITE &&
883                             spa_l2cache_exists(vd->vdev_guid, &pool) &&
884                             pool != 0ULL) {
885                                 l2arc_remove_vdev(vd);
886                         }
887                         (void) vdev_close(vd);
888                         spa_l2cache_remove(vd);
889                 }
890         }
891
892         if (oldvdevs)
893                 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
894
895         if (sav->sav_config == NULL)
896                 goto out;
897
898         sav->sav_vdevs = newvdevs;
899         sav->sav_count = (int)nl2cache;
900
901         /*
902          * Recompute the stashed list of l2cache devices, with status
903          * information this time.
904          */
905         VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
906             DATA_TYPE_NVLIST_ARRAY) == 0);
907
908         l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
909         for (i = 0; i < sav->sav_count; i++)
910                 l2cache[i] = vdev_config_generate(spa,
911                     sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
912         VERIFY(nvlist_add_nvlist_array(sav->sav_config,
913             ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
914 out:
915         for (i = 0; i < sav->sav_count; i++)
916                 nvlist_free(l2cache[i]);
917         if (sav->sav_count)
918                 kmem_free(l2cache, sav->sav_count * sizeof (void *));
919 }
920
921 static int
922 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
923 {
924         dmu_buf_t *db;
925         char *packed = NULL;
926         size_t nvsize = 0;
927         int error;
928         *value = NULL;
929
930         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
931         nvsize = *(uint64_t *)db->db_data;
932         dmu_buf_rele(db, FTAG);
933
934         packed = kmem_alloc(nvsize, KM_SLEEP);
935         error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
936         if (error == 0)
937                 error = nvlist_unpack(packed, nvsize, value, 0);
938         kmem_free(packed, nvsize);
939
940         return (error);
941 }
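
/*
 * Layout assumed above (the sizes are an example only): the object's bonus
 * buffer holds a single uint64_t giving the packed size, and the object
 * data holds the packed nvlist itself, e.g.
 *
 *	bonus:	nvsize = 1432
 *	data:	1432 bytes of packed nvlist
 *
 * The writers of these objects keep the two in step whenever the nvlist is
 * re-packed during sync.
 */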
942
943 /*
944  * Checks to see if the given vdev could not be opened, in which case we post a
945  * sysevent to notify the autoreplace code that the device has been removed.
946  */
947 static void
948 spa_check_removed(vdev_t *vd)
949 {
950         int c;
951
952         for (c = 0; c < vd->vdev_children; c++)
953                 spa_check_removed(vd->vdev_child[c]);
954
955         if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
956                 zfs_post_autoreplace(vd->vdev_spa, vd);
957                 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
958         }
959 }
960
961 /*
962  * Load an existing storage pool, using the pool's builtin spa_config as a
963  * source of configuration information.
964  */
965 static int
966 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
967 {
968         int error = 0;
969         nvlist_t *nvroot = NULL;
970         vdev_t *rvd;
971         uberblock_t *ub = &spa->spa_uberblock;
972         uint64_t config_cache_txg = spa->spa_config_txg;
973         uint64_t pool_guid;
974         uint64_t version;
975         zio_t *zio;
976         uint64_t autoreplace = 0;
977
978         spa->spa_load_state = state;
979
980         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
981             nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
982                 error = EINVAL;
983                 goto out;
984         }
985
986         /*
987          * Versioning wasn't explicitly added to the label until later, so if
988          * it's not present treat it as the initial version.
989          */
990         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
991                 version = SPA_VERSION_INITIAL;
992
993         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
994             &spa->spa_config_txg);
995
996         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
997             spa_guid_exists(pool_guid, 0)) {
998                 error = EEXIST;
999                 goto out;
1000         }
1001
1002         spa->spa_load_guid = pool_guid;
1003
1004         /*
1005          * Parse the configuration into a vdev tree.  We explicitly set the
1006          * value that will be returned by spa_version() since parsing the
1007          * configuration requires knowing the version number.
1008          */
1009         spa_config_enter(spa, RW_WRITER, FTAG);
1010         spa->spa_ubsync.ub_version = version;
1011         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1012         spa_config_exit(spa, FTAG);
1013
1014         if (error != 0)
1015                 goto out;
1016
1017         ASSERT(spa->spa_root_vdev == rvd);
1018         ASSERT(spa_guid(spa) == pool_guid);
1019
1020         /*
1021          * Try to open all vdevs, loading each label in the process.
1022          */
1023         error = vdev_open(rvd);
1024         if (error != 0)
1025                 goto out;
1026
1027         /*
1028          * Validate the labels for all leaf vdevs.  We need to grab the config
1029          * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
1030          * flag.
1031          */
1032         spa_config_enter(spa, RW_READER, FTAG);
1033         error = vdev_validate(rvd);
1034         spa_config_exit(spa, FTAG);
1035
1036         if (error != 0)
1037                 goto out;
1038
1039         if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1040                 error = ENXIO;
1041                 goto out;
1042         }
1043
1044         /*
1045          * Find the best uberblock.
1046          */
1047         bzero(ub, sizeof (uberblock_t));
1048
1049         zio = zio_root(spa, NULL, NULL,
1050             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1051         vdev_uberblock_load(zio, rvd, ub);
1052         error = zio_wait(zio);
1053
1054         /*
1055          * If we weren't able to find a single valid uberblock, return failure.
1056          */
1057         if (ub->ub_txg == 0) {
1058                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1059                     VDEV_AUX_CORRUPT_DATA);
1060                 error = ENXIO;
1061                 goto out;
1062         }
1063
1064         /*
1065          * If the pool is newer than the code, we can't open it.
1066          */
1067         if (ub->ub_version > SPA_VERSION) {
1068                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1069                     VDEV_AUX_VERSION_NEWER);
1070                 error = ENOTSUP;
1071                 goto out;
1072         }
1073
1074         /*
1075          * If the vdev guid sum doesn't match the uberblock, we have an
1076          * incomplete configuration.
1077          */
1078         if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1079                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1080                     VDEV_AUX_BAD_GUID_SUM);
1081                 error = ENXIO;
1082                 goto out;
1083         }
1084
1085         /*
1086          * Initialize internal SPA structures.
1087          */
1088         spa->spa_state = POOL_STATE_ACTIVE;
1089         spa->spa_ubsync = spa->spa_uberblock;
1090         spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1091         error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1092         if (error) {
1093                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1094                     VDEV_AUX_CORRUPT_DATA);
1095                 goto out;
1096         }
1097         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1098
1099         if (zap_lookup(spa->spa_meta_objset,
1100             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1101             sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1102                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1103                     VDEV_AUX_CORRUPT_DATA);
1104                 error = EIO;
1105                 goto out;
1106         }
1107
1108         if (!mosconfig) {
1109                 nvlist_t *newconfig;
1110                 uint64_t hostid;
1111
1112                 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1113                         vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1114                             VDEV_AUX_CORRUPT_DATA);
1115                         error = EIO;
1116                         goto out;
1117                 }
1118
1119                 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
1120                     &hostid) == 0) {
1121                         char *hostname;
1122                         unsigned long myhostid = 0;
1123
1124                         VERIFY(nvlist_lookup_string(newconfig,
1125                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1126
1127                         (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1128                         if (hostid != 0 && myhostid != 0 &&
1129                             (unsigned long)hostid != myhostid) {
1130                                 cmn_err(CE_WARN, "pool '%s' could not be "
1131                                     "loaded as it was last accessed by "
1132                                     "another system (host: %s hostid: 0x%lx).  "
1133                                     "See: http://www.sun.com/msg/ZFS-8000-EY",
1134                                     spa->spa_name, hostname,
1135                                     (unsigned long)hostid);
1136                                 error = EBADF;
1137                                 goto out;
1138                         }
1139                 }
1140
1141                 spa_config_set(spa, newconfig);
1142                 spa_unload(spa);
1143                 spa_deactivate(spa);
1144                 spa_activate(spa);
1145
1146                 return (spa_load(spa, newconfig, state, B_TRUE));
1147         }
1148
1149         if (zap_lookup(spa->spa_meta_objset,
1150             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1151             sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1152                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1153                     VDEV_AUX_CORRUPT_DATA);
1154                 error = EIO;
1155                 goto out;
1156         }
1157
1158         /*
1159          * Load the bit that tells us to use the new accounting function
1160          * (raid-z deflation).  If we have an older pool, this will not
1161          * be present.
1162          */
1163         error = zap_lookup(spa->spa_meta_objset,
1164             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1165             sizeof (uint64_t), 1, &spa->spa_deflate);
1166         if (error != 0 && error != ENOENT) {
1167                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1168                     VDEV_AUX_CORRUPT_DATA);
1169                 error = EIO;
1170                 goto out;
1171         }
1172
1173         /*
1174          * Load the persistent error log.  If we have an older pool, this will
1175          * not be present.
1176          */
1177         error = zap_lookup(spa->spa_meta_objset,
1178             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1179             sizeof (uint64_t), 1, &spa->spa_errlog_last);
1180         if (error != 0 && error != ENOENT) {
1181                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1182                     VDEV_AUX_CORRUPT_DATA);
1183                 error = EIO;
1184                 goto out;
1185         }
1186
1187         error = zap_lookup(spa->spa_meta_objset,
1188             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1189             sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1190         if (error != 0 && error != ENOENT) {
1191                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1192                     VDEV_AUX_CORRUPT_DATA);
1193                 error = EIO;
1194                 goto out;
1195         }
1196
1197         /*
1198          * Load the history object.  If we have an older pool, this
1199          * will not be present.
1200          */
1201         error = zap_lookup(spa->spa_meta_objset,
1202             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1203             sizeof (uint64_t), 1, &spa->spa_history);
1204         if (error != 0 && error != ENOENT) {
1205                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1206                     VDEV_AUX_CORRUPT_DATA);
1207                 error = EIO;
1208                 goto out;
1209         }
1210
1211         /*
1212          * Load any hot spares for this pool.
1213          */
1214         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1215             DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1216         if (error != 0 && error != ENOENT) {
1217                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1218                     VDEV_AUX_CORRUPT_DATA);
1219                 error = EIO;
1220                 goto out;
1221         }
1222         if (error == 0) {
1223                 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1224                 if (load_nvlist(spa, spa->spa_spares.sav_object,
1225                     &spa->spa_spares.sav_config) != 0) {
1226                         vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1227                             VDEV_AUX_CORRUPT_DATA);
1228                         error = EIO;
1229                         goto out;
1230                 }
1231
1232                 spa_config_enter(spa, RW_WRITER, FTAG);
1233                 spa_load_spares(spa);
1234                 spa_config_exit(spa, FTAG);
1235         }
1236
1237         /*
1238          * Load any level 2 ARC devices for this pool.
1239          */
1240         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1241             DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1242             &spa->spa_l2cache.sav_object);
1243         if (error != 0 && error != ENOENT) {
1244                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1245                     VDEV_AUX_CORRUPT_DATA);
1246                 error = EIO;
1247                 goto out;
1248         }
1249         if (error == 0) {
1250                 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1251                 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1252                     &spa->spa_l2cache.sav_config) != 0) {
1253                         vdev_set_state(rvd, B_TRUE,
1254                             VDEV_STATE_CANT_OPEN,
1255                             VDEV_AUX_CORRUPT_DATA);
1256                         error = EIO;
1257                         goto out;
1258                 }
1259
1260                 spa_config_enter(spa, RW_WRITER, FTAG);
1261                 spa_load_l2cache(spa);
1262                 spa_config_exit(spa, FTAG);
1263         }
1264
1265         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1266
1267         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1268             DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1269
1270         if (error && error != ENOENT) {
1271                 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1272                     VDEV_AUX_CORRUPT_DATA);
1273                 error = EIO;
1274                 goto out;
1275         }
1276
1277         if (error == 0) {
1278                 (void) zap_lookup(spa->spa_meta_objset,
1279                     spa->spa_pool_props_object,
1280                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1281                     sizeof (uint64_t), 1, &spa->spa_bootfs);
1282                 (void) zap_lookup(spa->spa_meta_objset,
1283                     spa->spa_pool_props_object,
1284                     zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1285                     sizeof (uint64_t), 1, &autoreplace);
1286                 (void) zap_lookup(spa->spa_meta_objset,
1287                     spa->spa_pool_props_object,
1288                     zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1289                     sizeof (uint64_t), 1, &spa->spa_delegation);
1290                 (void) zap_lookup(spa->spa_meta_objset,
1291                     spa->spa_pool_props_object,
1292                     zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1293                     sizeof (uint64_t), 1, &spa->spa_failmode);
1294         }
1295
1296         /*
1297          * If the 'autoreplace' property is set, then post a resource notifying
1298          * the ZFS DE that it should not issue any faults for unopenable
1299          * devices.  We also iterate over the vdevs, and post a sysevent for any
1300          * unopenable vdevs so that the normal autoreplace handler can take
1301          * over.
1302          */
1303         if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1304                 spa_check_removed(spa->spa_root_vdev);
1305
1306         /*
1307          * Load the vdev state for all toplevel vdevs.
1308          */
1309         vdev_load(rvd);
1310
1311         /*
1312          * Propagate the leaf DTLs we just loaded all the way up the tree.
1313          */
1314         spa_config_enter(spa, RW_WRITER, FTAG);
1315         vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1316         spa_config_exit(spa, FTAG);
1317
1318         /*
1319          * Check the state of the root vdev.  If it can't be opened, it
1320          * indicates one or more toplevel vdevs are faulted.
1321          */
1322         if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1323                 error = ENXIO;
1324                 goto out;
1325         }
1326
1327         if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
1328                 dmu_tx_t *tx;
1329                 int need_update = B_FALSE;
1330                 int c;
1331
1332                 /*
1333                  * Claim log blocks that haven't been committed yet.
1334                  * This must all happen in a single txg.
1335                  */
1336                 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1337                     spa_first_txg(spa));
1338                 (void) dmu_objset_find(spa->spa_name,
1339                     zil_claim, tx, DS_FIND_CHILDREN);
1340                 dmu_tx_commit(tx);
1341
1342                 spa->spa_sync_on = B_TRUE;
1343                 txg_sync_start(spa->spa_dsl_pool);
1344
1345                 /*
1346                  * Wait for all claims to sync.
1347                  */
1348                 txg_wait_synced(spa->spa_dsl_pool, 0);
1349
1350                 /*
1351                  * If the config cache is stale, or we have uninitialized
1352                  * metaslabs (see spa_vdev_add()), then update the config.
1353                  */
1354                 if (config_cache_txg != spa->spa_config_txg ||
1355                     state == SPA_LOAD_IMPORT)
1356                         need_update = B_TRUE;
1357
1358                 for (c = 0; c < rvd->vdev_children; c++)
1359                         if (rvd->vdev_child[c]->vdev_ms_array == 0)
1360                                 need_update = B_TRUE;
1361
1362                 /*
1363                  * Update the config cache asynchronously in case we're the
1364                  * root pool, in which case the config cache isn't writable yet.
1365                  */
1366                 if (need_update)
1367                         spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1368         }
1369
1370         error = 0;
1371 out:
1372         if (error && error != EBADF)
1373                 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
1374         spa->spa_load_state = SPA_LOAD_NONE;
1375         spa->spa_ena = 0;
1376
1377         return (error);
1378 }
1379
1380 /*
1381  * Pool Open/Import
1382  *
1383  * The import case is identical to an open except that the configuration is sent
1384  * down from userland, instead of grabbed from the configuration cache.  For the
1385  * case of an open, the pool configuration will exist in the
1386  * POOL_STATE_UNINITIALIZED state.
1387  *
1388  * The stats information (gen/count/ustats) is used to gather vdev statistics at
1389  * the same time we open the pool, without having to keep the spa_t around in some
1390  * ambiguous state.
1391  */
1392 static int
1393 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1394 {
1395         spa_t *spa;
1396         int error;
1397         int loaded = B_FALSE;
1398         int locked = B_FALSE;
1399
1400         *spapp = NULL;
1401
1402         /*
1403          * As disgusting as this is, we need to support recursive calls to this
1404          * function because dsl_dir_open() is called during spa_load(), and ends
1405          * up calling spa_open() again.  The real fix is to figure out how to
1406          * avoid dsl_dir_open() calling this in the first place.
1407          */
1408         if (mutex_owner(&spa_namespace_lock) != curthread) {
1409                 mutex_enter(&spa_namespace_lock);
1410                 locked = B_TRUE;
1411         }
1412
1413         if ((spa = spa_lookup(pool)) == NULL) {
1414                 if (locked)
1415                         mutex_exit(&spa_namespace_lock);
1416                 return (ENOENT);
1417         }
1418         if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1419
1420                 spa_activate(spa);
1421
1422                 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1423
1424                 if (error == EBADF) {
1425                         /*
1426                          * If vdev_validate() returns failure (indicated by
1427                          * EBADF), it means that one of the vdevs reports
1428                          * that the pool has been exported or destroyed.  If
1429                          * this is the case, the config cache is out of sync and
1430                          * we should remove the pool from the namespace.
1431                          */
1432                         zfs_post_ok(spa, NULL);
1433                         spa_unload(spa);
1434                         spa_deactivate(spa);
1435                         spa_remove(spa);
1436                         spa_config_sync();
1437                         if (locked)
1438                                 mutex_exit(&spa_namespace_lock);
1439                         return (ENOENT);
1440                 }
1441
1442                 if (error) {
1443                         /*
1444                          * We can't open the pool, but we still have useful
1445                          * information: the state of each vdev after the
1446                          * attempted vdev_open().  Return this to the user.
1447                          */
1448                         if (config != NULL && spa->spa_root_vdev != NULL) {
1449                                 spa_config_enter(spa, RW_READER, FTAG);
1450                                 *config = spa_config_generate(spa, NULL, -1ULL,
1451                                     B_TRUE);
1452                                 spa_config_exit(spa, FTAG);
1453                         }
1454                         spa_unload(spa);
1455                         spa_deactivate(spa);
1456                         spa->spa_last_open_failed = B_TRUE;
1457                         if (locked)
1458                                 mutex_exit(&spa_namespace_lock);
1459                         *spapp = NULL;
1460                         return (error);
1461                 } else {
1462                         zfs_post_ok(spa, NULL);
1463                         spa->spa_last_open_failed = B_FALSE;
1464                 }
1465
1466                 loaded = B_TRUE;
1467         }
1468
1469         spa_open_ref(spa, tag);
1470
1471         /*
1472          * If we just loaded the pool, resilver anything that's out of date.
1473          */
1474         if (loaded && (spa_mode & FWRITE))
1475                 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1476
1477         if (locked)
1478                 mutex_exit(&spa_namespace_lock);
1479
1480         *spapp = spa;
1481
1482         if (config != NULL) {
1483                 spa_config_enter(spa, RW_READER, FTAG);
1484                 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1485                 spa_config_exit(spa, FTAG);
1486         }
1487
1488         return (0);
1489 }
1490
1491 int
1492 spa_open(const char *name, spa_t **spapp, void *tag)
1493 {
1494         return (spa_open_common(name, spapp, tag, NULL));
1495 }
1496
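/*
 * Illustrative usage sketch (not part of the original code; the pool name
 * below is hypothetical).  Consumers pair spa_open() with spa_close() on the
 * same tag:
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... operate on the pool while holding the open reference ...
 *		spa_close(spa, FTAG);
 *	}
 */
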
1497 /*
1498  * Look up the given spa_t, incrementing the inject count in the process,
1499  * preventing it from being exported or destroyed.
1500  */
1501 spa_t *
1502 spa_inject_addref(char *name)
1503 {
1504         spa_t *spa;
1505
1506         mutex_enter(&spa_namespace_lock);
1507         if ((spa = spa_lookup(name)) == NULL) {
1508                 mutex_exit(&spa_namespace_lock);
1509                 return (NULL);
1510         }
1511         spa->spa_inject_ref++;
1512         mutex_exit(&spa_namespace_lock);
1513
1514         return (spa);
1515 }
1516
1517 void
1518 spa_inject_delref(spa_t *spa)
1519 {
1520         mutex_enter(&spa_namespace_lock);
1521         spa->spa_inject_ref--;
1522         mutex_exit(&spa_namespace_lock);
1523 }
1524
1525 /*
1526  * Add hot spare device information to the config nvlist.
1527  */
1528 static void
1529 spa_add_spares(spa_t *spa, nvlist_t *config)
1530 {
1531         nvlist_t **spares;
1532         uint_t i, nspares;
1533         nvlist_t *nvroot;
1534         uint64_t guid;
1535         vdev_stat_t *vs;
1536         uint_t vsc;
1537         uint64_t pool;
1538
1539         if (spa->spa_spares.sav_count == 0)
1540                 return;
1541
1542         VERIFY(nvlist_lookup_nvlist(config,
1543             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1544         VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1545             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1546         if (nspares != 0) {
1547                 VERIFY(nvlist_add_nvlist_array(nvroot,
1548                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1549                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
1550                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
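                /*
                 * The lookup above refetches the copy of the spares array that
                 * nvlist_add_nvlist_array() just placed into the config, so
                 * the status updates below land in the nvlist returned to the
                 * caller rather than in sav_config itself.
                 */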
1551
1552                 /*
1553                  * Go through and find any spares which have since been
1554                  * repurposed as active spares, and update their status
1555                  * accordingly.
1556                  */
1557                 for (i = 0; i < nspares; i++) {
1558                         VERIFY(nvlist_lookup_uint64(spares[i],
1559                             ZPOOL_CONFIG_GUID, &guid) == 0);
1560                         if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
1561                                 VERIFY(nvlist_lookup_uint64_array(
1562                                     spares[i], ZPOOL_CONFIG_STATS,
1563                                     (uint64_t **)&vs, &vsc) == 0);
1564                                 vs->vs_state = VDEV_STATE_CANT_OPEN;
1565                                 vs->vs_aux = VDEV_AUX_SPARED;
1566                         }
1567                 }
1568         }
1569 }
1570
1571 /*
1572  * Add l2cache device information to the nvlist, including vdev stats.
1573  */
1574 static void
1575 spa_add_l2cache(spa_t *spa, nvlist_t *config)
1576 {
1577         nvlist_t **l2cache;
1578         uint_t i, j, nl2cache;
1579         nvlist_t *nvroot;
1580         uint64_t guid;
1581         vdev_t *vd;
1582         vdev_stat_t *vs;
1583         uint_t vsc;
1584
1585         if (spa->spa_l2cache.sav_count == 0)
1586                 return;
1587
1588         spa_config_enter(spa, RW_READER, FTAG);
1589
1590         VERIFY(nvlist_lookup_nvlist(config,
1591             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1592         VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1593             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1594         if (nl2cache != 0) {
1595                 VERIFY(nvlist_add_nvlist_array(nvroot,
1596                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1597                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
1598                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1599
1600                 /*
1601                  * Update level 2 cache device stats.
1602                  */
1603
1604                 for (i = 0; i < nl2cache; i++) {
1605                         VERIFY(nvlist_lookup_uint64(l2cache[i],
1606                             ZPOOL_CONFIG_GUID, &guid) == 0);
1607
1608                         vd = NULL;
1609                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1610                                 if (guid ==
1611                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1612                                         vd = spa->spa_l2cache.sav_vdevs[j];
1613                                         break;
1614                                 }
1615                         }
1616                         ASSERT(vd != NULL);
1617
1618                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1619                             ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1620                         vdev_get_stats(vd, vs);
1621                 }
1622         }
1623
1624         spa_config_exit(spa, FTAG);
1625 }
1626
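/*
 * Open the named pool and return its config nvlist, augmented with the
 * persistent error count and the spare and l2cache device information.  If
 * 'altroot' is non-NULL, also copy out the pool's alternate root, even for
 * pools that fail to open.
 */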
1627 int
1628 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1629 {
1630         int error;
1631         spa_t *spa;
1632
1633         *config = NULL;
1634         error = spa_open_common(name, &spa, FTAG, config);
1635
1636         if (spa && *config != NULL) {
1637                 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1638                     spa_get_errlog_size(spa)) == 0);
1639
1640                 spa_add_spares(spa, *config);
1641                 spa_add_l2cache(spa, *config);
1642         }
1643
1644         /*
1645          * We want to get the alternate root even for faulted pools, so we cheat
1646          * and call spa_lookup() directly.
1647          */
1648         if (altroot) {
1649                 if (spa == NULL) {
1650                         mutex_enter(&spa_namespace_lock);
1651                         spa = spa_lookup(name);
1652                         if (spa)
1653                                 spa_altroot(spa, altroot, buflen);
1654                         else
1655                                 altroot[0] = '\0';
1656                         spa = NULL;
1657                         mutex_exit(&spa_namespace_lock);
1658                 } else {
1659                         spa_altroot(spa, altroot, buflen);
1660                 }
1661         }
1662
1663         if (spa != NULL)
1664                 spa_close(spa, FTAG);
1665
1666         return (error);
1667 }
1668
1669 /*
1670  * Validate that the auxiliary device array is well formed.  We must have an
1671  * array of nvlists, each of which describes a valid leaf vdev.  If this is an
1672  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1673  * specified, as long as they are well-formed.
1674  */
1675 static int
1676 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1677     spa_aux_vdev_t *sav, const char *config, uint64_t version,
1678     vdev_labeltype_t label)
1679 {
1680         nvlist_t **dev;
1681         uint_t i, ndev;
1682         vdev_t *vd;
1683         int error;
1684
1685         /*
1686          * It's acceptable to have no devs specified.
1687          */
1688         if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1689                 return (0);
1690
1691         if (ndev == 0)
1692                 return (EINVAL);
1693
1694         /*
1695          * Make sure the pool is formatted with a version that supports this
1696          * device type.
1697          */
1698         if (spa_version(spa) < version)
1699                 return (ENOTSUP);
1700
1701         /*
1702          * Set the pending device list so we correctly handle device in-use
1703          * checking.
1704          */
1705         sav->sav_pending = dev;
1706         sav->sav_npending = ndev;
1707
1708         for (i = 0; i < ndev; i++) {
1709                 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1710                     mode)) != 0)
1711                         goto out;
1712
1713                 if (!vd->vdev_ops->vdev_op_leaf) {
1714                         vdev_free(vd);
1715                         error = EINVAL;
1716                         goto out;
1717                 }
1718
1719                 /*
1720                  * The L2ARC currently only supports disk devices.
1721                  */
1722                 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1723                     strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1724                         error = ENOTBLK;
1725                         goto out;
1726                 }
1727
1728                 vd->vdev_top = vd;
1729
1730                 if ((error = vdev_open(vd)) == 0 &&
1731                     (error = vdev_label_init(vd, crtxg, label)) == 0) {
1732                         VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1733                             vd->vdev_guid) == 0);
1734                 }
1735
1736                 vdev_free(vd);
1737
1738                 if (error &&
1739                     (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1740                         goto out;
1741                 else
1742                         error = 0;
1743         }
1744
1745 out:
1746         sav->sav_pending = NULL;
1747         sav->sav_npending = 0;
1748         return (error);
1749 }
1750
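/*
 * Validate both the spare and l2cache device lists in nvroot, initializing
 * labels for any newly added devices along the way.
 */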
1751 static int
1752 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1753 {
1754         int error;
1755
1756         if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1757             &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1758             VDEV_LABEL_SPARE)) != 0) {
1759                 return (error);
1760         }
1761
1762         return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1763             &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1764             VDEV_LABEL_L2CACHE));
1765 }
1766
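/*
 * Replace the aux device list named by 'config' in sav_config with the union
 * of the devices already present and the new devices in 'devs'.
 */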
1767 static void
1768 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1769     const char *config)
1770 {
1771         int i;
1772
1773         if (sav->sav_config != NULL) {
1774                 nvlist_t **olddevs;
1775                 uint_t oldndevs;
1776                 nvlist_t **newdevs;
1777
1778                 /*
1779                  * Generate the new dev list by concatenating with the
1780                  * current dev list.
1781                  */
1782                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
1783                     &olddevs, &oldndevs) == 0);
1784
1785                 newdevs = kmem_alloc(sizeof (void *) *
1786                     (ndevs + oldndevs), KM_SLEEP);
1787                 for (i = 0; i < oldndevs; i++)
1788                         VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
1789                             KM_SLEEP) == 0);
1790                 for (i = 0; i < ndevs; i++)
1791                         VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
1792                             KM_SLEEP) == 0);
1793
1794                 VERIFY(nvlist_remove(sav->sav_config, config,
1795                     DATA_TYPE_NVLIST_ARRAY) == 0);
1796
1797                 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1798                     config, newdevs, ndevs + oldndevs) == 0);
1799                 for (i = 0; i < oldndevs + ndevs; i++)
1800                         nvlist_free(newdevs[i]);
1801                 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
1802         } else {
1803                 /*
1804                  * Generate a new dev list.
1805                  */
1806                 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
1807                     KM_SLEEP) == 0);
1808                 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
1809                     devs, ndevs) == 0);
1810         }
1811 }
1812
1813 /*
1814  * Stop and drop level 2 ARC devices.
1815  */
1816 void
1817 spa_l2cache_drop(spa_t *spa)
1818 {
1819         vdev_t *vd;
1820         int i;
1821         spa_aux_vdev_t *sav = &spa->spa_l2cache;
1822
1823         for (i = 0; i < sav->sav_count; i++) {
1824                 uint64_t pool;
1825
1826                 vd = sav->sav_vdevs[i];
1827                 ASSERT(vd != NULL);
1828
1829                 if (spa_mode & FWRITE &&
1830                     spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
1831                         l2arc_remove_vdev(vd);
1832                 }
1833                 if (vd->vdev_isl2cache)
1834                         spa_l2cache_remove(vd);
1835                 vdev_clear_stats(vd);
1836                 (void) vdev_close(vd);
1837         }
1838 }
1839
1840 /*
1841  * Pool Creation
1842  */
1843 int
1844 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
1845     const char *history_str)
1846 {
1847         spa_t *spa;
1848         char *altroot = NULL;
1849         vdev_t *rvd;
1850         dsl_pool_t *dp;
1851         dmu_tx_t *tx;
1852         int c, error = 0;
1853         uint64_t txg = TXG_INITIAL;
1854         nvlist_t **spares, **l2cache;
1855         uint_t nspares, nl2cache;
1856         uint64_t version;
1857
1858         /*
1859          * If this pool already exists, return failure.
1860          */
1861         mutex_enter(&spa_namespace_lock);
1862         if (spa_lookup(pool) != NULL) {
1863                 mutex_exit(&spa_namespace_lock);
1864                 return (EEXIST);
1865         }
1866
1867         /*
1868          * Allocate a new spa_t structure.
1869          */
1870         (void) nvlist_lookup_string(props,
1871             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
1872         spa = spa_add(pool, altroot);
1873         spa_activate(spa);
1874
1875         spa->spa_uberblock.ub_txg = txg - 1;
1876
1877         if (props && (error = spa_prop_validate(spa, props))) {
1878                 spa_unload(spa);
1879                 spa_deactivate(spa);
1880                 spa_remove(spa);
1881                 return (error);
1882         }
1883
1884         if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
1885             &version) != 0)
1886                 version = SPA_VERSION;
1887         ASSERT(version <= SPA_VERSION);
1888         spa->spa_uberblock.ub_version = version;
1889         spa->spa_ubsync = spa->spa_uberblock;
1890
1891         /*
1892          * Create the root vdev.
1893          */
1894         spa_config_enter(spa, RW_WRITER, FTAG);
1895
1896         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1897
1898         ASSERT(error != 0 || rvd != NULL);
1899         ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1900
1901         if (error == 0 && !zfs_allocatable_devs(nvroot))
1902                 error = EINVAL;
1903
1904         if (error == 0 &&
1905             (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1906             (error = spa_validate_aux(spa, nvroot, txg,
1907             VDEV_ALLOC_ADD)) == 0) {
1908                 for (c = 0; c < rvd->vdev_children; c++)
1909                         vdev_init(rvd->vdev_child[c], txg);
1910                 vdev_config_dirty(rvd);
1911         }
1912
1913         spa_config_exit(spa, FTAG);
1914
1915         if (error != 0) {
1916                 spa_unload(spa);
1917                 spa_deactivate(spa);
1918                 spa_remove(spa);
1919                 mutex_exit(&spa_namespace_lock);
1920                 return (error);
1921         }
1922
1923         /*
1924          * Get the list of spares, if specified.
1925          */
1926         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1927             &spares, &nspares) == 0) {
1928                 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
1929                     KM_SLEEP) == 0);
1930                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1931                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1932                 spa_config_enter(spa, RW_WRITER, FTAG);
1933                 spa_load_spares(spa);
1934                 spa_config_exit(spa, FTAG);
1935                 spa->spa_spares.sav_sync = B_TRUE;
1936         }
1937
1938         /*
1939          * Get the list of level 2 cache devices, if specified.
1940          */
1941         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1942             &l2cache, &nl2cache) == 0) {
1943                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
1944                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
1945                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
1946                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1947                 spa_config_enter(spa, RW_WRITER, FTAG);
1948                 spa_load_l2cache(spa);
1949                 spa_config_exit(spa, FTAG);
1950                 spa->spa_l2cache.sav_sync = B_TRUE;
1951         }
1952
1953         spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1954         spa->spa_meta_objset = dp->dp_meta_objset;
1955
1956         tx = dmu_tx_create_assigned(dp, txg);
1957
1958         /*
1959          * Create the pool config object.
1960          */
1961         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1962             DMU_OT_PACKED_NVLIST, 1 << 14,
1963             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1964
1965         if (zap_add(spa->spa_meta_objset,
1966             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1967             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
1968                 cmn_err(CE_PANIC, "failed to add pool config");
1969         }
1970
1971         /* Newly created pools with the right version are always deflated. */
1972         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
1973                 spa->spa_deflate = TRUE;
1974                 if (zap_add(spa->spa_meta_objset,
1975                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1976                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
1977                         cmn_err(CE_PANIC, "failed to add deflate");
1978                 }
1979         }
1980
1981         /*
1982          * Create the deferred-free bplist object.  Turn off compression
1983          * because sync-to-convergence takes longer if the blocksize
1984          * keeps changing.
1985          */
1986         spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
1987             1 << 14, tx);
1988         dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
1989             ZIO_COMPRESS_OFF, tx);
1990
1991         if (zap_add(spa->spa_meta_objset,
1992             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1993             sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
1994                 cmn_err(CE_PANIC, "failed to add bplist");
1995         }
1996
1997         /*
1998          * Create the pool's history object.
1999          */
2000         if (version >= SPA_VERSION_ZPOOL_HISTORY)
2001                 spa_history_create_obj(spa, tx);
2002
2003         /*
2004          * Set pool properties.
2005          */
2006         spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2007         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2008         spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2009         if (props)
2010                 spa_sync_props(spa, props, CRED(), tx);
2011
2012         dmu_tx_commit(tx);
2013
2014         spa->spa_sync_on = B_TRUE;
2015         txg_sync_start(spa->spa_dsl_pool);
2016
2017         /*
2018          * We explicitly wait for the first transaction to complete so that our
2019          * bean counters are appropriately updated.
2020          */
2021         txg_wait_synced(spa->spa_dsl_pool, txg);
2022
2023         spa_config_sync();
2024
2025         if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2026                 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2027
2028         mutex_exit(&spa_namespace_lock);
2029
2030         return (0);
2031 }
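
/*
 * Illustrative sketch (an assumption drawn from the ZPOOL_CONFIG_* keys used
 * above, not an authoritative reference) of the nvroot layout spa_create()
 * expects:
 *
 *	nvroot:
 *		ZPOOL_CONFIG_TYPE     = VDEV_TYPE_ROOT
 *		ZPOOL_CONFIG_CHILDREN = array of vdev nvlists, e.g.
 *		    { ZPOOL_CONFIG_TYPE = VDEV_TYPE_DISK,
 *		      ZPOOL_CONFIG_PATH = "/dev/dsk/..." }
 *		ZPOOL_CONFIG_SPARES   = optional array of leaf vdev nvlists
 *		ZPOOL_CONFIG_L2CACHE  = optional array of leaf vdev nvlists
 */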
2032
2033 /*
2034  * Import the given pool into the system.  We set up the necessary spa_t and
2035  * then call spa_load() to do the dirty work.
2036  */
2037 static int
2038 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
2039     boolean_t isroot)
2040 {
2041         spa_t *spa;
2042         char *altroot = NULL;
2043         int error;
2044         nvlist_t *nvroot;
2045         nvlist_t **spares, **l2cache;
2046         uint_t nspares, nl2cache;
2047         int mosconfig = isroot ? B_FALSE : B_TRUE;
2048
2049         /*
2050          * If a pool with this name exists, return failure.
2051          */
2052         mutex_enter(&spa_namespace_lock);
2053         if (spa_lookup(pool) != NULL) {
2054                 mutex_exit(&spa_namespace_lock);
2055                 return (EEXIST);
2056         }
2057
2058         /*
2059          * Create and initialize the spa structure.
2060          */
2061         (void) nvlist_lookup_string(props,
2062             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2063         spa = spa_add(pool, altroot);
2064         spa_activate(spa);
2065
2066         /*
2067          * Pass off the heavy lifting to spa_load().
2068          * Pass TRUE for mosconfig because the user-supplied config
2069          * Pass mosconfig (B_TRUE except for a root pool import) because the
2070          * user-supplied config is the one to trust when doing an import.
2071         error = spa_load(spa, config, SPA_LOAD_IMPORT, mosconfig);
2072
2073         spa_config_enter(spa, RW_WRITER, FTAG);
2074         /*
2075          * Toss any existing sparelist, as it doesn't have any validity anymore,
2076          * and conflicts with spa_has_spare().
2077          */
2078         if (!isroot && spa->spa_spares.sav_config) {
2079                 nvlist_free(spa->spa_spares.sav_config);
2080                 spa->spa_spares.sav_config = NULL;
2081                 spa_load_spares(spa);
2082         }
2083         if (!isroot && spa->spa_l2cache.sav_config) {
2084                 nvlist_free(spa->spa_l2cache.sav_config);
2085                 spa->spa_l2cache.sav_config = NULL;
2086                 spa_load_l2cache(spa);
2087         }
2088
2089         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2090             &nvroot) == 0);
2091         if (error == 0)
2092                 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
2093         if (error == 0)
2094                 error = spa_validate_aux(spa, nvroot, -1ULL,
2095                     VDEV_ALLOC_L2CACHE);
2096         spa_config_exit(spa, FTAG);
2097
2098         if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
2099                 spa_unload(spa);
2100                 spa_deactivate(spa);
2101                 spa_remove(spa);
2102                 mutex_exit(&spa_namespace_lock);
2103                 return (error);
2104         }
2105
2106         /*
2107          * Override any spares and level 2 cache devices as specified by
2108          * the user, as these may have correct device names/devids, etc.
2109          */
2110         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2111             &spares, &nspares) == 0) {
2112                 if (spa->spa_spares.sav_config)
2113                         VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2114                             ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2115                 else
2116                         VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2117                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
2118                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2119                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2120                 spa_config_enter(spa, RW_WRITER, FTAG);
2121                 spa_load_spares(spa);
2122                 spa_config_exit(spa, FTAG);
2123                 spa->spa_spares.sav_sync = B_TRUE;
2124         }
2125         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2126             &l2cache, &nl2cache) == 0) {
2127                 if (spa->spa_l2cache.sav_config)
2128                         VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2129                             ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2130                 else
2131                         VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2132                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
2133                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2134                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2135                 spa_config_enter(spa, RW_WRITER, FTAG);
2136                 spa_load_l2cache(spa);
2137                 spa_config_exit(spa, FTAG);
2138                 spa->spa_l2cache.sav_sync = B_TRUE;
2139         }
2140
2141         /*
2142          * Update the config cache to include the newly-imported pool.
2143          */
2144         if (spa_mode & FWRITE)
2145                 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
2146
2147         /*
2148          * Resilver anything that's out of date.
2149          */
2150         if (!isroot && (spa_mode & FWRITE))
2151                 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2152
2153         mutex_exit(&spa_namespace_lock);
2154
2155         return (0);
2156 }
2157
2158 #ifdef _KERNEL
2159 /*
2160  * Build a "root" vdev for a top level vdev read in from a rootpool
2161  * device label.
2162  */
2163 static void
2164 spa_build_rootpool_config(nvlist_t *config)
2165 {
2166         nvlist_t *nvtop, *nvroot;
2167         uint64_t pgid;
2168
2169         /*
2170          * Add this top-level vdev to the child array.
2171          */
2172         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
2173             == 0);
2174         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
2175             == 0);
2176
2177         /*
2178          * Put this pool's top-level vdevs into a root vdev.
2179          */
2180         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2181         VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
2182             == 0);
2183         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
2184         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
2185         VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
2186             &nvtop, 1) == 0);
2187
2188         /*
2189          * Replace the existing vdev_tree with the new root vdev in
2190          * this pool's configuration (remove the old, add the new).
2191          */
2192         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
2193         nvlist_free(nvroot);
2194 }
2195
2196 /*
2197  * Get the root pool information from the root disk, then import the root pool
2198  * during the system boot up time.
2199  */
2200 extern nvlist_t *vdev_disk_read_rootlabel(char *);
2201
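/*
 * Read the vdev label from 'devpath' and, if its txg is newer than *besttxg,
 * record this device and config as the best boot candidate seen so far.
 */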
2202 void
2203 spa_check_rootconf(char *devpath, char **bestdev, nvlist_t **bestconf,
2204     uint64_t *besttxg)
2205 {
2206         nvlist_t *config;
2207         uint64_t txg;
2208
2209         if ((config = vdev_disk_read_rootlabel(devpath)) == NULL)
2210                 return;
2211
2212         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
2213
2214         if (txg > *besttxg) {
2215                 *besttxg = txg;
2216                 if (*bestconf != NULL)
2217                         nvlist_free(*bestconf);
2218                 *bestconf = config;
2219                 *bestdev = devpath;
2220         }
2221 }
2222
2223 boolean_t
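/*
 * A root vdev is only considered bootable if its label marks it as neither
 * offline, faulted, degraded, nor removed.
 */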
2224 spa_rootdev_validate(nvlist_t *nv)
2225 {
2226         uint64_t ival;
2227
2228         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
2229             nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
2230             nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &ival) == 0 ||
2231             nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
2232                 return (B_FALSE);
2233
2234         return (B_TRUE);
2235 }
2236
2237 /*
2238  * Import a root pool.
2239  *
2240  * For x86. devpath_list will consist the physpath name of the vdev in a single
2241  * For x86, devpath_list will consist of the physpath name of the vdev in a
2242  * single-disk root pool, or a list of physpath names for a mirrored root pool.
2243  * e.g.
2244  *      "/pci@1f,0/ide@d/disk@0,0:a /pci@1f,0/ide@d/disk@2,0:a"
2245  * For Sparc, devpath_list consists the physpath name of the booting device
2246  * For Sparc, devpath_list consists of the physpath name of the booting device,
2247  * regardless of whether the root pool is a single-device pool or a mirrored pool.
2248  *      "/pci@1f,0/ide@d/disk@0,0:a"
2249  */
2250 int
2251 spa_import_rootpool(char *devpath_list)
2252 {
2253         nvlist_t *conf = NULL;
2254         char *dev = NULL;
2255         char *pname;
2256         int error;
2257
2258         /*
2259          * Get the vdev pathname and configuration from the most
2260          * recently updated vdev (highest txg).
2261          */
2262         if (error = spa_get_rootconf(devpath_list, &dev, &conf))
2263                 goto msg_out;
2264
2265         /*
2266          * Add type "root" vdev to the config.
2267          */
2268         spa_build_rootpool_config(conf);
2269
2270         VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2271
2272         error = spa_import_common(pname, conf, NULL, TRUE);
2273         if (error == EEXIST)
2274                 error = 0;
2275
2276         nvlist_free(conf);
2277         return (error);
2278
2279 msg_out:
2280         cmn_err(CE_NOTE, "\n\n"
2281             "  ***************************************************  \n"
2282             "  *  This device is not bootable!                   *  \n"
2283             "  *  It is either offlined or detached or faulted.  *  \n"
2284             "  *  Please try to boot from a different device.    *  \n"
2285             "  ***************************************************  \n\n");
2286
2287         return (error);
2288 }
2289 #endif
2290
2291 /*
2292  * Import a non-root pool into the system.
2293  */
2294 int
2295 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2296 {
2297         return (spa_import_common(pool, config, props, FALSE));
2298 }
2299
2300 /*
2301  * This (illegal) pool name is used when temporarily importing a spa_t in order
2302  * to get the vdev stats associated with the imported devices.
2303  */
2304 #define TRYIMPORT_NAME  "$import"
2305
2306 nvlist_t *
2307 spa_tryimport(nvlist_t *tryconfig)
2308 {
2309         nvlist_t *config = NULL;
2310         char *poolname;
2311         spa_t *spa;
2312         uint64_t state;
2313
2314         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2315                 return (NULL);
2316
2317         if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2318                 return (NULL);
2319
2320         /*
2321          * Create and initialize the spa structure.
2322          */
2323         mutex_enter(&spa_namespace_lock);
2324         spa = spa_add(TRYIMPORT_NAME, NULL);
2325         spa_activate(spa);
2326
2327         /*
2328          * Pass off the heavy lifting to spa_load().
2329          * Pass TRUE for mosconfig because the user-supplied config
2330          * is actually the one to trust when doing an import.
2331          */
2332         (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2333
2334         /*
2335          * If 'tryconfig' was at least parsable, return the current config.
2336          */
2337         if (spa->spa_root_vdev != NULL) {
2338                 spa_config_enter(spa, RW_READER, FTAG);
2339                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2340                 spa_config_exit(spa, FTAG);
2341                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2342                     poolname) == 0);
2343                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2344                     state) == 0);
2345                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2346                     spa->spa_uberblock.ub_timestamp) == 0);
2347
2348                 /*
2349                  * If the bootfs property exists on this pool then we
2350                  * copy it out so that external consumers can tell which
2351                  * pools are bootable.
2352                  */
2353                 if (spa->spa_bootfs) {
2354                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2355
2356                         /*
2357                          * We have to play games with the name since the
2358                          * pool was opened as TRYIMPORT_NAME.
2359                          */
2360                         if (dsl_dsobj_to_dsname(spa->spa_name,
2361                             spa->spa_bootfs, tmpname) == 0) {
2362                                 char *cp;
2363                                 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2364
2365                                 cp = strchr(tmpname, '/');
2366                                 if (cp == NULL) {
2367                                         (void) strlcpy(dsname, tmpname,
2368                                             MAXPATHLEN);
2369                                 } else {
2370                                         (void) snprintf(dsname, MAXPATHLEN,
2371                                             "%s/%s", poolname, ++cp);
2372                                 }
2373                                 VERIFY(nvlist_add_string(config,
2374                                     ZPOOL_CONFIG_BOOTFS, dsname) == 0);
2375                                 kmem_free(dsname, MAXPATHLEN);
2376                         }
2377                         kmem_free(tmpname, MAXPATHLEN);
2378                 }
2379
2380                 /*
2381                  * Add the list of hot spares and level 2 cache devices.
2382                  */
2383                 spa_add_spares(spa, config);
2384                 spa_add_l2cache(spa, config);
2385         }
2386
2387         spa_unload(spa);
2388         spa_deactivate(spa);
2389         spa_remove(spa);
2390         mutex_exit(&spa_namespace_lock);
2391
2392         return (config);
2393 }
2394
2395 /*
2396  * Pool export/destroy
2397  *
2398  * The act of destroying or exporting a pool is very simple.  We make sure there
2399  * is no more pending I/O and any references to the pool are gone.  Then, we
2400  * update the pool state and sync all the labels to disk, removing the
2401  * configuration from the cache afterwards.
2402  */
2403 static int
2404 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
2405 {
2406         spa_t *spa;
2407
2408         if (oldconfig)
2409                 *oldconfig = NULL;
2410
2411         if (!(spa_mode & FWRITE))
2412                 return (EROFS);
2413
2414         mutex_enter(&spa_namespace_lock);
2415         if ((spa = spa_lookup(pool)) == NULL) {
2416                 mutex_exit(&spa_namespace_lock);
2417                 return (ENOENT);
2418         }
2419
2420         /*
2421          * Put a hold on the pool, drop the namespace lock, stop async tasks,
2422          * reacquire the namespace lock, and see if we can export.
2423          */
2424         spa_open_ref(spa, FTAG);
2425         mutex_exit(&spa_namespace_lock);
2426         spa_async_suspend(spa);
2427         mutex_enter(&spa_namespace_lock);
2428         spa_close(spa, FTAG);
2429
2430         /*
2431          * The pool will be in core if it's openable,
2432          * in which case we can modify its state.
2433          */
2434         if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2435                 /*
2436                  * Objsets may be open only because they're dirty, so we
2437                  * have to force it to sync before checking spa_refcnt.
2438                  */
2439                 spa_scrub_suspend(spa);
2440                 txg_wait_synced(spa->spa_dsl_pool, 0);
2441
2442                 /*
2443                  * A pool cannot be exported or destroyed if there are active
2444                  * references.  If we are resetting a pool, allow references by
2445                  * fault injection handlers.
2446                  */
2447                 if (!spa_refcount_zero(spa) ||
2448                     (spa->spa_inject_ref != 0 &&
2449                     new_state != POOL_STATE_UNINITIALIZED)) {
2450                         spa_scrub_resume(spa);
2451                         spa_async_resume(spa);
2452                         mutex_exit(&spa_namespace_lock);
2453                         return (EBUSY);
2454                 }
2455
2456                 spa_scrub_resume(spa);
2457                 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
2458
2459                 /*
2460                  * We want this to be reflected on every label,
2461                  * so mark them all dirty.  spa_unload() will do the
2462                  * final sync that pushes these changes out.
2463                  */
2464                 if (new_state != POOL_STATE_UNINITIALIZED) {
2465                         spa_config_enter(spa, RW_WRITER, FTAG);
2466                         spa->spa_state = new_state;
2467                         spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2468                         vdev_config_dirty(spa->spa_root_vdev);
2469                         spa_config_exit(spa, FTAG);
2470                 }
2471         }
2472
2473         spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2474
2475         if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2476                 spa_unload(spa);
2477                 spa_deactivate(spa);
2478         }
2479
2480         if (oldconfig && spa->spa_config)
2481                 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2482
2483         if (new_state != POOL_STATE_UNINITIALIZED) {
2484                 spa_config_check(spa->spa_config_dir,
2485                     spa->spa_config_file);
2486                 spa_remove(spa);
2487                 spa_config_sync();
2488         }
2489         mutex_exit(&spa_namespace_lock);
2490
2491         return (0);
2492 }
2493
2494 /*
2495  * Destroy a storage pool.
2496  */
2497 int
2498 spa_destroy(char *pool)
2499 {
2500         return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
2501 }
2502
2503 /*
2504  * Export a storage pool.
2505  */
2506 int
2507 spa_export(char *pool, nvlist_t **oldconfig)
2508 {
2509         return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
2510 }
2511
2512 /*
2513  * Similar to spa_export(), this unloads the spa_t without actually removing it
2514  * from the namespace in any way.
2515  */
2516 int
2517 spa_reset(char *pool)
2518 {
2519         return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
2520 }
2521
2522
2523 /*
2524  * ==========================================================================
2525  * Device manipulation
2526  * ==========================================================================
2527  */
2528
2529 /*
2530  * Add a device to a storage pool.
2531  */
2532 int
2533 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2534 {
2535         uint64_t txg;
2536         int c, error;
2537         vdev_t *rvd = spa->spa_root_vdev;
2538         vdev_t *vd, *tvd;
2539         nvlist_t **spares, **l2cache;
2540         uint_t nspares, nl2cache;
2541
2542         txg = spa_vdev_enter(spa);
2543
2544         if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2545             VDEV_ALLOC_ADD)) != 0)
2546                 return (spa_vdev_exit(spa, NULL, txg, error));
2547
2548         spa->spa_pending_vdev = vd;
2549
2550         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2551             &nspares) != 0)
2552                 nspares = 0;
2553
2554         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2555             &nl2cache) != 0)
2556                 nl2cache = 0;
2557
2558         if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
2559                 spa->spa_pending_vdev = NULL;
2560                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
2561         }
2562
2563         if (vd->vdev_children != 0) {
2564                 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
2565                         spa->spa_pending_vdev = NULL;
2566                         return (spa_vdev_exit(spa, vd, txg, error));
2567                 }
2568         }
2569
2570         /*
2571          * We must validate the spares and l2cache devices after checking the
2572          * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2573          */
2574         if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
2575                 spa->spa_pending_vdev = NULL;
2576                 return (spa_vdev_exit(spa, vd, txg, error));
2577         }
2578
2579         spa->spa_pending_vdev = NULL;
2580
2581         /*
2582          * Transfer each new top-level vdev from vd to rvd.
2583          */
2584         for (c = 0; c < vd->vdev_children; c++) {
2585                 tvd = vd->vdev_child[c];
2586                 vdev_remove_child(vd, tvd);
2587                 tvd->vdev_id = rvd->vdev_children;
2588                 vdev_add_child(rvd, tvd);
2589                 vdev_config_dirty(tvd);
2590         }
2591
2592         if (nspares != 0) {
2593                 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2594                     ZPOOL_CONFIG_SPARES);
2595                 spa_load_spares(spa);
2596                 spa->spa_spares.sav_sync = B_TRUE;
2597         }
2598
2599         if (nl2cache != 0) {
2600                 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2601                     ZPOOL_CONFIG_L2CACHE);
2602                 spa_load_l2cache(spa);
2603                 spa->spa_l2cache.sav_sync = B_TRUE;
2604         }
2605
2606         /*
2607          * We have to be careful when adding new vdevs to an existing pool.
2608          * If other threads start allocating from these vdevs before we
2609          * sync the config cache, and we lose power, then upon reboot we may
2610          * fail to open the pool because there are DVAs that the config cache
2611          * can't translate.  Therefore, we first add the vdevs without
2612          * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2613          * and then let spa_config_update() initialize the new metaslabs.
2614          *
2615          * spa_load() checks for added-but-not-initialized vdevs, so that
2616          * if we lose power at any point in this sequence, the remaining
2617          * steps will be completed the next time we load the pool.
2618          */
2619         (void) spa_vdev_exit(spa, vd, txg, 0);
2620
2621         mutex_enter(&spa_namespace_lock);
2622         spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2623         mutex_exit(&spa_namespace_lock);
2624
2625         return (0);
2626 }
2627
2628 /*
2629  * Attach a device to a mirror.  The arguments are the path to any device
2630  * in the mirror, and the nvroot for the new device.  If the path specifies
2631  * a device that is not mirrored, we automatically insert the mirror vdev.
2632  *
2633  * If 'replacing' is specified, the new device is intended to replace the
2634  * existing device; in this case the two devices are made into their own
2635  * mirror using the 'replacing' vdev, which is functionally identical to
2636  * the mirror vdev (it actually reuses all the same ops) but has a few
2637  * extra rules: you can't attach to it after it's been created, and upon
2638  * completion of resilvering, the first disk (the one being replaced)
2639  * is automatically detached.
2640  */
2641 int
2642 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2643 {
2644         uint64_t txg, open_txg;
2645         int error;
2646         vdev_t *rvd = spa->spa_root_vdev;
2647         vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
2648         vdev_ops_t *pvops;
2649         int is_log;
2650
2651         txg = spa_vdev_enter(spa);
2652
2653         oldvd = vdev_lookup_by_guid(rvd, guid);
2654
2655         if (oldvd == NULL)
2656                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2657
2658         if (!oldvd->vdev_ops->vdev_op_leaf)
2659                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2660
2661         pvd = oldvd->vdev_parent;
2662
2663         if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
2664             VDEV_ALLOC_ADD)) != 0)
2665                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
2666
2667         if (newrootvd->vdev_children != 1)
2668                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2669
2670         newvd = newrootvd->vdev_child[0];
2671
2672         if (!newvd->vdev_ops->vdev_op_leaf)
2673                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2674
2675         if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
2676                 return (spa_vdev_exit(spa, newrootvd, txg, error));
2677
2678         /*
2679          * Spares can't replace logs
2680          */
2681         is_log = oldvd->vdev_islog;
2682         if (is_log && newvd->vdev_isspare)
2683                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2684
2685         if (!replacing) {
2686                 /*
2687                  * For attach, the only allowable parent is a mirror or the root
2688                  * vdev.
2689                  */
2690                 if (pvd->vdev_ops != &vdev_mirror_ops &&
2691                     pvd->vdev_ops != &vdev_root_ops)
2692                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2693
2694                 pvops = &vdev_mirror_ops;
2695         } else {
2696                 /*
2697                  * Active hot spares can only be replaced by inactive hot
2698                  * spares.
2699                  */
2700                 if (pvd->vdev_ops == &vdev_spare_ops &&
2701                     pvd->vdev_child[1] == oldvd &&
2702                     !spa_has_spare(spa, newvd->vdev_guid))
2703                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2704
2705                 /*
2706                  * If the source is a hot spare, and the parent isn't already a
2707                  * spare, then we want to create a new hot spare.  Otherwise, we
2708                  * want to create a replacing vdev.  The user is not allowed to
2709                  * attach to a spared vdev child unless the 'isspare' state is
2710                  * the same (spare replaces spare, non-spare replaces
2711                  * non-spare).
2712                  */
2713                 if (pvd->vdev_ops == &vdev_replacing_ops)
2714                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2715                 else if (pvd->vdev_ops == &vdev_spare_ops &&
2716                     newvd->vdev_isspare != oldvd->vdev_isspare)
2717                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2718                 else if (pvd->vdev_ops != &vdev_spare_ops &&
2719                     newvd->vdev_isspare)
2720                         pvops = &vdev_spare_ops;
2721                 else
2722                         pvops = &vdev_replacing_ops;
2723         }
2724
2725         /*
2726          * Compare the new device size with the replaceable/attachable
2727          * device size.
2728          */
2729         if (newvd->vdev_psize < vdev_get_rsize(oldvd))
2730                 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
2731
2732         /*
2733          * The new device cannot have a higher alignment requirement
2734          * than the top-level vdev.
2735          */
2736         if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
2737                 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
2738
2739         /*
2740          * If this is an in-place replacement, update oldvd's path and devid
2741          * to make it distinguishable from newvd, and unopenable from now on.
2742          */
2743         if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
2744                 spa_strfree(oldvd->vdev_path);
2745                 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
2746                     KM_SLEEP);
2747                 (void) sprintf(oldvd->vdev_path, "%s/%s",
2748                     newvd->vdev_path, "old");
2749                 if (oldvd->vdev_devid != NULL) {
2750                         spa_strfree(oldvd->vdev_devid);
2751                         oldvd->vdev_devid = NULL;
2752                 }
2753         }
2754
2755         /*
2756          * If the parent is not a mirror, or if we're replacing, insert the new
2757          * mirror/replacing/spare vdev above oldvd.
2758          */
2759         if (pvd->vdev_ops != pvops)
2760                 pvd = vdev_add_parent(oldvd, pvops);
2761
2762         ASSERT(pvd->vdev_top->vdev_parent == rvd);
2763         ASSERT(pvd->vdev_ops == pvops);
2764         ASSERT(oldvd->vdev_parent == pvd);
2765
2766         /*
2767          * Extract the new device from its root and add it to pvd.
2768          */
2769         vdev_remove_child(newrootvd, newvd);
2770         newvd->vdev_id = pvd->vdev_children;
2771         vdev_add_child(pvd, newvd);
2772
2773         /*
2774          * If newvd is smaller than oldvd, but larger than its rsize,
2775          * the addition of newvd may have decreased our parent's asize.
2776          */
2777         pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
2778
2779         tvd = newvd->vdev_top;
2780         ASSERT(pvd->vdev_top == tvd);
2781         ASSERT(tvd->vdev_parent == rvd);
2782
2783         vdev_config_dirty(tvd);
2784
2785         /*
2786          * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
2787          * upward when spa_vdev_exit() calls vdev_dtl_reassess().
2788          */
2789         open_txg = txg + TXG_CONCURRENT_STATES - 1;
2790
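        /*
         * Note: space_map_add() takes a (start, size) pair, so the call below
         * covers exactly the range [TXG_INITIAL, open_txg].
         */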
2791         mutex_enter(&newvd->vdev_dtl_lock);
2792         space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
2793             open_txg - TXG_INITIAL + 1);
2794         mutex_exit(&newvd->vdev_dtl_lock);
2795
2796         if (newvd->vdev_isspare)
2797                 spa_spare_activate(newvd);
2798
2799         /*
2800          * Mark newvd's DTL dirty in this txg.
2801          */
2802         vdev_dirty(tvd, VDD_DTL, newvd, txg);
2803
2804         (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
2805
2806         /*
2807          * Kick off a resilver to update newvd.  We need to grab the namespace
2808          * lock because spa_scrub() needs to post a sysevent with the pool name.
2809          */
2810         mutex_enter(&spa_namespace_lock);
2811         VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2812         mutex_exit(&spa_namespace_lock);
2813
2814         return (0);
2815 }
2816
2817 /*
2818  * Detach a device from a mirror or replacing vdev.
2819  * If 'replace_done' is specified, only detach if the parent
2820  * is a replacing vdev.
2821  */
2822 int
2823 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
2824 {
2825         uint64_t txg;
2826         int c, t, error;
2827         vdev_t *rvd = spa->spa_root_vdev;
2828         vdev_t *vd, *pvd, *cvd, *tvd;
2829         boolean_t unspare = B_FALSE;
2830         uint64_t unspare_guid;
2831
2832         txg = spa_vdev_enter(spa);
2833
2834         vd = vdev_lookup_by_guid(rvd, guid);
2835
2836         if (vd == NULL)
2837                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2838
2839         if (!vd->vdev_ops->vdev_op_leaf)
2840                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2841
2842         pvd = vd->vdev_parent;
2843
2844         /*
2845          * If replace_done is specified, only remove this device if it's
2846          * the first child of a replacing vdev.  For the 'spare' vdev, either
2847          * disk can be removed.
2848          */
2849         if (replace_done) {
2850                 if (pvd->vdev_ops == &vdev_replacing_ops) {
2851                         if (vd->vdev_id != 0)
2852                                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2853                 } else if (pvd->vdev_ops != &vdev_spare_ops) {
2854                         return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2855                 }
2856         }
2857
2858         ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
2859             spa_version(spa) >= SPA_VERSION_SPARES);
2860
2861         /*
2862          * Only mirror, replacing, and spare vdevs support detach.
2863          */
2864         if (pvd->vdev_ops != &vdev_replacing_ops &&
2865             pvd->vdev_ops != &vdev_mirror_ops &&
2866             pvd->vdev_ops != &vdev_spare_ops)
2867                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2868
2869         /*
2870          * If there's only one replica, you can't detach it.
2871          */
2872         if (pvd->vdev_children <= 1)
2873                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2874
2875         /*
2876          * If all siblings have non-empty DTLs, this device may have the only
2877          * valid copy of the data, which means we cannot safely detach it.
2878          *
2879          * XXX -- as in the vdev_offline() case, we really want a more
2880          * precise DTL check.
2881          */
2882         for (c = 0; c < pvd->vdev_children; c++) {
2883                 uint64_t dirty;
2884
2885                 cvd = pvd->vdev_child[c];
2886                 if (cvd == vd)
2887                         continue;
2888                 if (vdev_is_dead(cvd))
2889                         continue;
2890                 mutex_enter(&cvd->vdev_dtl_lock);
2891                 dirty = cvd->vdev_dtl_map.sm_space |
2892                     cvd->vdev_dtl_scrub.sm_space;
2893                 mutex_exit(&cvd->vdev_dtl_lock);
2894                 if (!dirty)
2895                         break;
2896         }
2897
2898         /*
2899          * If we are a replacing or spare vdev, we can always detach the latter
2900          * child, as that is how one cancels the operation.  Otherwise, if every
2901          * live sibling had a non-empty DTL above, this disk may hold the only
          * valid copy of some data, so refuse the detach.
          */
2902         if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
2903             c == pvd->vdev_children)
2904                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2905
2906         /*
2907          * If we are detaching the original disk from a spare, then it implies
2908          * that the spare should become a real disk, and be removed from the
2909          * active spare list for the pool.
2910          */
2911         if (pvd->vdev_ops == &vdev_spare_ops &&
2912             vd->vdev_id == 0)
2913                 unspare = B_TRUE;
2914
2915         /*
2916          * Erase the disk labels so the disk can be used for other things.
2917          * This must be done after all other error cases are handled,
2918          * but before we disembowel vd (so we can still do I/O to it).
2919          * But if we can't do it, don't treat the error as fatal --
2920          * it may be that the unwritability of the disk is the reason
2921          * it's being detached!
2922          */
2923         error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
2924
2925         /*
2926          * Remove vd from its parent and compact the parent's children.
2927          */
2928         vdev_remove_child(pvd, vd);
2929         vdev_compact_children(pvd);
2930
2931         /*
2932          * Remember one of the remaining children so we can get tvd below.
2933          */
2934         cvd = pvd->vdev_child[0];
2935
2936         /*
2937          * If we need to remove the remaining child from the list of hot spares,
2938          * do it now, marking the vdev as no longer a spare in the process.  We
2939          * must do this before vdev_remove_parent(), because that can change the
2940          * GUID if it creates a new toplevel GUID.
2941          */
2942         if (unspare) {
2943                 ASSERT(cvd->vdev_isspare);
2944                 spa_spare_remove(cvd);
2945                 unspare_guid = cvd->vdev_guid;
2946         }
2947
2948         /*
2949          * If the parent mirror/replacing vdev only has one child,
2950          * the parent is no longer needed.  Remove it from the tree.
2951          */
2952         if (pvd->vdev_children == 1)
2953                 vdev_remove_parent(cvd);
2954
2955         /*
2956          * We don't set tvd until now because the parent we just removed
2957          * may have been the previous top-level vdev.
2958          */
2959         tvd = cvd->vdev_top;
2960         ASSERT(tvd->vdev_parent == rvd);
2961
2962         /*
2963          * Reevaluate the parent vdev state.
2964          */
2965         vdev_propagate_state(cvd);
2966
2967         /*
2968          * If the device we just detached was smaller than the others, it may be
2969          * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
2970          * can't fail because the existing metaslabs are already in core, so
2971          * there's nothing to read from disk.
2972          */
2973         VERIFY(vdev_metaslab_init(tvd, txg) == 0);
2974
2975         vdev_config_dirty(tvd);
2976
2977         /*
2978          * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
2979          * vd->vdev_detached is set and free vd's DTL object in syncing context.
2980          * But first make sure we're not on any *other* txg's DTL list, to
2981          * prevent vd from being accessed after it's freed.
2982          */
2983         for (t = 0; t < TXG_SIZE; t++)
2984                 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
2985         vd->vdev_detached = B_TRUE;
2986         vdev_dirty(tvd, VDD_DTL, vd, txg);
2987
2988         spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
2989
2990         error = spa_vdev_exit(spa, vd, txg, 0);
2991
2992         /*
2993          * If this was the removal of the original device in a hot spare vdev,
2994          * then we want to go through and remove the device from the hot spare
2995          * list of every other pool.
2996          */
2997         if (unspare) {
2998                 spa = NULL;
2999                 mutex_enter(&spa_namespace_lock);
3000                 while ((spa = spa_next(spa)) != NULL) {
3001                         if (spa->spa_state != POOL_STATE_ACTIVE)
3002                                 continue;
3003
3004                         (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3005                 }
3006                 mutex_exit(&spa_namespace_lock);
3007         }
3008
3009         return (error);
3010 }
3011
3012 /*
3013  * Remove a spare vdev from the nvlist config.
3014  */
3015 static int
3016 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
3017     nvlist_t **spares, int nspares, vdev_t *vd)
3018 {
3019         nvlist_t *nv, **newspares;
3020         int i, j;
3021
3022         nv = NULL;
3023         for (i = 0; i < nspares; i++) {
3024                 uint64_t theguid;
3025
3026                 VERIFY(nvlist_lookup_uint64(spares[i],
3027                     ZPOOL_CONFIG_GUID, &theguid) == 0);
3028                 if (theguid == guid) {
3029                         nv = spares[i];
3030                         break;
3031                 }
3032         }
3033
3034         /*
3035          * Only remove the hot spare if it's not currently in use in this pool.
3036          */
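             /*
              * Three outcomes: ENOENT if the guid is not a spare at all,
              * ENOTSUP if it names a device that is in the vdev tree but not
              * in the spare config, and EBUSY if the spare is currently in
              * use and 'unspare' was not requested.
              */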
3037         if (nv == NULL && vd == NULL)
3038                 return (ENOENT);
3039
3040         if (nv == NULL && vd != NULL)
3041                 return (ENOTSUP);
3042
3043         if (!unspare && nv != NULL && vd != NULL)
3044                 return (EBUSY);
3045
3046         if (nspares == 1) {
3047                 newspares = NULL;
3048         } else {
3049                 newspares = kmem_alloc((nspares - 1) * sizeof (void *),
3050                     KM_SLEEP);
3051                 for (i = 0, j = 0; i < nspares; i++) {
3052                         if (spares[i] != nv)
3053                                 VERIFY(nvlist_dup(spares[i],
3054                                     &newspares[j++], KM_SLEEP) == 0);
3055                 }
3056         }
3057
3058         VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
3059             DATA_TYPE_NVLIST_ARRAY) == 0);
3060         VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3061             ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
3062         for (i = 0; i < nspares - 1; i++)
3063                 nvlist_free(newspares[i]);
3064         kmem_free(newspares, (nspares - 1) * sizeof (void *));
3065
3066         return (0);
3067 }
3068
3069 /*
3070  * Remove an l2cache vdev from the nvlist config.
3071  */
3072 static int
3073 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
3074     int nl2cache, vdev_t *vd)
3075 {
3076         nvlist_t *nv, **newl2cache;
3077         int i, j;
3078
3079         nv = NULL;
3080         for (i = 0; i < nl2cache; i++) {
3081                 uint64_t theguid;
3082
3083                 VERIFY(nvlist_lookup_uint64(l2cache[i],
3084                     ZPOOL_CONFIG_GUID, &theguid) == 0);
3085                 if (theguid == guid) {
3086                         nv = l2cache[i];
3087                         break;
3088                 }
3089         }
3090
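             /*
              * If the caller's lookup in the vdev tree came up empty, the
              * guid may still name an active l2cache device; search the aux
              * vdev list for it.
              */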
3091         if (vd == NULL) {
3092                 for (i = 0; i < nl2cache; i++) {
3093                         if (sav->sav_vdevs[i]->vdev_guid == guid) {
3094                                 vd = sav->sav_vdevs[i];
3095                                 break;
3096                         }
3097                 }
3098         }
3099
3100         if (nv == NULL && vd == NULL)
3101                 return (ENOENT);
3102
3103         if (nv == NULL && vd != NULL)
3104                 return (ENOTSUP);
3105
3106         if (nl2cache == 1) {
3107                 newl2cache = NULL;
3108         } else {
3109                 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
3110                     KM_SLEEP);
3111                 for (i = 0, j = 0; i < nl2cache; i++) {
3112                         if (l2cache[i] != nv)
3113                                 VERIFY(nvlist_dup(l2cache[i],
3114                                     &newl2cache[j++], KM_SLEEP) == 0);
3115                 }
3116         }
3117
3118         VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
3119             DATA_TYPE_NVLIST_ARRAY) == 0);
3120         VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3121             ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
3122         for (i = 0; i < nl2cache - 1; i++)
3123                 nvlist_free(newl2cache[i]);
3124         kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
3125
3126         return (0);
3127 }
3128
3129 /*
3130  * Remove a device from the pool.  Currently, this supports removing only hot
3131  * spares and level 2 ARC devices.
3132  */
3133 int
3134 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3135 {
3136         vdev_t *vd;
3137         nvlist_t **spares, **l2cache;
3138         uint_t nspares, nl2cache;
3139         int error = 0;
3140
3141         spa_config_enter(spa, RW_WRITER, FTAG);
3142
3143         vd = spa_lookup_by_guid(spa, guid);
3144
3145         if (spa->spa_spares.sav_vdevs != NULL &&
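             /*
              * First see whether the guid refers to a hot spare; if it does
              * not, fall through to the l2cache devices below.
              */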
3146             spa_spare_exists(guid, NULL) &&
3147             nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3148             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
3149                 if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
3150                     spares, nspares, vd)) != 0)
3151                         goto out;
3152                 spa_load_spares(spa);
3153                 spa->spa_spares.sav_sync = B_TRUE;
3154                 goto out;
3155         }
3156
3157         if (spa->spa_l2cache.sav_vdevs != NULL &&
3158             spa_l2cache_exists(guid, NULL) &&
3159             nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3160             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
3161                 if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
3162                     l2cache, nl2cache, vd)) != 0)
3163                         goto out;
3164                 spa_load_l2cache(spa);
3165                 spa->spa_l2cache.sav_sync = B_TRUE;
3166         }
3167
3168 out:
3169         spa_config_exit(spa, FTAG);
3170         return (error);
3171 }
3172
3173 /*
3174  * Find any device that's done replacing, or a vdev marked 'unspare' that's
3175  * currently spared, so we can detach it.
3176  */
3177 static vdev_t *
3178 spa_vdev_resilver_done_hunt(vdev_t *vd)
3179 {
3180         vdev_t *newvd, *oldvd;
3181         int c;
3182
3183         for (c = 0; c < vd->vdev_children; c++) {
3184                 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3185                 if (oldvd != NULL)
3186                         return (oldvd);
3187         }
3188
3189         /*
3190          * Check for a completed replacement.
3191          */
3192         if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3193                 oldvd = vd->vdev_child[0];
3194                 newvd = vd->vdev_child[1];
3195
3196                 mutex_enter(&newvd->vdev_dtl_lock);
3197                 if (newvd->vdev_dtl_map.sm_space == 0 &&
3198                     newvd->vdev_dtl_scrub.sm_space == 0) {
3199                         mutex_exit(&newvd->vdev_dtl_lock);
3200                         return (oldvd);
3201                 }
3202                 mutex_exit(&newvd->vdev_dtl_lock);
3203         }
3204
3205         /*
3206          * Check for a completed resilver with the 'unspare' flag set.
3207          */
3208         if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3209                 newvd = vd->vdev_child[0];
3210                 oldvd = vd->vdev_child[1];
3211
3212                 mutex_enter(&newvd->vdev_dtl_lock);
3213                 if (newvd->vdev_unspare &&
3214                     newvd->vdev_dtl_map.sm_space == 0 &&
3215                     newvd->vdev_dtl_scrub.sm_space == 0) {
3216                         newvd->vdev_unspare = 0;
3217                         mutex_exit(&newvd->vdev_dtl_lock);
3218                         return (oldvd);
3219                 }
3220                 mutex_exit(&newvd->vdev_dtl_lock);
3221         }
3222
3223         return (NULL);
3224 }
3225
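     /*
      * Called from the async thread (SPA_ASYNC_RESILVER_DONE): detach each
      * device that spa_vdev_resilver_done_hunt() reports as finished, and,
      * when a hot-spared device has been replaced, the hot spare as well.
      */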
3226 static void
3227 spa_vdev_resilver_done(spa_t *spa)
3228 {
3229         vdev_t *vd;
3230         vdev_t *pvd;
3231         uint64_t guid;
3232         uint64_t pguid = 0;
3233
3234         spa_config_enter(spa, RW_READER, FTAG);
3235
3236         while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3237                 guid = vd->vdev_guid;
3238                 /*
3239                  * If we have just finished replacing a hot spared device, then
3240                  * we need to detach the hot spare itself (the second child of
3241                  * the spare vdev) as well.
3242                  */
3243                 pvd = vd->vdev_parent;
3244                 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3245                     pvd->vdev_id == 0) {
3246                         ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3247                         ASSERT(pvd->vdev_parent->vdev_children == 2);
3248                         pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
3249                 }
3250                 spa_config_exit(spa, FTAG);
3251                 if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
3252                         return;
3253                 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
3254                         return;
3255                 spa_config_enter(spa, RW_READER, FTAG);
3256         }
3257
3258         spa_config_exit(spa, FTAG);
3259 }
3260
3261 /*
3262  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
3263  * on spa_vdev_enter/exit() to synchronize the labels and cache.
3264  */
3265 int
3266 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3267 {
3268         vdev_t *rvd, *vd;
3269         uint64_t txg;
3270
3271         rvd = spa->spa_root_vdev;
3272
3273         txg = spa_vdev_enter(spa);
3274
3275         if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3276                 /*
3277                  * Determine if this is a reference to a hot spare or l2cache
3278                  * device.  If it is, update the path as stored in their
3279                  * device list.
3280                  */
3281                 nvlist_t **spares, **l2cache;
3282                 uint_t i, nspares, nl2cache;
3283
3284                 if (spa->spa_spares.sav_config != NULL) {
3285                         VERIFY(nvlist_lookup_nvlist_array(
3286                             spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
3287                             &spares, &nspares) == 0);
3288                         for (i = 0; i < nspares; i++) {
3289                                 uint64_t theguid;
3290                                 VERIFY(nvlist_lookup_uint64(spares[i],
3291                                     ZPOOL_CONFIG_GUID, &theguid) == 0);
3292                                 if (theguid == guid) {
3293                                         VERIFY(nvlist_add_string(spares[i],
3294                                             ZPOOL_CONFIG_PATH, newpath) == 0);
3295                                         spa_load_spares(spa);
3296                                         spa->spa_spares.sav_sync = B_TRUE;
3297                                         return (spa_vdev_exit(spa, NULL, txg,
3298                                             0));
3299                                 }
3300                         }
3301                 }
3302
3303                 if (spa->spa_l2cache.sav_config != NULL) {
3304                         VERIFY(nvlist_lookup_nvlist_array(
3305                             spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
3306                             &l2cache, &nl2cache) == 0);
3307                         for (i = 0; i < nl2cache; i++) {
3308                                 uint64_t theguid;
3309                                 VERIFY(nvlist_lookup_uint64(l2cache[i],
3310                                     ZPOOL_CONFIG_GUID, &theguid) == 0);
3311                                 if (theguid == guid) {
3312                                         VERIFY(nvlist_add_string(l2cache[i],
3313                                             ZPOOL_CONFIG_PATH, newpath) == 0);
3314                                         spa_load_l2cache(spa);
3315                                         spa->spa_l2cache.sav_sync = B_TRUE;
3316                                         return (spa_vdev_exit(spa, NULL, txg,
3317                                             0));
3318                                 }
3319                         }
3320                 }
3321
3322                 return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3323         }
3324
3325         if (!vd->vdev_ops->vdev_op_leaf)
3326                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3327
3328         spa_strfree(vd->vdev_path);
3329         vd->vdev_path = spa_strdup(newpath);
3330
3331         vdev_config_dirty(vd->vdev_top);
3332
3333         return (spa_vdev_exit(spa, NULL, txg, 0));
3334 }
3335
3336 /*
3337  * ==========================================================================
3338  * SPA Scrubbing
3339  * ==========================================================================
3340  */
3341
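     /*
      * Completion callback for the scrub/resilver reads issued by
      * spa_scrub_io_start(): free the data buffer, charge any real
      * (non-speculative) error to the vdev and the pool-wide error count,
      * and wake up threads waiting for the in-flight count to drop below
      * spa_scrub_maxinflight.
      */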
3342 static void
3343 spa_scrub_io_done(zio_t *zio)
3344 {
3345         spa_t *spa = zio->io_spa;
3346
3347         arc_data_buf_free(zio->io_data, zio->io_size);
3348
3349         mutex_enter(&spa->spa_scrub_lock);
3350         if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3351                 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
3352                 spa->spa_scrub_errors++;
3353                 mutex_enter(&vd->vdev_stat_lock);
3354                 vd->vdev_stat.vs_scrub_errors++;
3355                 mutex_exit(&vd->vdev_stat_lock);
3356         }
3357
3358         if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
3359                 cv_broadcast(&spa->spa_scrub_io_cv);
3360
3361         ASSERT(spa->spa_scrub_inflight >= 0);
3362
3363         mutex_exit(&spa->spa_scrub_lock);
3364 }
3365
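     /*
      * Issue an asynchronous read of the given block as part of a scrub or
      * resilver, throttled so that at most spa_scrub_maxinflight reads are
      * outstanding; the read completes in spa_scrub_io_done().
      */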
3366 static void
3367 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
3368     zbookmark_t *zb)
3369 {
3370         size_t size = BP_GET_LSIZE(bp);
3371         void *data;
3372
3373         mutex_enter(&spa->spa_scrub_lock);
3374         /*
3375          * Throttle: never keep more than spa_scrub_maxinflight reads in flight.
3376          */
3377         while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
3378                 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3379         }
3380         spa->spa_scrub_inflight++;
3381         mutex_exit(&spa->spa_scrub_lock);
3382
3383         data = arc_data_buf_alloc(size);
3384
3385         if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
3386                 flags |= ZIO_FLAG_SPECULATIVE;  /* intent log block */
3387
3388         flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
3389
3390         zio_nowait(zio_read(NULL, spa, bp, data, size,
3391             spa_scrub_io_done, NULL, priority, flags, zb));
3392 }
3393
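     /*
      * Per-block callback invoked by the pool traverse during a scrub or
      * resilver: note unreadable blocks, update the per-vdev examined
      * statistics, and issue a scrub or resilver read as appropriate.
      */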
3394 /* ARGSUSED */
3395 static int
3396 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
3397 {
3398         blkptr_t *bp = &bc->bc_blkptr;
3399         vdev_t *vd = spa->spa_root_vdev;
3400         dva_t *dva = bp->blk_dva;
3401         int needs_resilver = B_FALSE;
3402         int d;
3403
3404         if (bc->bc_errno) {
3405                 /*
3406                  * We can't scrub this block, but we can continue to scrub
3407                  * the rest of the pool.  Note the error and move along.
3408                  */
3409                 mutex_enter(&spa->spa_scrub_lock);
3410                 spa->spa_scrub_errors++;
3411                 mutex_exit(&spa->spa_scrub_lock);
3412
3413                 mutex_enter(&vd->vdev_stat_lock);
3414                 vd->vdev_stat.vs_scrub_errors++;
3415                 mutex_exit(&vd->vdev_stat_lock);
3416
3417                 return (ERESTART);
3418         }
3419
3420         ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
3421
3422         for (d = 0; d < BP_GET_NDVAS(bp); d++) {
3423                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
3424
3425                 ASSERT(vd != NULL);
3426
3427                 /*
3428                  * Keep track of how much data we've examined so that
3429                  * zpool(1M) status can make useful progress reports.
3430                  */
3431                 mutex_enter(&vd->vdev_stat_lock);
3432                 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
3433                 mutex_exit(&vd->vdev_stat_lock);
3434
3435                 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
3436                         if (DVA_GET_GANG(&dva[d])) {
3437                                 /*
3438                                  * Gang members may be spread across multiple
3439                                  * vdevs, so the best we can do is look at the
3440                                  * pool-wide DTL.
3441                                  * XXX -- it would be better to change our
3442                                  * allocation policy to ensure that this can't
3443                                  * happen.
3444                                  */
3445                                 vd = spa->spa_root_vdev;
3446                         }
3447                         if (vdev_dtl_contains(&vd->vdev_dtl_map,
3448                             bp->blk_birth, 1))
3449                                 needs_resilver = B_TRUE;
3450                 }
3451         }
3452
3453         if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
3454                 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
3455                     ZIO_FLAG_SCRUB, &bc->bc_bookmark);
3456         else if (needs_resilver)
3457                 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
3458                     ZIO_FLAG_RESILVER, &bc->bc_bookmark);
3459
3460         return (0);
3461 }
3462
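     /*
      * Body of the scrub/resilver thread created by spa_scrub().  It walks
      * the pool via the traverse handle, honoring suspend, stop, and restart
      * requests, and on completion reassesses the DTLs and asks the async
      * thread to detach any devices that have finished replacing.
      */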
3463 static void
3464 spa_scrub_thread(spa_t *spa)
3465 {
3466         callb_cpr_t cprinfo;
3467         traverse_handle_t *th = spa->spa_scrub_th;
3468         vdev_t *rvd = spa->spa_root_vdev;
3469         pool_scrub_type_t scrub_type = spa->spa_scrub_type;
3470         int error = 0;
3471         boolean_t complete;
3472
3473         CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
3474
3475         /*
3476          * If we're restarting due to a snapshot create/delete,
3477          * wait for that to complete.
3478          */
3479         txg_wait_synced(spa_get_dsl(spa), 0);
3480
3481         dprintf("start %s mintxg=%llu maxtxg=%llu\n",
3482             scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3483             spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
3484
3485         spa_config_enter(spa, RW_WRITER, FTAG);
3486         vdev_reopen(rvd);               /* purge all vdev caches */
3487         vdev_config_dirty(rvd);         /* rewrite all disk labels */
3488         vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
3489         spa_config_exit(spa, FTAG);
3490
3491         mutex_enter(&spa->spa_scrub_lock);
3492         spa->spa_scrub_errors = 0;
3493         spa->spa_scrub_active = 1;
3494         ASSERT(spa->spa_scrub_inflight == 0);
3495
3496         while (!spa->spa_scrub_stop) {
3497                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
3498                 while (spa->spa_scrub_suspended) {
3499                         spa->spa_scrub_active = 0;
3500                         cv_broadcast(&spa->spa_scrub_cv);
3501                         cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3502                         spa->spa_scrub_active = 1;
3503                 }
3504                 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
3505
3506                 if (spa->spa_scrub_restart_txg != 0)
3507                         break;
3508
3509                 mutex_exit(&spa->spa_scrub_lock);
3510                 error = traverse_more(th);
3511                 mutex_enter(&spa->spa_scrub_lock);
3512                 if (error != EAGAIN)
3513                         break;
3514         }
3515
3516         while (spa->spa_scrub_inflight)
3517                 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3518
3519         spa->spa_scrub_active = 0;
3520         cv_broadcast(&spa->spa_scrub_cv);
3521
3522         mutex_exit(&spa->spa_scrub_lock);
3523
3524         spa_config_enter(spa, RW_WRITER, FTAG);
3525
3526         mutex_enter(&spa->spa_scrub_lock);
3527
3528         /*
3529          * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
3530          * AND the spa config lock to synchronize with any config changes
3531          * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
3532          */
3533         if (spa->spa_scrub_restart_txg != 0)
3534                 error = ERESTART;
3535
3536         if (spa->spa_scrub_stop)
3537                 error = EINTR;
3538
3539         /*
3540          * Even if there were uncorrectable errors, we consider the scrub
3541          * completed.  The downside is that if there is a transient error during
3542          * a resilver, we won't resilver the data properly to the target.  But
3543          * if the damage is permanent (more likely) we will resilver forever,
3544          * which isn't really acceptable.  Since there is enough information for
3545          * the user to know what has failed and why, this seems like a more
3546          * tractable approach.
3547          */
3548         complete = (error == 0);
3549
3550         dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
3551             scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3552             spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
3553             error, spa->spa_scrub_errors, spa->spa_scrub_stop);
3554
3555         mutex_exit(&spa->spa_scrub_lock);
3556
3557         /*
3558          * If the scrub/resilver completed, update all DTLs to reflect this.
3559          * Whether it succeeded or not, vacate all temporary scrub DTLs.
3560          */
3561         vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
3562             complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
3563         vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
3564         spa_errlog_rotate(spa);
3565
3566         if (scrub_type == POOL_SCRUB_RESILVER && complete)
3567                 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
3568
3569         spa_config_exit(spa, FTAG);
3570
3571         mutex_enter(&spa->spa_scrub_lock);
3572
3573         /*
3574          * We may have finished replacing a device.
3575          * Let the async thread assess this and handle the detach.
3576          */
3577         spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3578
3579         /*
3580          * If we were told to restart, our final act is to start a new scrub.
3581          */
3582         if (error == ERESTART)
3583                 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
3584                     SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
3585
3586         spa->spa_scrub_type = POOL_SCRUB_NONE;
3587         spa->spa_scrub_active = 0;
3588         spa->spa_scrub_thread = NULL;
3589         cv_broadcast(&spa->spa_scrub_cv);
3590         CALLB_CPR_EXIT(&cprinfo);       /* drops &spa->spa_scrub_lock */
3591         thread_exit();
3592 }
3593
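     /*
      * Temporarily pause an in-progress scrub and wait for all outstanding
      * scrub I/O to drain; spa_scrub_resume() lets it continue.
      */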
3594 void
3595 spa_scrub_suspend(spa_t *spa)
3596 {
3597         mutex_enter(&spa->spa_scrub_lock);
3598         spa->spa_scrub_suspended++;
3599         while (spa->spa_scrub_active) {
3600                 cv_broadcast(&spa->spa_scrub_cv);
3601                 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3602         }
3603         while (spa->spa_scrub_inflight)
3604                 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3605         mutex_exit(&spa->spa_scrub_lock);
3606 }
3607
3608 void
3609 spa_scrub_resume(spa_t *spa)
3610 {
3611         mutex_enter(&spa->spa_scrub_lock);
3612         ASSERT(spa->spa_scrub_suspended != 0);
3613         if (--spa->spa_scrub_suspended == 0)
3614                 cv_broadcast(&spa->spa_scrub_cv);
3615         mutex_exit(&spa->spa_scrub_lock);
3616 }
3617
3618 void
3619 spa_scrub_restart(spa_t *spa, uint64_t txg)
3620 {
3621         /*
3622          * Something happened (e.g. snapshot create/delete) that means
3623          * we must restart any in-progress scrubs.  The itinerary will
3624          * fix this properly.
3625          */
3626         mutex_enter(&spa->spa_scrub_lock);
3627         spa->spa_scrub_restart_txg = txg;
3628         mutex_exit(&spa->spa_scrub_lock);
3629 }
3630
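     /*
      * Start a scrub or resilver of the given type, first stopping any scrub
      * already in progress (an active resilver is only stopped if 'force' is
      * set).  Expects spa_namespace_lock to be held by the caller.
      */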
3631 int
3632 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
3633 {
3634         space_seg_t *ss;
3635         uint64_t mintxg, maxtxg;
3636         vdev_t *rvd = spa->spa_root_vdev;
3637
3638         ASSERT(MUTEX_HELD(&spa_namespace_lock));
3639         ASSERT(!spa_config_held(spa, RW_WRITER));
3640
3641         if ((uint_t)type >= POOL_SCRUB_TYPES)
3642                 return (ENOTSUP);
3643
3644         mutex_enter(&spa->spa_scrub_lock);
3645
3646         /*
3647          * If there's a scrub or resilver already in progress, stop it.
3648          */
3649         while (spa->spa_scrub_thread != NULL) {
3650                 /*
3651                  * Don't stop a resilver unless forced.
3652                  */
3653                 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
3654                         mutex_exit(&spa->spa_scrub_lock);
3655                         return (EBUSY);
3656                 }
3657                 spa->spa_scrub_stop = 1;
3658                 cv_broadcast(&spa->spa_scrub_cv);
3659                 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3660         }
3661
3662         /*
3663          * Terminate the previous traverse.
3664          */
3665         if (spa->spa_scrub_th != NULL) {
3666                 traverse_fini(spa->spa_scrub_th);
3667                 spa->spa_scrub_th = NULL;
3668         }
3669
3670         if (rvd == NULL) {
3671                 ASSERT(spa->spa_scrub_stop == 0);
3672                 ASSERT(spa->spa_scrub_type == type);
3673                 ASSERT(spa->spa_scrub_restart_txg == 0);
3674                 mutex_exit(&spa->spa_scrub_lock);
3675                 return (0);
3676         }
3677
3678         mintxg = TXG_INITIAL - 1;
3679         maxtxg = spa_last_synced_txg(spa) + 1;
3680
3681         mutex_enter(&rvd->vdev_dtl_lock);
3682
3683         if (rvd->vdev_dtl_map.sm_space == 0) {
3684                 /*
3685                  * The pool-wide DTL is empty.
3686                  * If this is a resilver, there's nothing to do except
3687                  * check whether any in-progress replacements have completed.
3688                  */
3689                 if (type == POOL_SCRUB_RESILVER) {
3690                         type = POOL_SCRUB_NONE;
3691                         spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3692                 }
3693         } else {
3694                 /*
3695                  * The pool-wide DTL is non-empty.
3696                  * If this is a normal scrub, upgrade to a resilver instead.
3697                  */
3698                 if (type == POOL_SCRUB_EVERYTHING)
3699                         type = POOL_SCRUB_RESILVER;
3700         }
3701
3702         if (type == POOL_SCRUB_RESILVER) {
3703                 /*
3704                  * Determine the resilvering boundaries.
3705                  *
3706                  * Note: (mintxg, maxtxg) is an open interval,
3707                  * i.e. mintxg and maxtxg themselves are not included.
3708                  *
3709                  * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
3710                  * so we don't claim to resilver a txg that's still changing.
3711                  */
3712                 ss = avl_first(&rvd->vdev_dtl_map.sm_root);
3713                 mintxg = ss->ss_start - 1;
3714                 ss = avl_last(&rvd->vdev_dtl_map.sm_root);
3715                 maxtxg = MIN(ss->ss_end, maxtxg);
3716
3717                 spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
3718         }
3719
3720         mutex_exit(&rvd->vdev_dtl_lock);
3721
3722         spa->spa_scrub_stop = 0;
3723         spa->spa_scrub_type = type;
3724         spa->spa_scrub_restart_txg = 0;
3725
3726         if (type != POOL_SCRUB_NONE) {
3727                 spa->spa_scrub_mintxg = mintxg;
3728                 spa->spa_scrub_maxtxg = maxtxg;
3729                 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
3730                     ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
3731                     ZIO_FLAG_CANFAIL);
3732                 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
3733                 spa->spa_scrub_thread = thread_create(NULL, 0,
3734                     spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
3735         }
3736
3737         mutex_exit(&spa->spa_scrub_lock);
3738
3739         return (0);
3740 }
3741
3742 /*
3743  * ==========================================================================
3744  * SPA async task processing
3745  * ==========================================================================
3746  */
3747
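     /*
      * Recursively walk the vdev tree and transition every vdev with
      * vdev_remove_wanted set to the REMOVED state.
      */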
3748 static void
3749 spa_async_remove(spa_t *spa, vdev_t *vd)
3750 {
3751         vdev_t *tvd;
3752         int c;
3753
3754         for (c = 0; c < vd->vdev_children; c++) {
3755                 tvd = vd->vdev_child[c];
3756                 if (tvd->vdev_remove_wanted) {
3757                         tvd->vdev_remove_wanted = 0;
3758                         vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
3759                             VDEV_AUX_NONE);
3760                         vdev_clear(spa, tvd, B_TRUE);
3761                         vdev_config_dirty(tvd->vdev_top);
3762                 }
3763                 spa_async_remove(spa, tvd);
3764         }
3765 }
3766
3767 static void
3768 spa_async_thread(spa_t *spa)
3769 {
3770         int tasks;
3771         uint64_t txg;
3772
3773         ASSERT(spa->spa_sync_on);
3774
3775         mutex_enter(&spa->spa_async_lock);
3776         tasks = spa->spa_async_tasks;
3777         spa->spa_async_tasks = 0;
3778         mutex_exit(&spa->spa_async_lock);
3779
3780         /*
3781          * See if the config needs to be updated.
3782          */
3783         if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3784                 mutex_enter(&spa_namespace_lock);
3785                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3786                 mutex_exit(&spa_namespace_lock);
3787         }
3788
3789         /*
3790          * See if any devices need to be marked REMOVED.
3791          *
3792          * XXX - We avoid doing this when we are in
3793          * I/O failure state since spa_vdev_enter() grabs
3794          * the namespace lock and would not be able to obtain
3795          * the writer config lock.
3796          */
3797         if (tasks & SPA_ASYNC_REMOVE &&
3798             spa_state(spa) != POOL_STATE_IO_FAILURE) {
3799                 txg = spa_vdev_enter(spa);
3800                 spa_async_remove(spa, spa->spa_root_vdev);
3801                 (void) spa_vdev_exit(spa, NULL, txg, 0);
3802         }
3803
3804         /*
3805          * If any devices are done replacing, detach them.
3806          */
3807         if (tasks & SPA_ASYNC_RESILVER_DONE)
3808                 spa_vdev_resilver_done(spa);
3809
3810         /*
3811          * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
3812          * scrub which can become a resilver), we need to hold
3813          * spa_namespace_lock because the sysevent we post via
3814          * spa_event_notify() needs to get the name of the pool.
3815          */
3816         if (tasks & SPA_ASYNC_SCRUB) {
3817                 mutex_enter(&spa_namespace_lock);
3818                 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
3819                 mutex_exit(&spa_namespace_lock);
3820         }
3821
3822         /*
3823          * Kick off a resilver.
3824          */
3825         if (tasks & SPA_ASYNC_RESILVER) {
3826                 mutex_enter(&spa_namespace_lock);
3827                 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
3828                 mutex_exit(&spa_namespace_lock);
3829         }
3830
3831         /*
3832          * Let the world know that we're done.
3833          */
3834         mutex_enter(&spa->spa_async_lock);
3835         spa->spa_async_thread = NULL;
3836         cv_broadcast(&spa->spa_async_cv);
3837         mutex_exit(&spa->spa_async_lock);
3838         thread_exit();
3839 }
3840
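     /*
      * Prevent new async tasks from being dispatched and wait for any
      * running async thread to exit; paired with spa_async_resume().
      */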
3841 void
3842 spa_async_suspend(spa_t *spa)
3843 {
3844         mutex_enter(&spa->spa_async_lock);
3845         spa->spa_async_suspended++;
3846         while (spa->spa_async_thread != NULL)
3847                 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3848         mutex_exit(&spa->spa_async_lock);
3849 }
3850
3851 void
3852 spa_async_resume(spa_t *spa)
3853 {
3854         mutex_enter(&spa->spa_async_lock);
3855         ASSERT(spa->spa_async_suspended != 0);
3856         spa->spa_async_suspended--;
3857         mutex_exit(&spa->spa_async_lock);
3858 }
3859
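     /*
      * Spawn the async worker thread if there are pending tasks, we are not
      * suspended, no worker is already running, and the root filesystem is
      * writable.
      */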
3860 static void
3861 spa_async_dispatch(spa_t *spa)
3862 {
3863         mutex_enter(&spa->spa_async_lock);
3864         if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3865             spa->spa_async_thread == NULL &&
3866             rootdir != NULL && !vn_is_readonly(rootdir))
3867                 spa->spa_async_thread = thread_create(NULL, 0,
3868                     spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3869         mutex_exit(&spa->spa_async_lock);
3870 }
3871
3872 void
3873 spa_async_request(spa_t *spa, int task)
3874 {
3875         mutex_enter(&spa->spa_async_lock);
3876         spa->spa_async_tasks |= task;
3877         mutex_exit(&spa->spa_async_lock);
3878 }
3879
3880 /*
3881  * ==========================================================================
3882  * SPA syncing routines
3883  * ==========================================================================
3884  */
3885
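     /*
      * Free all blocks on the sync bplist (frees deferred from the previous
      * txg), then vacate the list in the current txg.
      */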
3886 static void
3887 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3888 {
3889         bplist_t *bpl = &spa->spa_sync_bplist;
3890         dmu_tx_t *tx;
3891         blkptr_t blk;
3892         uint64_t itor = 0;
3893         zio_t *zio;
3894         int error;
3895         uint8_t c = 1;
3896
3897         zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
3898
3899         while (bplist_iterate(bpl, &itor, &blk) == 0)
3900                 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
3901
3902         error = zio_wait(zio);
3903         ASSERT3U(error, ==, 0);
3904
3905         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3906         bplist_vacate(bpl, tx);
3907
3908         /*
3909          * Pre-dirty the first block so we sync to convergence faster.
3910          * (Usually only the first block is needed.)
3911          */
3912         dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3913         dmu_tx_commit(tx);
3914 }
3915
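     /*
      * Pack 'nv' using XDR encoding, write it to MOS object 'obj', and
      * record the packed size in the object's bonus buffer.
      */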
3916 static void
3917 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3918 {
3919         char *packed = NULL;
3920         size_t nvsize = 0;
3921         dmu_buf_t *db;
3922
3923         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3924
3925         packed = kmem_alloc(nvsize, KM_SLEEP);
3926
3927         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3928             KM_SLEEP) == 0);
3929
3930         dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
3931
3932         kmem_free(packed, nvsize);
3933
3934         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3935         dmu_buf_will_dirty(db, tx);
3936         *(uint64_t *)db->db_data = nvsize;
3937         dmu_buf_rele(db, FTAG);
3938 }
3939
3940 static void
3941 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3942     const char *config, const char *entry)
3943 {
3944         nvlist_t *nvroot;
3945         nvlist_t **list;
3946         int i;
3947
3948         if (!sav->sav_sync)
3949                 return;
3950
3951         /*
3952          * Update the MOS nvlist describing the list of available devices.
3953          * spa_validate_aux() will have already made sure this nvlist is
3954          * valid and the vdevs are labeled appropriately.
3955          */
3956         if (sav->sav_object == 0) {
3957                 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3958                     DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3959                     sizeof (uint64_t), tx);
3960                 VERIFY(zap_update(spa->spa_meta_objset,
3961                     DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3962                     &sav->sav_object, tx) == 0);
3963         }
3964
3965         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3966         if (sav->sav_count == 0) {
3967                 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3968         } else {
3969                 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3970                 for (i = 0; i < sav->sav_count; i++)
3971                         list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3972                             B_FALSE, B_FALSE, B_TRUE);
3973                 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3974                     sav->sav_count) == 0);
3975                 for (i = 0; i < sav->sav_count; i++)
3976                         nvlist_free(list[i]);
3977                 kmem_free(list, sav->sav_count * sizeof (void *));
3978         }
3979
3980         spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3981         nvlist_free(nvroot);
3982
3983         sav->sav_sync = B_FALSE;
3984 }
3985
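     /*
      * If any vdev configuration is dirty, write the current pool config to
      * its MOS object and remember it in spa_config_syncing so that
      * spa_sync() can make it visible to the config cache once the txg
      * commits.
      */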
3986 static void
3987 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3988 {
3989         nvlist_t *config;
3990
3991         if (list_is_empty(&spa->spa_dirty_list))
3992                 return;
3993
3994         config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
3995
3996         if (spa->spa_config_syncing)
3997                 nvlist_free(spa->spa_config_syncing);
3998         spa->spa_config_syncing = config;
3999
4000         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
4001 }
4002
4003 /*
4004  * Set zpool properties.
4005  */
4006 static void
4007 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
4008 {
4009         spa_t *spa = arg1;
4010         objset_t *mos = spa->spa_meta_objset;
4011         nvlist_t *nvp = arg2;
4012         nvpair_t *elem;
4013         uint64_t intval;
4014         char *strval, *slash;
4015         zpool_prop_t prop;
4016         const char *propname;
4017         zprop_type_t proptype;
4018
4019         elem = NULL;
4020         while ((elem = nvlist_next_nvpair(nvp, elem))) {
4021                 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
4022                 case ZPOOL_PROP_VERSION:
4023                         /*
4024                          * Only set version for non-zpool-creation cases
4025                          * (set/import). spa_create() needs special care
4026                          * for version setting.
4027                          */
4028                         if (tx->tx_txg != TXG_INITIAL) {
4029                                 VERIFY(nvpair_value_uint64(elem,
4030                                     &intval) == 0);
4031                                 ASSERT(intval <= SPA_VERSION);
4032                                 ASSERT(intval >= spa_version(spa));
4033                                 spa->spa_uberblock.ub_version = intval;
4034                                 vdev_config_dirty(spa->spa_root_vdev);
4035                         }
4036                         break;
4037
4038                 case ZPOOL_PROP_ALTROOT:
4039                         /*
4040                          * 'altroot' is a non-persistent property. It should
4041                          * have been set temporarily at creation or import time.
4042                          */
4043                         ASSERT(spa->spa_root != NULL);
4044                         break;
4045
4046                 case ZPOOL_PROP_CACHEFILE:
4047                         /*
4048                          * 'cachefile' is a non-persistent property, but we post
4049                          * an async request so that the config cache gets
4050                          * updated.
4051                          */
4052                         VERIFY(nvpair_value_string(elem, &strval) == 0);
4053                         if (spa->spa_config_dir)
4054                                 spa_strfree(spa->spa_config_dir);
4055                         if (spa->spa_config_file)
4056                                 spa_strfree(spa->spa_config_file);
4057
4058                         if (strval[0] == '\0') {
4059                                 spa->spa_config_dir = NULL;
4060                                 spa->spa_config_file = NULL;
4061                         } else if (strcmp(strval, "none") == 0) {
4062                                 spa->spa_config_dir = spa_strdup(strval);
4063                                 spa->spa_config_file = NULL;
4064                         } else {
4065                                 /*
4066                                  * If the cachefile is in the root directory,
4067                                  * we will end up with an empty string for
4068                                  * spa_config_dir.  This value is only ever
4069                                  * used when concatenated with '/', so an empty
4070                                  * string still behaves correctly and keeps the
4071                                  * rest of the code simple.
4072                                  */
4073                                 slash = strrchr(strval, '/');
4074                                 ASSERT(slash != NULL);
4075                                 *slash = '\0';
4076                                 if (strcmp(strval, spa_config_dir) == 0 &&
4077                                     strcmp(slash + 1, ZPOOL_CACHE_FILE) == 0) {
4078                                         spa->spa_config_dir = NULL;
4079                                         spa->spa_config_file = NULL;
4080                                 } else {
4081                                         spa->spa_config_dir =
4082                                             spa_strdup(strval);
4083                                         spa->spa_config_file =
4084                                             spa_strdup(slash + 1);
4085                                 }
4086                         }
4087                         spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4088                         break;
4089                 default:
4090                         /*
4091                          * Set pool property values in the poolprops mos object.
4092                          */
4093                         mutex_enter(&spa->spa_props_lock);
4094                         if (spa->spa_pool_props_object == 0) {
4095                                 objset_t *mos = spa->spa_meta_objset;
4096
4097                                 VERIFY((spa->spa_pool_props_object =
4098                                     zap_create(mos, DMU_OT_POOL_PROPS,
4099                                     DMU_OT_NONE, 0, tx)) > 0);
4100
4101                                 VERIFY(zap_update(mos,
4102                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
4103                                     8, 1, &spa->spa_pool_props_object, tx)
4104                                     == 0);
4105                         }
4106                         mutex_exit(&spa->spa_props_lock);
4107
4108                         /* normalize the property name */
4109                         propname = zpool_prop_to_name(prop);
4110                         proptype = zpool_prop_get_type(prop);
4111
4112                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
4113                                 ASSERT(proptype == PROP_TYPE_STRING);
4114                                 VERIFY(nvpair_value_string(elem, &strval) == 0);
4115                                 VERIFY(zap_update(mos,
4116                                     spa->spa_pool_props_object, propname,
4117                                     1, strlen(strval) + 1, strval, tx) == 0);
4118
4119                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
4120                                 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
4121
4122                                 if (proptype == PROP_TYPE_INDEX) {
4123                                         const char *unused;
4124                                         VERIFY(zpool_prop_index_to_string(
4125                                             prop, intval, &unused) == 0);
4126                                 }
4127                                 VERIFY(zap_update(mos,
4128                                     spa->spa_pool_props_object, propname,
4129                                     8, 1, &intval, tx) == 0);
4130                         } else {
4131                                 ASSERT(0); /* not allowed */
4132                         }
4133
4134                         switch (prop) {
4135                         case ZPOOL_PROP_DELEGATION:
4136                                 spa->spa_delegation = intval;
4137                                 break;
4138                         case ZPOOL_PROP_BOOTFS:
4139                                 spa->spa_bootfs = intval;
4140                                 break;
4141                         case ZPOOL_PROP_FAILUREMODE:
4142                                 spa->spa_failmode = intval;
4143                                 break;
4144                         default:
4145                                 break;
4146                         }
4147                 }
4148
4149                 /* log internal history if this is not a zpool create */
4150                 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
4151                     tx->tx_txg != TXG_INITIAL) {
4152                         spa_history_internal_log(LOG_POOL_PROPSET,
4153                             spa, tx, cr, "%s %lld %s",
4154                             nvpair_name(elem), intval, spa->spa_name);
4155                 }
4156         }
4157 }
4158
4159 /*
4160  * Sync the specified transaction group.  New blocks may be dirtied as
4161  * part of the process, so we iterate until it converges.
4162  */
4163 void
4164 spa_sync(spa_t *spa, uint64_t txg)
4165 {
4166         dsl_pool_t *dp = spa->spa_dsl_pool;
4167         objset_t *mos = spa->spa_meta_objset;
4168         bplist_t *bpl = &spa->spa_sync_bplist;
4169         vdev_t *rvd = spa->spa_root_vdev;
4170         vdev_t *vd;
4171         vdev_t *svd[SPA_DVAS_PER_BP];
4172         int svdcount = 0;
4173         dmu_tx_t *tx;
4174         int dirty_vdevs;
4175
4176         /*
4177          * Lock out configuration changes.
4178          */
4179         spa_config_enter(spa, RW_READER, FTAG);
4180
4181         spa->spa_syncing_txg = txg;
4182         spa->spa_sync_pass = 0;
4183
4184         VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
4185
4186         tx = dmu_tx_create_assigned(dp, txg);
4187
4188         /*
4189          * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
4190          * set spa_deflate if we have no raid-z vdevs.
4191          */
4192         if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
4193             spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
4194                 int i;
4195
4196                 for (i = 0; i < rvd->vdev_children; i++) {
4197                         vd = rvd->vdev_child[i];
4198                         if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
4199                                 break;
4200                 }
4201                 if (i == rvd->vdev_children) {
4202                         spa->spa_deflate = TRUE;
4203                         VERIFY(0 == zap_add(spa->spa_meta_objset,
4204                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4205                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
4206                 }
4207         }
4208
4209         /*
4210          * If anything has changed in this txg, push the deferred frees
4211          * from the previous txg.  If not, leave them alone so that we
4212          * don't generate work on an otherwise idle system.
4213          */
4214         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
4215             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
4216             !txg_list_empty(&dp->dp_sync_tasks, txg))
4217                 spa_sync_deferred_frees(spa, txg);
4218
4219         /*
4220          * Iterate to convergence.
4221          */
4222         do {
4223                 spa->spa_sync_pass++;
4224
4225                 spa_sync_config_object(spa, tx);
4226                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
4227                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
4228                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
4229                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
4230                 spa_errlog_sync(spa, txg);
4231                 dsl_pool_sync(dp, txg);
4232
4233                 dirty_vdevs = 0;
4234                 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
4235                         vdev_sync(vd, txg);
4236                         dirty_vdevs++;
4237                 }
4238
4239                 bplist_sync(bpl, tx);
4240         } while (dirty_vdevs);
4241
4242         bplist_close(bpl);
4243
4244         dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4245
4246         /*
4247          * Rewrite the vdev configuration (which includes the uberblock)
4248          * to commit the transaction group.
4249          *
4250          * If there are no dirty vdevs, we sync the uberblock to a few
4251          * random top-level vdevs that are known to be visible in the
4252          * config cache (see spa_vdev_add() for details).  If there *are*
4253          * dirty vdevs -- or if the sync to our random subset fails --
4254          * then sync the uberblock to all vdevs.
4255          */
4256         if (list_is_empty(&spa->spa_dirty_list)) {
4257                 int children = rvd->vdev_children;
4258                 int c0 = spa_get_random(children);
4259                 int c;
4260
4261                 for (c = 0; c < children; c++) {
4262                         vd = rvd->vdev_child[(c0 + c) % children];
4263                         if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4264                                 continue;
4265                         svd[svdcount++] = vd;
4266                         if (svdcount == SPA_DVAS_PER_BP)
4267                                 break;
4268                 }
4269         }
4270         if (svdcount == 0 || vdev_config_sync(svd, svdcount, txg) != 0)
4271                 VERIFY3U(vdev_config_sync(rvd->vdev_child,
4272                     rvd->vdev_children, txg), ==, 0);
4273
4274         dmu_tx_commit(tx);
4275
4276         /*
4277          * Clear the dirty config list.
4278          */
4279         while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
4280                 vdev_config_clean(vd);
4281
4282         /*
4283          * Now that the new config has synced transactionally,
4284          * let it become visible to the config cache.
4285          */
4286         if (spa->spa_config_syncing != NULL) {
4287                 spa_config_set(spa, spa->spa_config_syncing);
4288                 spa->spa_config_txg = txg;
4289                 spa->spa_config_syncing = NULL;
4290         }
4291
4292         /*
4293          * Make a stable copy of the fully synced uberblock.
4294          * We use this as the root for pool traversals.
4295          */
4296         spa->spa_traverse_wanted = 1;   /* tells traverse_more() to stop */
4297
4298         spa_scrub_suspend(spa);         /* stop scrubbing and finish I/Os */
4299
4300         rw_enter(&spa->spa_traverse_lock, RW_WRITER);
4301         spa->spa_traverse_wanted = 0;
4302         spa->spa_ubsync = spa->spa_uberblock;
4303         rw_exit(&spa->spa_traverse_lock);
4304
4305         spa_scrub_resume(spa);          /* resume scrub with new ubsync */
4306
4307         /*
4308          * Clean up the ZIL records for the synced txg.
4309          */
4310         dsl_pool_zil_clean(dp);
4311
4312         /*
4313          * Update usable space statistics.
4314          */
4315         while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL)
4316                 vdev_sync_done(vd, txg);
4317
4318         /*
4319          * It had better be the case that we didn't dirty anything
4320          * since vdev_config_sync().
4321          */
4322         ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4323         ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4324         ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4325         ASSERT(bpl->bpl_queue == NULL);
4326
4327         spa_config_exit(spa, FTAG);
4328
4329         /*
4330          * If any async tasks have been requested, kick them off.
4331          */
4332         spa_async_dispatch(spa);
4333 }
4334
4335 /*
4336  * Sync all pools.  We don't want to hold the namespace lock across these
4337  * operations, so we take a reference on the spa_t and drop the lock during the
4338  * sync.
4339  */
4340 void
4341 spa_sync_allpools(void)
4342 {
4343         spa_t *spa = NULL;
4344         mutex_enter(&spa_namespace_lock);
4345         while ((spa = spa_next(spa)) != NULL) {
4346                 if (spa_state(spa) != POOL_STATE_ACTIVE)
4347                         continue;
4348                 spa_open_ref(spa, FTAG);
4349                 mutex_exit(&spa_namespace_lock);
4350                 txg_wait_synced(spa_get_dsl(spa), 0);
4351                 mutex_enter(&spa_namespace_lock);
4352                 spa_close(spa, FTAG);
4353         }
4354         mutex_exit(&spa_namespace_lock);
4355 }
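
/*
 * Illustrative sketch, not part of the original source: the same
 * take-a-reference-and-drop-the-lock pattern applied to a single named
 * pool.  The helper name is hypothetical; spa_lookup(), spa_open_ref(),
 * spa_close() and txg_wait_synced() are the existing primitives used above.
 */
#if 0
static int
example_sync_one_pool(const char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	spa_open_ref(spa, FTAG);		/* pin the spa_t */
	mutex_exit(&spa_namespace_lock);	/* don't hold it across the wait */

	txg_wait_synced(spa_get_dsl(spa), 0);	/* wait for all dirty txgs */

	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
#endif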
4356
4357 /*
4358  * ==========================================================================
4359  * Miscellaneous routines
4360  * ==========================================================================
4361  */
4362
4363 /*
4364  * Remove all pools in the system.
4365  */
4366 void
4367 spa_evict_all(void)
4368 {
4369         spa_t *spa;
4370
4371         /*
4372          * Remove all cached state.  All pools should be closed now,
4373          * so every spa in the AVL tree should be unreferenced.
4374          */
4375         mutex_enter(&spa_namespace_lock);
4376         while ((spa = spa_next(NULL)) != NULL) {
4377                 /*
4378                  * Stop async tasks.  The async thread may need to detach
4379                  * a device that's been replaced, which requires grabbing
4380                  * spa_namespace_lock, so we must drop it here.
4381                  */
4382                 spa_open_ref(spa, FTAG);
4383                 mutex_exit(&spa_namespace_lock);
4384                 spa_async_suspend(spa);
4385                 mutex_enter(&spa_namespace_lock);
4386                 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
4387                 spa_close(spa, FTAG);
4388
4389                 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4390                         spa_unload(spa);
4391                         spa_deactivate(spa);
4392                 }
4393                 spa_remove(spa);
4394         }
4395         mutex_exit(&spa_namespace_lock);
4396 }
4397
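/*
 * Find the vdev with the given guid anywhere in the pool's vdev tree
 * (rooted at spa_root_vdev), or NULL if no such vdev exists.
 */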
4398 vdev_t *
4399 spa_lookup_by_guid(spa_t *spa, uint64_t guid)
4400 {
4401         return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
4402 }
4403
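/*
 * Upgrade the pool's on-disk version: bump the uberblock version, dirty the
 * root vdev config so the change is written out, then wait for it to sync.
 */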
4404 void
4405 spa_upgrade(spa_t *spa, uint64_t version)
4406 {
4407         spa_config_enter(spa, RW_WRITER, FTAG);
4408
4409         /*
4410          * This should only be called for a non-faulted pool, and since a
4411          * pool whose version exceeded SPA_VERSION could never have been
4412          * opened, the current version can't already be beyond it.
4413          */
4414         ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4415         ASSERT(version >= spa->spa_uberblock.ub_version);
4416
4417         spa->spa_uberblock.ub_version = version;
4418         vdev_config_dirty(spa->spa_root_vdev);
4419
4420         spa_config_exit(spa, FTAG);
4421
4422         txg_wait_synced(spa_get_dsl(spa), 0);
4423 }
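
/*
 * Illustrative sketch, not part of the original source: how an
 * administrative caller might validate a requested version before invoking
 * spa_upgrade().  The helper name is hypothetical; SPA_VERSION and
 * spa_version() are the existing definitions.
 */
#if 0
static int
example_request_upgrade(spa_t *spa, uint64_t newversion)
{
	/* reject unknown versions and downgrades */
	if (newversion > SPA_VERSION || newversion < spa_version(spa))
		return (EINVAL);

	spa_upgrade(spa, newversion);	/* dirties the config and waits */
	return (0);
}
#endif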
4424
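/*
 * Determine whether the given guid refers to one of this pool's hot spares,
 * either an active spare or one still pending addition.
 */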
4425 boolean_t
4426 spa_has_spare(spa_t *spa, uint64_t guid)
4427 {
4428         int i;
4429         uint64_t spareguid;
4430         spa_aux_vdev_t *sav = &spa->spa_spares;
4431
4432         for (i = 0; i < sav->sav_count; i++)
4433                 if (sav->sav_vdevs[i]->vdev_guid == guid)
4434                         return (B_TRUE);
4435
4436         for (i = 0; i < sav->sav_npending; i++) {
4437                 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4438                     &spareguid) == 0 && spareguid == guid)
4439                         return (B_TRUE);
4440         }
4441
4442         return (B_FALSE);
4443 }
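
/*
 * Illustrative sketch, not part of the original source: scanning every
 * active pool to see whether a device guid is already claimed as a hot
 * spare somewhere.  The helper name is hypothetical; spa_next() and
 * spa_has_spare() are the existing primitives.
 */
#if 0
static boolean_t
example_guid_is_spare_anywhere(uint64_t guid)
{
	spa_t *spa = NULL;
	boolean_t found = B_FALSE;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_has_spare(spa, guid)) {
			found = B_TRUE;
			break;
		}
	}
	mutex_exit(&spa_namespace_lock);

	return (found);
}
#endif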
4444
4445 /*
4446  * Post a sysevent corresponding to the given event.  The 'name' must be one of
4447  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4448  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4449  * in the userland libzpool, as we don't want consumers to misinterpret ztest
4450  * or zdb as real changes.
4451  */
4452 void
4453 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4454 {
4455 #ifdef _KERNEL
4456         sysevent_t              *ev;
4457         sysevent_attr_list_t    *attr = NULL;
4458         sysevent_value_t        value;
4459         sysevent_id_t           eid;
4460
4461         ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4462             SE_SLEEP);
4463
4464         value.value_type = SE_DATA_TYPE_STRING;
4465         value.value.sv_string = spa_name(spa);
4466         if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4467                 goto done;
4468
4469         value.value_type = SE_DATA_TYPE_UINT64;
4470         value.value.sv_uint64 = spa_guid(spa);
4471         if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4472                 goto done;
4473
4474         if (vd) {
4475                 value.value_type = SE_DATA_TYPE_UINT64;
4476                 value.value.sv_uint64 = vd->vdev_guid;
4477                 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4478                     SE_SLEEP) != 0)
4479                         goto done;
4480
4481                 if (vd->vdev_path) {
4482                         value.value_type = SE_DATA_TYPE_STRING;
4483                         value.value.sv_string = vd->vdev_path;
4484                         if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4485                             &value, SE_SLEEP) != 0)
4486                                 goto done;
4487                 }
4488         }
4489
4490         if (sysevent_attach_attributes(ev, attr) != 0)
4491                 goto done;
4492         attr = NULL;
4493
4494         (void) log_sysevent(ev, SE_SLEEP, &eid);
4495
4496 done:
4497         if (attr)
4498                 sysevent_free_attr(attr);
4499         sysevent_free(ev);
4500 #endif
4501 }
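
/*
 * Illustrative sketch, not part of the original source: typical call sites.
 * The ESC_ZFS_* event names come from sys/sysevent/eventdefs.h; whether a
 * particular caller posts these exact events is an assumption here.
 */
#if 0
	/* a vdev-specific event: the payload includes the vdev guid and path */
	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	/* a pool-wide event: pass NULL for the vdev */
	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
#endif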