Remove libumem, we will try and remove this dependency entirely. If we can't then...

[zfs.git] / zfs / lib / libzpool / zfs_fm.c
diff --git a/zfs/lib/libzpool/zfs_fm.c b/zfs/lib/libzpool/zfs_fm.c

index 4be7710..236d69e 100644 (file)
--- a/zfs/lib/libzpool/zfs_fm.c
+++ b/zfs/lib/libzpool/zfs_fm.c
@@ -23,8 +23,6 @@
   * Use is subject to license terms.
   */
  
-#pragma ident  "@(#)zfs_fm.c   1.6     08/04/01 SMI"
-
  #include <sys/spa.h>
  #include <sys/spa_impl.h>
  #include <sys/vdev.h>
@@ -48,7 +46,7 @@
   *     pool                    X
   *
   * If we are in a loading state, all errors are chained together by the same
- * SPA-wide ENA.
+ * SPA-wide ENA (Error Numeric Association).
   *
   * For isolated I/O requests, we get the ENA from the zio_t. The propagation
   * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
@@ -85,11 +83,10 @@
   * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
   * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
   * then inherit this pointer, so that when it is first set subsequent failures
- * will use the same ENA.  If a physical I/O is issued (by passing the
- * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
- * unique ENA will be generated.  For an aggregate I/O, this pointer is set to
- * NULL, and no ereport will be generated (since it doesn't actually correspond
- * to any particular device or piece of data).
+ * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
+ * this pointer is set to NULL, and no ereport will be generated (since it
+ * doesn't actually correspond to any particular device or piece of data,
+ * and the caller will always retry without caching or queueing anyway).
   */
  void
  zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
@@ -99,6 +96,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
         nvlist_t *ereport, *detector;
         uint64_t ena;
         char class[64];
+       int state;
  
         /*
          * If we are doing a spa_tryimport(), ignore errors.
@@ -115,22 +113,33 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
             spa->spa_last_open_failed)
                 return;
  
-       /*
-        * Ignore any errors from I/Os that we are going to retry anyway - we
-        * only generate errors from the final failure.  Checksum errors are
-        * generated after the pipeline stage responsible for retrying the I/O
-        * (VDEV_IO_ASSESS), so this only applies to standard I/O errors.
-        */
-       if (zio && zio_should_retry(zio) && zio->io_error != ECKSUM)
-               return;
+       if (zio != NULL) {
+               /*
+                * If this is not a read or write zio, ignore the error.  This
+                * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+                */
+               if (zio->io_type != ZIO_TYPE_READ &&
+                   zio->io_type != ZIO_TYPE_WRITE)
+                       return;
  
-       /*
-        * If this is not a read or write zio, ignore the error.  This can occur
-        * if the DKIOCFLUSHWRITECACHE ioctl fails.
-        */
-       if (zio && zio->io_type != ZIO_TYPE_READ &&
-           zio->io_type != ZIO_TYPE_WRITE)
-               return;
+               /*
+                * Ignore any errors from speculative I/Os, as failure is an
+                * expected result.
+                */
+               if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+                       return;
+
+               /*
+                * If the vdev has already been marked as failing due to a
+                * failed probe, then ignore any subsequent I/O errors, as the
+                * DE will automatically fault the vdev on the first such
+                * failure.
+                */
+               if (vd != NULL &&
+                   (!vdev_readable(vd) || !vdev_writeable(vd)) &&
+                   strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+                       return;
+       }
  
         if ((ereport = fm_nvlist_create(NULL)) == NULL)
                 return;
@@ -180,22 +189,31 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
          */
  
         /*
+        * If we are importing a faulted pool, then we treat it like an open,
+        * not an import.  Otherwise, the DE will ignore all faults during
+        * import, since the default behavior is to mark the devices as
+        * persistently unavailable, not leave them in the faulted state.
+        */
+       state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;
+
+       /*
          * Generic payload members common to all ereports.
-        *
-        * The direct reference to spa_name is used rather than spa_name()
-        * because of the asynchronous nature of the zio pipeline.  spa_name()
-        * asserts that the config lock is held in some form.  This is always
-        * the case in I/O context, but because the check for RW_WRITER compares
-        * against 'curthread', we may be in an asynchronous context and blow
-        * this assert.  Rather than loosen this assert, we acknowledge that all
-        * contexts in which this function is called (pool open, I/O) are safe,
-        * and dereference the name directly.
          */
         fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
-           DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+           DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
             DATA_TYPE_UINT64, spa_guid(spa),
             FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
-           spa->spa_load_state, NULL);
+           state, NULL);
+
+       if (spa != NULL) {
+               fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+                   DATA_TYPE_STRING,
+                   spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
+                   FM_EREPORT_FAILMODE_WAIT :
+                   spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
+                   FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+                   NULL);
+       }
  
         if (vd != NULL) {
                 vdev_t *pvd = vd->vdev_parent;
@@ -321,17 +339,6 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
  }
  
  /*
- * The 'resource.fs.zfs.ok' event is an internal signal that the associated
- * resource (pool or disk) has been identified by ZFS as healthy.  This will
- * then trigger the DE to close the associated case, if any.
- */
-void
-zfs_post_ok(spa_t *spa, vdev_t *vd)
-{
-       zfs_post_common(spa, vd, FM_RESOURCE_OK);
-}
-
-/*
   * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
   * has been removed from the system.  This will cause the DE to ignore any
   * recent I/O errors, inferring that they are due to the asynchronous device