Add zfs_disable_dup_eviction module option

[zfs.git] / module / zfs / zvol.c
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c

index 07bda6d..c6cfac2 100644 (file)
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -141,6 +141,29 @@ zvol_find_by_name(const char *name)
         return NULL;
  }
  
+
+/*
+ * Return B_TRUE if the block device at 'device' uses zvol_major, else B_FALSE.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+       struct block_device *bdev;
+       unsigned int major;
+
+       bdev = lookup_bdev(device);
+       if (IS_ERR(bdev))
+               return (B_FALSE);       /* lookup failed: not a ZVOL */
+
+       major = MAJOR(bdev->bd_dev);
+       bdput(bdev);    /* bdev is not used past this point */
+
+       if (major == zvol_major)
+            return (B_TRUE);
+
+       return (B_FALSE);
+}
+
  /*
   * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
   */
@@ -540,6 +563,14 @@ zvol_write(void *arg)
         dmu_tx_t *tx;
         rl_t *rl;
  
+       /*
+        * Annotate this call path with a flag that indicates that it is
+        * unsafe to use KM_SLEEP during memory allocations due to the
+        * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+        */
+       ASSERT(!(current->flags & PF_NOFS));
+       current->flags |= PF_NOFS;
+
         if (req->cmd_flags & VDEV_REQ_FLUSH)
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
@@ -548,7 +579,7 @@ zvol_write(void *arg)
          */
         if (size == 0) {
                 blk_end_request(req, 0, size);
-               return;
+               goto out;
         }
  
         rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
@@ -562,7 +593,7 @@ zvol_write(void *arg)
                 dmu_tx_abort(tx);
                 zfs_range_unlock(rl);
                 blk_end_request(req, -error, size);
-               return;
+               goto out;
         }
  
         error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx);
@@ -578,6 +609,8 @@ zvol_write(void *arg)
                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
  
         blk_end_request(req, -error, size);
+out:
+       current->flags &= ~PF_NOFS;
  }
  
  #ifdef HAVE_BLK_QUEUE_DISCARD
@@ -587,24 +620,41 @@ zvol_discard(void *arg)
         struct request *req = (struct request *)arg;
         struct request_queue *q = req->q;
         zvol_state_t *zv = q->queuedata;
-       uint64_t offset = blk_rq_pos(req) << 9;
-       uint64_t size = blk_rq_bytes(req);
+       uint64_t start = blk_rq_pos(req) << 9;
+       uint64_t end = start + blk_rq_bytes(req);
         int error;
         rl_t *rl;
  
-       if (offset + size > zv->zv_volsize) {
-               blk_end_request(req, -EIO, size);
-               return;
+       /*
+        * Annotate this call path with a flag that indicates that it is
+        * unsafe to use KM_SLEEP during memory allocations due to the
+        * potential for a deadlock.  KM_PUSHPAGE should be used instead.
+        */
+       ASSERT(!(current->flags & PF_NOFS));
+       current->flags |= PF_NOFS;
+
+       if (end > zv->zv_volsize) {
+               blk_end_request(req, -EIO, blk_rq_bytes(req));
+               goto out;
         }
  
-       if (size == 0) {
-               blk_end_request(req, 0, size);
-               return;
+       /*
+        * Align the request to volume block boundaries. If we don't,
+        * then this will force dnode_free_range() to zero out the
+        * unaligned parts, which is slow (read-modify-write) and
+        * useless since we are not freeing any space by doing so.
+        */
+       start = P2ROUNDUP(start, zv->zv_volblocksize);
+       end = P2ALIGN(end, zv->zv_volblocksize);
+
+       if (start >= end) {
+               blk_end_request(req, 0, blk_rq_bytes(req));
+               goto out;
         }
  
-       rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+       rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
  
-       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, size);
+       error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
  
         /*
          * TODO: maybe we should add the operation to the log.
@@ -612,7 +662,9 @@ zvol_discard(void *arg)
  
         zfs_range_unlock(rl);
  
-       blk_end_request(req, -error, size);
+       blk_end_request(req, -error, blk_rq_bytes(req));
+out:
+       current->flags &= ~PF_NOFS;
  }
  #endif /* HAVE_BLK_QUEUE_DISCARD */
  
@@ -765,7 +817,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
         ASSERT(zio != NULL);
         ASSERT(size != 0);
  
-       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+       zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE);
         zgd->zgd_zilog = zv->zv_zilog;
         zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
  
@@ -839,25 +891,49 @@ zvol_first_open(zvol_state_t *zv)
  {
         objset_t *os;
         uint64_t volsize;
+       int locked = 0;
         int error;
         uint64_t ro;
  
+       /*
+        * In all other cases the spa_namespace_lock is taken before the
+        * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
+        * function calls fops->open() with the bdev->bd_mutex lock held.
+        *
+        * To avoid a potential lock inversion deadlock we preemptively
+        * try to take the spa_namespace_lock.  Normally it will not
+        * be contended and this is safe because spa_open_common() handles
+        * the case where the caller already holds the spa_namespace_lock.
+        *
+        * When it is contended we risk a lock inversion if we were to
+        * block waiting for the lock.  Luckily, the __blkdev_get()
+        * function allows us to return -ERESTARTSYS which will result in
+        * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+        * called again.  This process can be repeated safely until both
+        * locks are acquired.
+        */
+       if (!mutex_owned(&spa_namespace_lock)) {
+               locked = mutex_tryenter(&spa_namespace_lock);
+               if (!locked)
+                       return (-ERESTARTSYS);
+       }
+
         /* lie and say we're read-only */
         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
         if (error)
-               return (-error);
+               goto out_mutex;
  
         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
         if (error) {
-               dmu_objset_disown(os, zvol_tag);
-               return (-error);
+               dmu_objset_disown(os, zvol_tag);
+               goto out_mutex;
         }
  
         zv->zv_objset = os;
         error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
         if (error) {
-               dmu_objset_disown(os, zvol_tag);
-               return (-error);
+               dmu_objset_disown(os, zvol_tag);
+               goto out_mutex;
         }
  
         set_capacity(zv->zv_disk, volsize >> 9);
@@ -866,13 +942,17 @@ zvol_first_open(zvol_state_t *zv)
  
         VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0);
         if (ro || dmu_objset_is_snapshot(os)) {
-                set_disk_ro(zv->zv_disk, 1);
-               zv->zv_flags |= ZVOL_RDONLY;
+               set_disk_ro(zv->zv_disk, 1);
+               zv->zv_flags |= ZVOL_RDONLY;
         } else {
-                set_disk_ro(zv->zv_disk, 0);
-               zv->zv_flags &= ~ZVOL_RDONLY;
+               set_disk_ro(zv->zv_disk, 0);
+               zv->zv_flags &= ~ZVOL_RDONLY;
         }
  
+out_mutex:
+       if (locked)
+               mutex_exit(&spa_namespace_lock);
+
         return (-error);
  }
  
@@ -881,8 +961,18 @@ zvol_last_close(zvol_state_t *zv)
  {
         zil_close(zv->zv_zilog);
         zv->zv_zilog = NULL;
+
         dmu_buf_rele(zv->zv_dbuf, zvol_tag);
         zv->zv_dbuf = NULL;
+
+       /*
+        * Evict cached data
+        */
+       if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+           !(zv->zv_flags & ZVOL_RDONLY))
+               txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+       (void) dmu_objset_evict_dbufs(zv->zv_objset);
+
         dmu_objset_disown(zv->zv_objset, zvol_tag);
         zv->zv_objset = NULL;
  }
@@ -1045,7 +1135,7 @@ zvol_probe(dev_t dev, int *part, void *arg)
  
         mutex_enter(&zvol_state_lock);
         zv = zvol_find_by_dev(dev);
-       kobj = zv ? get_disk(zv->zv_disk) : ERR_PTR(-ENOENT);
+       kobj = zv ? get_disk(zv->zv_disk) : NULL;
         mutex_exit(&zvol_state_lock);
  
         return kobj;
@@ -1120,6 +1210,7 @@ static zvol_state_t *
  zvol_alloc(dev_t dev, const char *name)
  {
         zvol_state_t *zv;
+       int error = 0;
  
         zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
         if (zv == NULL)
@@ -1129,6 +1220,15 @@ zvol_alloc(dev_t dev, const char *name)
         if (zv->zv_queue == NULL)
                 goto out_kmem;
  
+#ifdef HAVE_ELEVATOR_CHANGE
+       error = elevator_change(zv->zv_queue, "noop");
+#endif /* HAVE_ELEVATOR_CHANGE */
+       if (error) {
+               printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
+                   "noop", name, error);
+               goto out_queue;
+       }
+
  #ifdef HAVE_BLK_QUEUE_FLUSH
         blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
  #else
@@ -1257,9 +1357,9 @@ __zvol_create_minor(const char *name)
         else
                 zil_replay(os, zv, zvol_replay_vector);
  
+       zv->zv_objset = NULL;
  out_dmu_objset_disown:
         dmu_objset_disown(os, zvol_tag);
-       zv->zv_objset = NULL;
  out_doi:
         kmem_free(doi, sizeof(dmu_object_info_t));
  out: