Add FASTWRITE algorithm for synchronous writes.

[zfs.git] / module / zfs / dmu.c
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 7f17c73..e2abf8c 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -381,7 +381,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
                 }
                 nblks = 1;
         }
-       dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+       dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG);
  
         if (dn->dn_objset->os_dsl_dataset)
                 dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
@@ -793,7 +793,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
                 else
                         dmu_buf_will_dirty(db, tx);
  
-               bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+               (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
  
                 if (tocpy == db->db_size)
                         dmu_buf_fill_done(db, tx);
@@ -863,11 +863,11 @@ dmu_xuio_init(xuio_t *xuio, int nblk)
         uio_t *uio = &xuio->xu_uio;
  
         uio->uio_iovcnt = nblk;
-       uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+       uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_PUSHPAGE);
  
-       priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+       priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_PUSHPAGE);
         priv->cnt = nblk;
-       priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+       priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_PUSHPAGE);
         priv->iovp = uio->uio_iov;
         XUIO_XUZC_PRIV(xuio) = priv;
  
@@ -975,6 +975,154 @@ xuio_stat_wbuf_nocopy()
  }
  
  #ifdef _KERNEL
+
+/*
+ * Copy up to size bytes between arg_buf and req based on the data direction
+ * described by the req: WRITE copies from the req's bio_vecs into arg_buf,
+ * anything else copies from arg_buf into them.  Each bio_vec used has its
+ * bv_offset advanced and bv_len reduced by the amount copied, so residual
+ * data stays referenced.  *offset receives the bytes copied; returns 0.
+ */
+static int
+dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req)
+{
+       struct bio_vec *bv;
+       struct req_iterator iter;
+       char *bv_buf;
+       int tocpy;
+
+       *offset = 0;
+       rq_for_each_segment(bv, req, iter) {
+
+               /* Stop once the passed arg_buf has been fully consumed */
+               ASSERT3S(*offset, <=, size);
+               if (size == *offset)
+                       break;
+
+               /* Skip bv's with no data left (bv_len == 0) */
+               if (bv->bv_len == 0)
+                       continue;
+
+               tocpy = MIN(bv->bv_len, size - *offset);
+               ASSERT3S(tocpy, >=, 0);
+
+               bv_buf = page_address(bv->bv_page) + bv->bv_offset;
+               ASSERT3P(bv_buf, !=, NULL);
+
+               if (rq_data_dir(req) == WRITE)
+                       memcpy(arg_buf + *offset, bv_buf, tocpy);
+               else
+                       memcpy(bv_buf, arg_buf + *offset, tocpy);
+
+               *offset += tocpy;
+               bv->bv_offset += tocpy;
+               bv->bv_len -= tocpy;
+       }
+
+       return 0;
+}
+
+/*
+ * Copy the data for read request req out of object's dbufs.  All dbufs are
+ * held up front, rather than block-at-a-time, so the reads run in parallel.
+ * The sector is widened before shifting as sector_t may be only 32 bits.
+ * Returns 0, a dmu_buf_hold_array() error, or EIO on a short copy.
+ */
+int
+dmu_read_req(objset_t *os, uint64_t object, struct request *req)
+{
+       uint64_t size = blk_rq_bytes(req);
+       uint64_t offset = (uint64_t)blk_rq_pos(req) << 9;
+       dmu_buf_t **dbp;
+       int numbufs, i, err;
+
+       err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG,
+                                &numbufs, &dbp);
+       if (err)
+               return (err);
+
+       for (i = 0; i < numbufs; i++) {
+               int tocpy, didcpy, bufoff;
+               dmu_buf_t *db = dbp[i];
+
+               bufoff = offset - db->db_offset;
+               ASSERT3S(bufoff, >=, 0);
+
+               tocpy = (int)MIN(db->db_size - bufoff, size);
+               if (tocpy == 0)
+                       break;
+
+               err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req);
+               if (didcpy < tocpy)
+                       err = EIO;
+
+               if (err)
+                       break;
+
+               size -= tocpy;
+               offset += didcpy;
+       }
+       dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+       return (err);
+}
+
+/*
+ * Copy the data of write request req into object's dbufs under tx.  The
+ * sector is widened before shifting as sector_t may be only 32 bits wide.
+ * Returns 0, a dmu_buf_hold_array() error, or EIO on a short copy.
+ */
+int
+dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
+{
+       uint64_t size = blk_rq_bytes(req);
+       uint64_t offset = (uint64_t)blk_rq_pos(req) << 9;
+       dmu_buf_t **dbp;
+       int numbufs, i, err;
+
+       if (size == 0)
+               return (0);
+
+       err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
+                                &numbufs, &dbp);
+       if (err)
+               return (err);
+
+       for (i = 0; i < numbufs; i++) {
+               int tocpy, didcpy, bufoff;
+               dmu_buf_t *db = dbp[i];
+
+               bufoff = offset - db->db_offset;
+               ASSERT3S(bufoff, >=, 0);
+
+               tocpy = (int)MIN(db->db_size - bufoff, size);
+               if (tocpy == 0)
+                       break;
+
+               ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+               if (tocpy == db->db_size)
+                       dmu_buf_will_fill(db, tx);
+               else
+                       dmu_buf_will_dirty(db, tx);
+
+               err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req);
+               if (tocpy == db->db_size)
+                       dmu_buf_fill_done(db, tx);
+
+               if (didcpy < tocpy)
+                       err = EIO;
+
+               if (err)
+                       break;
+
+               size -= tocpy;
+               offset += didcpy;
+       }
+       dmu_buf_rele_array(dbp, numbufs, FTAG);
+       return (err);
+}
+
  int
  dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
  {
@@ -991,9 +1139,6 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
         if (err)
                 return (err);
  
-       if (uio->uio_extflg == UIO_XUIO)
-               xuio = (xuio_t *)uio;
-
         for (i = 0; i < numbufs; i++) {
                 int tocpy;
                 int bufoff;
@@ -1063,7 +1208,7 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
                         dmu_buf_will_dirty(db, tx);
  
                 /*
-                * XXX uiomove could block forever (eg. nfs-backed
+                * XXX uiomove could block forever (e.g. nfs-backed
                  * pages).  There needs to be a uiolockdown() function
                  * to lock the pages in memory, so that uiomove won't
                  * block.
@@ -1123,62 +1268,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
  
         return (err);
  }
-
-int
-dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    page_t *pp, dmu_tx_t *tx)
-{
-       dmu_buf_t **dbp;
-       int numbufs, i;
-       int err;
-
-       if (size == 0)
-               return (0);
-
-       err = dmu_buf_hold_array(os, object, offset, size,
-           FALSE, FTAG, &numbufs, &dbp);
-       if (err)
-               return (err);
-
-       for (i = 0; i < numbufs; i++) {
-               int tocpy, copied, thiscpy;
-               int bufoff;
-               dmu_buf_t *db = dbp[i];
-               caddr_t va;
-
-               ASSERT(size > 0);
-               ASSERT3U(db->db_size, >=, PAGESIZE);
-
-               bufoff = offset - db->db_offset;
-               tocpy = (int)MIN(db->db_size - bufoff, size);
-
-               ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
-               if (tocpy == db->db_size)
-                       dmu_buf_will_fill(db, tx);
-               else
-                       dmu_buf_will_dirty(db, tx);
-
-               for (copied = 0; copied < tocpy; copied += PAGESIZE) {
-                       ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
-                       thiscpy = MIN(PAGESIZE, tocpy - copied);
-                       va = zfs_map_page(pp, S_READ);
-                       bcopy(va, (char *)db->db_data + bufoff, thiscpy);
-                       zfs_unmap_page(pp, va);
-                       pp = pp->p_next;
-                       bufoff += PAGESIZE;
-               }
-
-               if (tocpy == db->db_size)
-                       dmu_buf_fill_done(db, tx);
-
-               offset += tocpy;
-               size -= tocpy;
-       }
-       dmu_buf_rele_array(dbp, numbufs, FTAG);
-       return (err);
-}
-#endif
+#endif /* _KERNEL */
  
  /*
   * Allocate a loaned anonymous arc buffer.
@@ -1341,7 +1431,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
                 return (EIO);   /* Make zl_get_data do txg_waited_synced() */
         }
  
-       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE);
         dsa->dsa_dr = NULL;
         dsa->dsa_done = done;
         dsa->dsa_zgd = zgd;
@@ -1350,7 +1440,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
         zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
             zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
             dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
-           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb));
  
         return (0);
  }
@@ -1465,7 +1555,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
         dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
         mutex_exit(&db->db_mtx);
  
-       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+       dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE);
         dsa->dsa_dr = dr;
         dsa->dsa_done = done;
         dsa->dsa_zgd = zgd;
@@ -1474,7 +1564,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
         zio_nowait(arc_write(pio, os->os_spa, txg,
             bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
             dmu_sync_ready, dmu_sync_done, dsa,
-           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+           ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb));
  
         return (0);
  }
@@ -1771,6 +1861,7 @@ dmu_init(void)
         dnode_init();
         dbuf_init();
         zfetch_init();
+       dmu_tx_init();
         arc_init();
         l2arc_init();
  }
@@ -1780,6 +1871,7 @@ dmu_fini(void)
  {
         l2arc_fini();
         arc_fini();
+       dmu_tx_fini();
         zfetch_fini();
         dbuf_fini();
         dnode_fini();
@@ -1788,3 +1880,27 @@ dmu_fini(void)
         sa_cache_fini();
         zfs_dbgmsg_fini();
  }
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dmu_bonus_hold);
+EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
+EXPORT_SYMBOL(dmu_buf_rele_array);
+EXPORT_SYMBOL(dmu_free_range);
+EXPORT_SYMBOL(dmu_read);
+EXPORT_SYMBOL(dmu_write);
+EXPORT_SYMBOL(dmu_object_info);
+EXPORT_SYMBOL(dmu_object_info_from_dnode);
+EXPORT_SYMBOL(dmu_object_info_from_db);
+EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_set_blocksize);
+EXPORT_SYMBOL(dmu_object_set_checksum);
+EXPORT_SYMBOL(dmu_object_set_compress);
+EXPORT_SYMBOL(dmu_request_arcbuf);
+EXPORT_SYMBOL(dmu_return_arcbuf);
+EXPORT_SYMBOL(dmu_assign_arcbuf);
+EXPORT_SYMBOL(dmu_buf_hold);
+EXPORT_SYMBOL(dmu_ot);
+
+module_param(zfs_mdcomp_disable, int, 0644);
+MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
+#endif /* _KERNEL && HAVE_SPL */