Add mmap(2) support

author Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 3 Feb 2011 18:34:05 +0000 (10:34 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Thu, 10 Feb 2011 17:27:21 +0000 (09:27 -0800)
author Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 3 Feb 2011 18:34:05 +0000 (10:34 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 10 Feb 2011 17:27:21 +0000 (09:27 -0800)
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h

index 53bd181..9d12a11 100644 (file)
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -210,6 +210,7 @@ typedef struct znode {
         sa_handle_t     *z_sa_hdl;      /* handle to sa data */
         boolean_t       z_is_sa;        /* are we native sa? */
         boolean_t       z_is_zvol;      /* are we used by the zvol */
+       boolean_t       z_is_mapped;    /* are we mmap'ed */
         struct inode    z_inode;        /* generic vfs inode */
  } znode_t;
  
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c

index a8019ba..30b3089 100644 (file)
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -163,32 +163,7 @@
   *     return (error);                 // done, report error
   */
  
-#if defined(_KERNEL) && defined(HAVE_MMAP)
-/*
- * Utility functions to map and unmap a single physical page.  These
- * are used to manage the mappable copies of ZFS file data, and therefore
- * do not update ref/mod bits.
- */
-caddr_t
-zfs_map_page(page_t *pp, enum seg_rw rw)
-{
-       if (kpm_enable)
-               return (hat_kpm_mapin(pp, 0));
-       ASSERT(rw == S_READ || rw == S_WRITE);
-       return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
-           (caddr_t)-1));
-}
-
-void
-zfs_unmap_page(page_t *pp, caddr_t addr)
-{
-       if (kpm_enable) {
-               hat_kpm_mapout(pp, 0, addr);
-       } else {
-               ppmapout(addr);
-       }
-}
-
+#if defined(_KERNEL)
  /*
   * When a file is memory mapped, we must keep the IO data synchronized
   * between the DMU cache and the memory mapped pages.  What this means:
@@ -197,25 +172,39 @@ zfs_unmap_page(page_t *pp, caddr_t addr)
   *             the page and the dmu buffer.
   */
  static void
-update_pages(struct inode *ip, int64_t start, int len, objset_t *os,
-    uint64_t oid)
+update_pages(struct inode *ip, int64_t start, int len,
+    objset_t *os, uint64_t oid)
  {
+       struct address_space *mp = ip->i_mapping;
+       struct page *pp;
+       uint64_t nbytes;
         int64_t off;
+       void *pb;
  
-       off = start & PAGEOFFSET;
-       for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-               page_t *pp;
-               uint64_t nbytes = MIN(PAGESIZE - off, len);
+       off = start & (PAGE_CACHE_SIZE-1);
+       for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
+               nbytes = MIN(PAGE_CACHE_SIZE - off, len);
  
-               if (pp = page_lookup(ip, start, SE_SHARED)) {
-                       caddr_t va;
+               pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
+               if (pp) {
+                       if (mapping_writably_mapped(mp))
+                               flush_dcache_page(pp);
  
-                       va = zfs_map_page(pp, S_WRITE);
-                       (void) dmu_read(os, oid, start+off, nbytes, va+off,
+                       pb = kmap(pp);
+                       (void) dmu_read(os, oid, start+off, nbytes, pb+off,
                             DMU_READ_PREFETCH);
-                       zfs_unmap_page(pp, va);
-                       page_unlock(pp);
+                       kunmap(pp);
+
+                       if (mapping_writably_mapped(mp))
+                               flush_dcache_page(pp);
+
+                       mark_page_accessed(pp);
+                       SetPageUptodate(pp);
+                       ClearPageError(pp);
+                       unlock_page(pp);
+                       page_cache_release(pp);
                 }
+
                 len -= nbytes;
                 off = 0;
         }
@@ -234,28 +223,39 @@ update_pages(struct inode *ip, int64_t start, int len, objset_t *os,
  static int
  mappedread(struct inode *ip, int nbytes, uio_t *uio)
  {
+       struct address_space *mp = ip->i_mapping;
+       struct page *pp;
         znode_t *zp = ITOZ(ip);
         objset_t *os = ITOZSB(ip)->z_os;
         int64_t start, off;
+       uint64_t bytes;
         int len = nbytes;
         int error = 0;
+       void *pb;
  
         start = uio->uio_loffset;
-       off = start & PAGEOFFSET;
-       for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-               page_t *pp;
-               uint64_t bytes = MIN(PAGESIZE - off, len);
-
-               if (pp = page_lookup(ip, start, SE_SHARED)) {
-                       caddr_t va;
-
-                       va = zfs_map_page(pp, S_READ);
-                       error = uiomove(va + off, bytes, UIO_READ, uio);
-                       zfs_unmap_page(pp, va);
-                       page_unlock(pp);
+       off = start & (PAGE_CACHE_SIZE-1);
+       for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) {
+               bytes = MIN(PAGE_CACHE_SIZE - off, len);
+
+               pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT);
+               if (pp) {
+                       ASSERT(PageUptodate(pp));
+
+                       pb = kmap(pp);
+                       error = uiomove(pb + off, bytes, UIO_READ, uio);
+                       kunmap(pp);
+
+                       if (mapping_writably_mapped(mp))
+                               flush_dcache_page(pp);
+
+                       mark_page_accessed(pp);
+                       unlock_page(pp);
+                       page_cache_release(pp);
                 } else {
                         error = dmu_read_uio(os, zp->z_id, uio, bytes);
                 }
+
                 len -= bytes;
                 off = 0;
                 if (error)
@@ -263,7 +263,7 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
         }
         return (error);
  }
-#endif /* _KERNEL && HAVE_MMAP */
+#endif /* _KERNEL */
  
  offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
  
@@ -273,7 +273,8 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
   *     IN:     ip      - inode of file to be read from.
   *             uio     - structure supplying read location, range info,
   *                       and return buffer.
- *             ioflag  - SYNC flags; used to provide FRSYNC semantics.
+ *             ioflag  - FSYNC flags; used to provide FRSYNC semantics.
+ *                       O_DIRECT flag; used to bypass page cache.
   *             cr      - credentials of caller.
   *
   *     OUT:    uio     - updated offset and range, buffer filled.
@@ -394,15 +395,11 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                 nbytes = MIN(n, zfs_read_chunk_size -
                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
  
-/* XXX: Drop this, ARC update handled by zpl layer */
-#ifdef HAVE_MMAP
-               if (vn_has_cached_data(ip))
+               if (zp->z_is_mapped && !(ioflag & O_DIRECT))
                         error = mappedread(ip, nbytes, uio);
                 else
                         error = dmu_read_uio(os, zp->z_id, uio, nbytes);
-#else
-               error = dmu_read_uio(os, zp->z_id, uio, nbytes);
-#endif /* HAVE_MMAP */
+
                 if (error) {
                         /* convert checksum errors into IO errors */
                         if (error == ECKSUM)
@@ -429,6 +426,7 @@ EXPORT_SYMBOL(zfs_read);
   *             uio     - structure supplying write location, range info,
   *                       and data buffer.
   *             ioflag  - FAPPEND flag set if in append mode.
+ *                       O_DIRECT flag; used to bypass page cache.
   *             cr      - credentials of caller.
   *
   *     OUT:    uio     - updated offset and range.
@@ -700,13 +698,9 @@ again:
                         ASSERT(tx_bytes <= uio->uio_resid);
                         uioskip(uio, tx_bytes);
                 }
-/* XXX: Drop this, ARC update handled by zpl layer */
-#ifdef HAVE_MMAP
-               if (tx_bytes && vn_has_cached_data(ip)) {
-                       update_pages(ip, woff,
-                           tx_bytes, zsb->z_os, zp->z_id);
-               }
-#endif /* HAVE_MMAP */
+
+               if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT))
+                       update_pages(ip, woff, tx_bytes, zsb->z_os, zp->z_id);
  
                 /*
                  * If we made no progress, we're done.  If we made even
@@ -3392,6 +3386,7 @@ top:
  }
  EXPORT_SYMBOL(zfs_link);
  
+#ifdef HAVE_MMAP
  /*
   * zfs_null_putapage() is used when the file system has been force
   * unmounted. It just drops the pages.
@@ -3627,48 +3622,30 @@ out:
         ZFS_EXIT(zfsvfs);
         return (error);
  }
+#endif /* HAVE_MMAP */
  
  /*ARGSUSED*/
  void
-zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+zfs_inactive(struct inode *ip)
  {
-       znode_t *zp = VTOZ(vp);
-       zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+       znode_t *zp = ITOZ(ip);
+       zfs_sb_t *zsb = ITOZSB(ip);
         int error;
  
-       rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-       if (zp->z_sa_hdl == NULL) {
-               /*
-                * The fs has been unmounted, or we did a
-                * suspend/resume and this file no longer exists.
-                */
-               if (vn_has_cached_data(vp)) {
-                       (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
-                           B_INVAL, cr);
-               }
+       truncate_inode_pages(&ip->i_data, 0);
  
-               mutex_enter(&zp->z_lock);
-               mutex_enter(&vp->v_lock);
-               ASSERT(vp->v_count == 1);
-               vp->v_count = 0;
-               mutex_exit(&vp->v_lock);
-               mutex_exit(&zp->z_lock);
-               rw_exit(&zfsvfs->z_teardown_inactive_lock);
-               zfs_znode_free(zp);
-               return;
-       }
+#ifdef HAVE_SNAPSHOT
+       /* Early return for snapshot inode? */
+#endif /* HAVE_SNAPSHOT */
  
-       /*
-        * Attempt to push any data in the page cache.  If this fails
-        * we will get kicked out later in zfs_zinactive().
-        */
-       if (vn_has_cached_data(vp)) {
-               (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
-                   cr);
+       rw_enter(&zsb->z_teardown_inactive_lock, RW_READER);
+       if (zp->z_sa_hdl == NULL) {
+               rw_exit(&zsb->z_teardown_inactive_lock);
+               return;
         }
  
         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
-               dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+               dmu_tx_t *tx = dmu_tx_create(zsb->z_os);
  
                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
                 zfs_sa_upgrade_txholds(tx, zp);
@@ -3712,6 +3689,7 @@ zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp,
  }
  EXPORT_SYMBOL(zfs_seek);
  
+#ifdef HAVE_MMAP
  /*
   * Pre-filter the generic locking function to trap attempts to place
   * a mandatory lock on a memory mapped file.
@@ -4056,6 +4034,7 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
  
         return (0);
  }
+#endif /* HAVE_MMAP */
  
  /*
   * convoff - converts the given data (start, whence) to the
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c

index 41901bb..585f971 100644 (file)
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -145,24 +145,185 @@ zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
         return (wrote);
  }
  
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC.  This has been shown to work
+ * well for the common read(2)/write(2) case.  However, mmap(2)
+ * is a problem because it relies on being tightly integrated with the
+ * page cache.  To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache.  The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated.  For a read(2) data will be read first from the page
+ * cache then the ARC if needed.  Neither a write(2) nor a read(2)
+ * will ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region.  These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage().  This will occur due to either a sync or the usual
+ * page aging behavior.  Note that because a read(2) of an mmap'ed
+ * file will always check the page cache first, even when the ARC is
+ * out of date, correct data will still be returned.
+ *
+ * While this implementation ensures correct behavior, it does
+ * have some drawbacks.  The most obvious is that it
+ * increases the required memory footprint when accessing mmap'ed
+ * files.  It also adds additional complexity to the code keeping
+ * both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers.  The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index.  The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both.  It may also prove
+ * helpful to move the ARC buffers to scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       znode_t *zp = ITOZ(filp->f_mapping->host);
+       int error;
+
+       error = generic_file_mmap(filp, vma);
+       if (error)
+               return (error);
+
+       mutex_enter(&zp->z_lock);
+       zp->z_is_mapped = 1;
+       mutex_exit(&zp->z_lock);
+
+       return (error);
+}
+
+/*
+ * Populate a page with data for the Linux page cache.  This function is
+ * only used to support mmap(2).  There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ *
+ * Currently this function relies on zpl_read_common() and the O_DIRECT
+ * flag to read in a page.  This works but the more correct way is to
+ * update zfs_fillpage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+       struct inode *ip;
+       loff_t off, i_size;
+       size_t len, wrote;
+       cred_t *cr;
+       void *pb;
+       int error = 0;
+
+       ASSERT(PageLocked(pp));
+       ip = pp->mapping->host;
+       off = page_offset(pp);
+       i_size = i_size_read(ip);
+       ASSERT3S(off, <, i_size);
+
+       cr = (cred_t *)get_current_cred();
+       len = MIN(PAGE_CACHE_SIZE, i_size - off);
+
+       pb = kmap(pp);
+
+       /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
+       wrote = zpl_read_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
+       if (wrote != len)
+               error = -EIO;
+
+       if (!error && (len < PAGE_CACHE_SIZE))
+               memset(pb + len, 0, PAGE_CACHE_SIZE - len);
+
+       kunmap(pp);
+       put_cred(cr);
+
+       if (error) {
+               SetPageError(pp);
+               ClearPageUptodate(pp);
+       } else {
+               ClearPageError(pp);
+               SetPageUptodate(pp);
+               flush_dcache_page(pp);
+       }
+
+       unlock_page(pp);
+
+       return (error);
+}
+
+/*
+ * Write out dirty pages to the ARC.  This function is only required to
+ * support mmap(2).  Mapped pages may be dirtied by memory operations
+ * which never call .write().  These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ *
+ * Currently this function relies on zpl_write_common() and the O_DIRECT
+ * flag to push out the page.  This works but the more correct way is
+ * to update zfs_putapage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+       struct inode *ip;
+       loff_t off, i_size;
+       size_t len, read;
+       cred_t *cr;
+       void *pb;
+       int error = 0;
+
+       ASSERT(PageLocked(pp));
+       ip = pp->mapping->host;
+       off = page_offset(pp);
+       i_size = i_size_read(ip);
+
+       cr = (cred_t *)get_current_cred();
+       len = MIN(PAGE_CACHE_SIZE, i_size - off);
+
+       pb = kmap(pp);
+
+       /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */
+       read = zpl_write_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr);
+       if (read != len)
+               error = -EIO;
+
+       kunmap(pp);
+       put_cred(cr);
+
+       if (error) {
+               SetPageError(pp);
+               ClearPageUptodate(pp);
+       } else {
+               ClearPageError(pp);
+               SetPageUptodate(pp);
+       }
+
+       unlock_page(pp);
+
+       return (error);
+}
+
  const struct address_space_operations zpl_address_space_operations = {
-#if 0
         .readpage       = zpl_readpage,
         .writepage      = zpl_writepage,
-       .direct_IO      = zpl_direct_IO,
-#endif
  };
  
  const struct file_operations zpl_file_operations = {
         .open           = generic_file_open,
         .llseek         = generic_file_llseek,
-       .read           = zpl_read,     /* do_sync_read */
-       .write          = zpl_write,    /* do_sync_write */
+       .read           = zpl_read,
+       .write          = zpl_write,
         .readdir        = zpl_readdir,
-       .mmap           = generic_file_mmap,
+       .mmap           = zpl_mmap,
         .fsync          = zpl_fsync,
-       .aio_read       = NULL,         /* generic_file_aio_read */
-       .aio_write      = NULL,         /* generic_file_aio_write */
  };
  
  const struct file_operations zpl_dir_file_operations = {
author	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 3 Feb 2011 18:34:05 +0000 (10:34 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Thu, 10 Feb 2011 17:27:21 +0000 (09:27 -0800)
include/sys/zfs_znode.h		patch \| blob \| history
module/zfs/zfs_vnops.c		patch \| blob \| history
module/zfs/zpl_file.c		patch \| blob \| history