X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzfs_vnops.c;h=88d4e52c1f8de63777adc9db161e6781b55b6d9e;hb=9babb37438b58e77bad04e820d5702e15b79e6a6;hp=f62d3bfa07da12eeaf2c2961a607b0a63314ab42;hpb=fb5f0bc83330c8a0236c4d34a23723ac1974971a;p=zfs.git diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index f62d3bf..88d4e52 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,6 +101,7 @@ * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. @@ -348,56 +349,29 @@ zfs_unmap_page(page_t *pp, caddr_t addr) * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. */ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; + int64_t off; - start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; + uint64_t nbytes = MIN(PAGESIZE - off, len); - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). - */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; - rw_exit(&zp->z_map_lock); va = zfs_map_page(pp, S_WRITE); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } + (void) dmu_read(os, oid, start+off, nbytes, va+off, + DMU_READ_PREFETCH); zfs_unmap_page(pp, va); page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } - return (error); } /* @@ -595,6 +569,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int max_blksz = zfsvfs->z_max_blksz; uint64_t pflags; int error; + arc_buf_t *abuf; /* * Fasttrack empty write @@ -691,10 +666,46 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * and allows us to do more fine-grained space accounting. */ while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; + +again: + if (zfs_usergroup_overquota(zfsvfs, + B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, + B_TRUE, zp->z_phys->zp_gid)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = EDQUOT; + break; + } + + /* + * If dmu_assign_arcbuf() is expected to execute with minimum + * overhead loan an arc buffer and copy user data to it before + * we enter a txg. This avoids holding a txg forever while we + * pagefault on a hanging NFS server mapping. + */ + if (abuf == NULL && n >= max_blksz && + woff >= zp->z_phys->zp_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + size_t cbytes; + + abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if (error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes)) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + /* * Start a transaction. */ - woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); @@ -703,9 +714,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); - continue; + goto again; } dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); break; } @@ -733,18 +746,23 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, + nbytes, tx); + tx_bytes -= uio->uio_resid; } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); + tx_bytes = nbytes; + ASSERT(tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && vn_has_cached_data(vp)) { + update_pages(vp, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); } - tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even @@ -824,10 +842,15 @@ zfs_get_done(dmu_buf_t *db, void *vzgd) zgd_t *zgd = (zgd_t *)vzgd; rl_t *rl = zgd->zgd_rl; vnode_t *vp = ZTOV(rl->r_zp); + objset_t *os = rl->r_zp->z_zfsvfs->z_os; dmu_buf_rele(db, vzgd); zfs_range_unlock(rl); - VN_RELE(vp); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os))); zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } @@ -857,7 +880,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (ENOENT); } @@ -875,7 +903,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = ENOENT; goto out; } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf, + DMU_READ_NO_PREFETCH)); } else { /* indirect write */ uint64_t boff; /* block starting offset */ @@ -929,7 +958,11 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) } out: zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (error); } @@ -1107,11 +1140,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1174,21 +1207,9 @@ top: if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); return (error); } } - if (zp == NULL) { uint64_t txtype; @@ -1210,30 +1231,28 @@ top: goto out; } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) + goto out; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + error = EDQUOT; + goto out; + } + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -1242,19 +1261,21 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; @@ -1325,8 +1346,6 @@ out: *vpp = svp; } } - if (aclp) - zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1561,12 +1580,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, uint64_t txtype; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1627,38 +1646,33 @@ top: return (error); } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); } + /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -1667,19 +1681,16 @@ top: } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ @@ -1690,10 +1701,10 @@ top: txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -2002,6 +2013,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, } } + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + VN_RELE(ZTOV(ezp)); + goto skip_entry; + } + VN_RELE(ZTOV(ezp)); + } + if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else @@ -2053,6 +2079,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, if (prefetch) dmu_prefetch(os, objnum, 0, 0); + skip_entry: /* * Move to the next entry, fill in the previous offset. */ @@ -2153,8 +2180,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_VERIFY_ZP(zp); pzp = zp->z_phys; - mutex_enter(&zp->z_lock); - /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should @@ -2164,7 +2189,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, (pzp->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } @@ -2175,6 +2199,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * than to determine whether we were asked the question. */ + mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = pzp->zp_mode & MODEMASK; zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); @@ -2345,6 +2370,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; + uint64_t new_uid, new_gid; znode_t *attrzp; int need_policy = FALSE; int err; @@ -2353,6 +2379,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; if (mask == 0) return (0); @@ -2643,30 +2670,14 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; if (pzp->zp_acl.z_acl_extern_obj) { /* Are we upgrading ACL from old V0 format to new V1 */ if (zfsvfs->z_version <= ZPL_VERSION_FUID && @@ -2688,36 +2699,53 @@ top: } } - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); + if (mask & (AT_UID | AT_GID)) { + if (pzp->zp_xattr) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) + goto out; + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != pzp->zp_uid && + zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + err = EDQUOT; + goto out; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != pzp->zp_gid && + zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + err = EDQUOT; + goto out; + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } } - dmu_tx_hold_bonus(tx, attrzp->z_id); } err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; - } - - if (err == ERESTART) { + if (err == ERESTART) dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); + goto out; } dmu_buf_will_dirty(zp->z_dbuf, tx); @@ -2735,7 +2763,7 @@ top: if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); mutex_exit(&zp->z_acl_lock); } @@ -2744,25 +2772,17 @@ top: mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } + pzp->zp_uid = new_uid; + if (attrzp) + attrzp->z_phys->zp_uid = new_uid; } if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); + pzp->zp_gid = new_gid; if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + attrzp->z_phys->zp_gid = new_gid; } - if (aclp) - zfs_acl_free(aclp); - if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2824,17 +2844,35 @@ top: zfs_xvattr_set(zp, xvap); } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - if (fuidp) - zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); +out: if (attrzp) VN_RELE(ZTOV(attrzp)); - dmu_tx_commit(tx); + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) + dmu_tx_abort(tx); + else + dmu_tx_commit(tx); + + if (err == ERESTART) + goto top; ZFS_EXIT(zfsvfs); return (err); @@ -3265,7 +3303,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, int len = strlen(link); int error; int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VLNK); @@ -3300,26 +3339,25 @@ top: return (error); } + VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); @@ -3339,13 +3377,16 @@ top: * otherwise, store it just like any other file data. */ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. @@ -3366,15 +3407,14 @@ top: * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); -out: if (error == 0) { uint64_t txtype = TX_SYMLINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); } - if (fuidp) - zfs_fuid_info_free(fuidp); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -3610,9 +3650,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; - rl_t *rl; u_offset_t off, koff; size_t len, klen; uint64_t filesz; @@ -3627,26 +3665,18 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - klen = zp->z_blksz; - koff = P2ALIGN(off, (u_offset_t)klen); - } + klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); + koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; ASSERT(koff <= filesz); if (koff + klen > filesz) klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); -top: - rl = zfs_range_lock(zp, off, len, RL_WRITER); + /* * Can't push pages past end-of-file. */ - filesz = zp->z_phys->zp_size; if (off >= filesz) { /* ignore all pages */ err = 0; @@ -3662,16 +3692,20 @@ top: len = filesz - off; } + if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) { + err = EDQUOT; + goto out; + } +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_bonus(tx, zp->z_id); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { - zfs_range_unlock(rl); dmu_tx_wait(tx); dmu_tx_abort(tx); - err = 0; goto top; } dmu_tx_abort(tx); @@ -3689,12 +3723,11 @@ top: if (err == 0) { zfs_time_stamper(zp, CONTENT_MODIFIED, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); dmu_tx_commit(tx); } out: - zfs_range_unlock(rl); pvn_write_done(pp, (err ? B_ERROR : 0) | flags); if (offp) *offp = off; @@ -3731,31 +3764,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, page_t *pp; size_t io_len; u_offset_t io_off; - uint64_t filesz; + uint_t blksz; + rl_t *rl; int error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if (len == 0) { + /* + * Align this request to the file block size in case we kluster. + * XXX - this can result in pretty aggresive locking, which can + * impact simultanious read/write access. One option might be + * to break up long requests (len == 0) into block-by-block + * operations to get narrower locking. + */ + blksz = zp->z_blksz; + if (ISP2(blksz)) + io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); + else + io_off = 0; + if (len > 0 && ISP2(blksz)) + io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); + else + io_len = 0; + + if (io_len == 0) { /* - * Search the entire vp list for pages >= off. + * Search the entire vp list for pages >= io_off. */ - error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, - flags, cr); + rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } + rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off > filesz) { + if (off > zp->z_phys->zp_size) { /* past end of file */ + zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } - len = MIN(len, filesz - off); + len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); - for (io_off = off; io_off < off + len; io_off += io_len) { + for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); @@ -3778,6 +3830,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: + zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0) zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); ZFS_EXIT(zfsvfs); @@ -3894,7 +3947,9 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). + * multiple pages at once (klustering) to fill up the supplied page + * list. Note that the pages to be filled are held with an exclusive + * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3903,57 +3958,28 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; u_offset_t io_off, total; - uint64_t oid = zp->z_id; size_t io_len; - uint64_t filesz; int err; - /* - * If we are only asking for a single page don't bother klustering. - */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ io_off = off; io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + pp = page_create_va(vp, io_off, io_len, + PG_EXCL | PG_WAIT, seg, addr); } else { /* - * Try to fill a kluster of pages (a blocks worth). + * Try to find enough pages to fill the page list */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); + &io_len, off, plsz, 0); } if (pp == NULL) { /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. + * The page already exists, nothing to do here. */ *pl = NULL; return (0); @@ -3964,9 +3990,12 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3978,15 +4007,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, } cur_pp = cur_pp->p_next; } -out: + /* - * Fill in the page list array from the kluster. If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. + * Fill in the page list array from the kluster starting + * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } @@ -3994,10 +4022,10 @@ out: /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. @@ -4026,9 +4054,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -4036,104 +4072,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, if (protp) *protp = PROT_ALL; - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - /* - * Loop through the requested range [off, off + len] looking + * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. - */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; } } /* * Fill out the page array with any pages already in the cache. */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; - if (need_unlock) - rw_exit(&zp->z_map_lock); - ZFS_EXIT(zfsvfs); return (err); } @@ -4436,6 +4419,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, (vp->v_type == VREG || vp->v_type == VDIR); return (0); + case _PC_ACCESS_FILTERING: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && + vp->v_type == VDIR; + return (0); + case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); @@ -4588,6 +4576,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = { }; /* + * special share hidden files vnode operations template + */ +vnodeops_t *zfs_sharevnodeops; +const fs_operation_def_t zfs_sharevnodeops_template[] = { + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + +/* * Extended attribute directory vnode operations template * This template is identical to the directory vnodes * operation template except for restricted operations: