X-Git-Url: https://git.camperquake.de/gitweb.cgi?a=blobdiff_plain;f=module%2Fzfs%2Fzap_micro.c;h=4da78364726be215ad860018ee3474ab4f0375fb;hb=refs%2Fheads%2Frertzinger%2Ffeature-zpool-get--p;hp=fbc93b423dcbb5d1069480a24dd4da553b746465;hpb=9babb37438b58e77bad04e820d5702e15b79e6a6;p=zfs.git diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index fbc93b4..4da7836 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ +#include #include #include #include @@ -31,38 +32,98 @@ #include #include #include +#include #ifdef _KERNEL #include #endif -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap->zap_u.zap_fat.zap_phys->zap_flags); +} + +int +zap_hashbits(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} + +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} static uint64_t -zap_hash(zap_t *zap, const char *normname) +zap_hash(zap_name_t *zn) { - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + int i; + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { + int j; + uint64_t word = *wp; + + for (j = 0; j < zn->zn_key_intlen; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + int i, len; + const uint8_t *cp = zn->zn_key_norm; - /* NB: name must already be normalized, if necessary */ + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + len = zn->zn_key_norm_numints - 1; - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; + ASSERT(zn->zn_key_intlen == 1); + for (i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } } - /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the ones that we first pay attention to when + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when * chosing the bucket. */ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - return (crc); + return (h); } static int @@ -71,6 +132,8 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) size_t inlen, outlen; int err; + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; @@ -85,16 +148,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) boolean_t zap_match(zap_name_t *zn, const char *matchname) { + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + if (zn->zn_matchtype == MT_FIRST) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm) != 0) return (B_FALSE); - return (strcmp(zn->zn_name_norm, norm) == 0); + return (strcmp(zn->zn_key_norm, norm) == 0); } else { /* MT_BEST or MT_EXACT */ - return (strcmp(zn->zn_name_orij, matchname) == 0); + return (strcmp(zn->zn_key_orig, matchname) == 0); } } @@ -104,30 +169,49 @@ zap_name_free(zap_name_t *zn) kmem_free(zn, sizeof (zap_name_t)); } -/* XXX combine this with zap_lockdir()? */ zap_name_t * -zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_PUSHPAGE); zn->zn_zap = zap; - zn->zn_name_orij = name; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; if (zap->zap_normflags) { - if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_normbuf; + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != MT_EXACT) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_name_orij; + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; } - zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + zn->zn_hash = zap_hash(zn); + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_PUSHPAGE); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = MT_EXACT; + + zn->zn_hash = zap_hash(zn); return (zn); } @@ -172,26 +256,26 @@ mze_compare(const void *arg1, const void *arg2) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + if (mze1->mze_cd > mze2->mze_cd) return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < ZAP_MAXCD); - mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze = kmem_alloc(sizeof (mzap_ent_t), KM_PUSHPAGE); mze->mze_chunkid = chunkid; mze->mze_hash = hash; - mze->mze_phys = *mzep; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); avl_add(&zap->zap_m.zap_avl, mze); } @@ -206,18 +290,16 @@ mze_find(zap_name_t *zn) ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) - return (NULL); - mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - if (zap_match(zn, mze->mze_phys.mze_name)) + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { @@ -240,12 +322,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) + if (mze->mze_cd != cd) break; cd++; } @@ -269,7 +351,7 @@ mze_destroy(zap_t *zap) mzap_ent_t *mze; void *avlcookie = NULL; - while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) + while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } @@ -283,8 +365,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); - rw_init(&zap->zap_rwlock, 0, 0, 0); + zap = kmem_zalloc(sizeof (zap_t), KM_PUSHPAGE); + rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; @@ -329,7 +411,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); zap_name_free(zn); } } @@ -371,7 +453,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, *zapp = NULL; - err = dmu_buf_hold(os, obj, 0, NULL, &db); + err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); @@ -379,7 +461,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -421,10 +503,10 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, tx)); + return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); - ASSERT3U(err, ==, 0); + ASSERT0(err); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } @@ -441,24 +523,27 @@ zap_unlockdir(zap_t *zap) } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; - int i, sz, nchunks, err; + int i, sz, nchunks; + int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; - mzp = kmem_alloc(sz, KM_SLEEP); + mzp = kmem_alloc(sz, KM_PUSHPAGE | KM_NODEBUG); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err) { - kmem_free(mzp, sz); - return (err); + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err) { + kmem_free(mzp, sz); + return (err); + } } dprintf("upgrading obj=%llu with %u chunks\n", @@ -466,10 +551,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); - fzap_upgrade(zap, tx); + fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { - int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) @@ -489,18 +573,19 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) } static void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, + dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -510,6 +595,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. */ + VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); + zap_unlockdir(zap); + } } int @@ -530,7 +624,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); return (0); } @@ -547,7 +641,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); + return (obj); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && + leaf_blockshift <= SPA_MAXBLOCKSHIFT && + indirect_blockshift >= SPA_MINBLOCKSHIFT && + indirect_blockshift <= SPA_MAXBLOCKSHIFT); + + VERIFY(dmu_object_set_blocksize(os, obj, + 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); + + mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } @@ -617,11 +730,11 @@ again: other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { - zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } - if (zap_match(zn, other->mze_phys.mze_name)) { + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -683,9 +796,10 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } else if (integer_size != 8) { err = EINVAL; } else { - *(uint64_t *)buf = mze->mze_phys.mze_value; + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, - mze->mze_phys.mze_name, rn_len); + MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); @@ -699,6 +813,63 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, } int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = (zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, MT_EXACT, NULL, 0, NULL)); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + +int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { @@ -733,6 +904,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + static void mzap_addent(zap_name_t *zn, uint64_t value) { @@ -741,20 +934,18 @@ mzap_addent(zap_name_t *zn, uint64_t value) int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, - zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); + ASSERTV(mzap_ent_phys_t *mze=&zap->zap_m.zap_phys->mz_chunk[i]); + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ - ASSERT(cd != ZAP_MAXCD); + ASSERT(cd < zap_maxcd(zap)); again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { @@ -762,13 +953,13 @@ again: if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_name_orij); + (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, zn->zn_hash, mze); + mze_insert(zap, i, zn->zn_hash); return; } } @@ -780,7 +971,7 @@ again: } int -zap_add(objset_t *os, uint64_t zapobj, const char *name, +zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { @@ -793,7 +984,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, MT_EXACT); + zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); @@ -802,10 +993,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + strlen(key) >= MZAP_NAME_LEN) { + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ @@ -825,6 +1014,31 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, } int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); + return (err); +} + +int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { @@ -834,6 +1048,17 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_name_t *zn; int err; +#ifdef ZFS_DEBUG + uint64_t oldval; + + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); @@ -849,7 +1074,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_update(zn, integer_size, num_integers, val, tx); @@ -857,9 +1082,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } else { mze = mze_find(zn); if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } @@ -872,6 +1096,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + zap_name_t *zn; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); + return (err); +} + +int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); @@ -912,17 +1161,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + /* * Routines for iterating over the attributes. */ -/* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So use a small hash value so - * we can fit 4 bits of cd into the 32-bit cursor. - * - * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] - */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) @@ -931,15 +1195,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; - if (serialized == -1ULL) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - zc->zc_hash = serialized << (64-ZAP_HASHBITS); - zc->zc_cd = serialized >> ZAP_HASHBITS; - if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ - zc->zc_cd = 0; - } + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; } void @@ -969,10 +1227,21 @@ zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); - ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); - ASSERT(zc->zc_cd < ZAP_MAXCD); - return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | - ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. + * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int @@ -987,10 +1256,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) return (ENOENT); if (zc->zc_zap == NULL) { + int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. + */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } @@ -1000,7 +1282,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) err = ENOENT; mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_phys.mze_cd = zc->zc_cd; + mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { @@ -1008,18 +1290,16 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) idx, AVL_AFTER); } if (mze) { - ASSERT(0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid], sizeof (mze->mze_phys))); - + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); + za->za_first_integer = mzep->mze_value; + (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; @@ -1035,12 +1315,46 @@ zap_cursor_advance(zap_cursor_t *zc) if (zc->zc_hash == -1ULL) return; zc->zc_cd++; - if (zc->zc_cd >= ZAP_MAXCD) { - zc->zc_cd = 0; - zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); - if (zc->zc_hash == 0) /* EOF */ - zc->zc_hash = -1ULL; +} + +int +zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) +{ + int err = 0; + mzap_ent_t *mze; + zap_name_t *zn; + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } + + zn = zap_name_alloc(zc->zc_zap, name, mt); + if (zn == NULL) { + rw_exit(&zc->zc_zap->zap_rwlock); + return (ENOTSUP); + } + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_move_to_key(zc, zn); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + goto out; + } + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_cd; + } + +out: + zap_name_free(zn); + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); } int @@ -1068,7 +1382,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, - uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift) + uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap; int err = 0; @@ -1091,7 +1405,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, } /* - * We lock the zap with adding == FALSE. Because, if we pass + * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation * and hence we donot want to trigger an upgrade. @@ -1113,31 +1427,74 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; } } else { - if (!add) { - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += SPA_MAXBLOCKSIZE; - else - *towrite += SPA_MAXBLOCKSIZE; - } else { - /* - * We are here if we are adding and (name != NULL). - * It is hard to find out if this add will promote this - * microzap to fatzap. Hence, we assume the worst case - * and account for the blocks assuming this microzap - * would be promoted to a fatzap. - * - * 1 block overwritten : header block - * 4 new blocks written : 2 new split leaf, 2 grown - * ptrtbl blocks - */ - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += 1 << dn_datablkshift; - else - *towrite += 1 << dn_datablkshift; - *towrite += 4 << dn_datablkshift; + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. + * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; } } zap_unlockdir(zap); return (err); } + +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(zap_create); +EXPORT_SYMBOL(zap_create_norm); +EXPORT_SYMBOL(zap_create_flags); +EXPORT_SYMBOL(zap_create_claim); +EXPORT_SYMBOL(zap_create_claim_norm); +EXPORT_SYMBOL(zap_destroy); +EXPORT_SYMBOL(zap_lookup); +EXPORT_SYMBOL(zap_lookup_norm); +EXPORT_SYMBOL(zap_lookup_uint64); +EXPORT_SYMBOL(zap_contains); +EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_count_write); +EXPORT_SYMBOL(zap_add); +EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_update); +EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_length); +EXPORT_SYMBOL(zap_length_uint64); +EXPORT_SYMBOL(zap_remove); +EXPORT_SYMBOL(zap_remove_norm); +EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_count); +EXPORT_SYMBOL(zap_value_search); +EXPORT_SYMBOL(zap_join); +EXPORT_SYMBOL(zap_join_increment); +EXPORT_SYMBOL(zap_add_int); +EXPORT_SYMBOL(zap_remove_int); +EXPORT_SYMBOL(zap_lookup_int); +EXPORT_SYMBOL(zap_increment_int); +EXPORT_SYMBOL(zap_add_int_key); +EXPORT_SYMBOL(zap_lookup_int_key); +EXPORT_SYMBOL(zap_increment); +EXPORT_SYMBOL(zap_cursor_init); +EXPORT_SYMBOL(zap_cursor_fini); +EXPORT_SYMBOL(zap_cursor_retrieve); +EXPORT_SYMBOL(zap_cursor_advance); +EXPORT_SYMBOL(zap_cursor_serialize); +EXPORT_SYMBOL(zap_cursor_move_to_key); +EXPORT_SYMBOL(zap_cursor_init_serialized); +EXPORT_SYMBOL(zap_get_stats); +#endif