Illumos #734: Use taskq_dispatch_ent() interface
author     Garrett D'Amore <garrett@nexenta.com>
Tue, 8 Nov 2011 00:26:52 +0000 (16:26 -0800)
committer  Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 14 Dec 2011 17:19:30 +0000 (09:19 -0800)
It has been observed that some of the hottest locks are those
of the zio taskqs.  Contention on these locks can limit the
rate at which zios are dispatched, which limits performance.

This upstream change from Illumos uses a new interface to the
taskqs which allows them to utilize a prealloc'ed taskq_ent_t.
This removes the need to perform an allocation at dispatch
time while holding the contended lock, which improves system
performance.
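
As an illustration of the consumer-side pattern (a minimal
sketch only; the my_job_* names are hypothetical and not part
of this change, while the taskq_init_ent(), taskq_empty_ent()
and taskq_dispatch_ent() calls match the prototypes added
below), a caller embeds a taskq_ent_t in its own structure,
initializes it once, and hands it to the taskq at dispatch
time:

    typedef struct my_job {
            taskq_ent_t     mj_tqent;       /* embedded, prealloc'ed entry */
            int             mj_data;
    } my_job_t;

    static void
    my_job_func(void *arg)
    {
            my_job_t *job = arg;
            /* ... consume job->mj_data ... */
    }

    my_job_t *
    my_job_create(void)
    {
            my_job_t *job = kmem_zalloc(sizeof (my_job_t), KM_SLEEP);

            /* Initialize the embedded entry once, at allocation time. */
            taskq_init_ent(&job->mj_tqent);
            return (job);
    }

    void
    my_job_submit(taskq_t *tq, my_job_t *job)
    {
            /* The job may be queued on at most one taskq at a time. */
            ASSERT(taskq_empty_ent(&job->mj_tqent));

            /* No allocation occurs here, so dispatch cannot fail. */
            taskq_dispatch_ent(tq, my_job_func, job, 0, &job->mj_tqent);
    }

Because the entry is preallocated, the dispatch path no longer
needs a retry loop around a TQ_NOSLEEP allocation, which is why
zio_taskq_dispatch() below drops the TQ_NOSLEEP flag.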

Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Alexey Zaytsev <alexey.zaytsev@nexenta.com>
Reviewed by: Jason Brian King <jason.brian.king@gmail.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Gordon Ross <gwr@nexenta.com>

References to Illumos issue:
  https://www.illumos.org/issues/734

Ported-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #482

include/sys/zfs_context.h
include/sys/zio.h
lib/libzpool/taskq.c
module/zfs/spa.c
module/zfs/zio.c

diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index a328489..4abafcc 100644
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define        _SYS_ZFS_CONTEXT_H
@@ -365,6 +368,16 @@ typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
+typedef struct taskq_ent {
+       struct taskq_ent        *tqent_next;
+       struct taskq_ent        *tqent_prev;
+       task_func_t             *tqent_func;
+       void                    *tqent_arg;
+       uintptr_t               tqent_flags;
+} taskq_ent_t;
+
+#define        TQENT_FLAG_PREALLOC     0x1     /* taskq_dispatch_ent used */
+
 #define        TASKQ_PREPOPULATE       0x0001
 #define        TASKQ_CPR_SAFE          0x0002  /* Use CPR safe protocol */
 #define        TASKQ_DYNAMIC           0x0004  /* Use dynamic thread scheduling */
@@ -385,6 +398,10 @@ extern taskq_t     *taskq_create(const char *, int, pri_t, int, int, uint_t);
 #define        taskq_create_sysdc(a, b, d, e, p, dc, f) \
            (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern void    taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+extern int     taskq_empty_ent(taskq_ent_t *);
+extern void    taskq_init_ent(taskq_ent_t *);
 extern void    taskq_destroy(taskq_t *);
 extern void    taskq_wait(taskq_t *);
 extern int     taskq_member(taskq_t *, kthread_t *);
diff --git a/include/sys/zio.h b/include/sys/zio.h
index a469181..c0da4e2 100644
@@ -22,6 +22,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #ifndef _ZIO_H
 #define        _ZIO_H
@@ -423,6 +426,9 @@ struct zio {
        /* FMA state */
        zio_cksum_report_t *io_cksum_report;
        uint64_t        io_ena;
+
+       /* Taskq dispatching state */
+       taskq_ent_t     io_tqent;
 };
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index 36c0ec7..6143a91 100644
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 
 int taskq_now;
 taskq_t *system_taskq;
 
-typedef struct task {
-       struct task     *task_next;
-       struct task     *task_prev;
-       task_func_t     *task_func;
-       void            *task_arg;
-} task_t;
-
 #define        TASKQ_ACTIVE    0x00010000
 
 struct taskq {
@@ -51,18 +47,19 @@ struct taskq {
        int             tq_maxalloc;
        kcondvar_t      tq_maxalloc_cv;
        int             tq_maxalloc_wait;
-       task_t          *tq_freelist;
-       task_t          tq_task;
+       taskq_ent_t     *tq_freelist;
+       taskq_ent_t     tq_task;
 };
 
-static task_t *
+static taskq_ent_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
-       task_t *t;
+       taskq_ent_t *t;
        int rv;
 
 again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
-               tq->tq_freelist = t->task_next;
+               ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+               tq->tq_freelist = t->tqent_next;
        } else {
                if (tq->tq_nalloc >= tq->tq_maxalloc) {
                        if (!(tqflags & KM_SLEEP))
@@ -87,25 +84,28 @@ again:      if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
                }
                mutex_exit(&tq->tq_lock);
 
-               t = kmem_alloc(sizeof (task_t), tqflags);
+               t = kmem_alloc(sizeof (taskq_ent_t), tqflags);
 
                mutex_enter(&tq->tq_lock);
-               if (t != NULL)
+               if (t != NULL) {
+                       /* Make sure we start without any flags */
+                       t->tqent_flags = 0;
                        tq->tq_nalloc++;
+               }
        }
        return (t);
 }
 
 static void
-task_free(taskq_t *tq, task_t *t)
+task_free(taskq_t *tq, taskq_ent_t *t)
 {
        if (tq->tq_nalloc <= tq->tq_minalloc) {
-               t->task_next = tq->tq_freelist;
+               t->tqent_next = tq->tq_freelist;
                tq->tq_freelist = t;
        } else {
                tq->tq_nalloc--;
                mutex_exit(&tq->tq_lock);
-               kmem_free(t, sizeof (task_t));
+               kmem_free(t, sizeof (taskq_ent_t));
                mutex_enter(&tq->tq_lock);
        }
 
@@ -116,7 +116,7 @@ task_free(taskq_t *tq, task_t *t)
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
 {
-       task_t *t;
+       taskq_ent_t *t;
 
        if (taskq_now) {
                func(arg);
@@ -130,26 +130,77 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
                return (0);
        }
        if (tqflags & TQ_FRONT) {
-               t->task_next = tq->tq_task.task_next;
-               t->task_prev = &tq->tq_task;
+               t->tqent_next = tq->tq_task.tqent_next;
+               t->tqent_prev = &tq->tq_task;
        } else {
-               t->task_next = &tq->tq_task;
-               t->task_prev = tq->tq_task.task_prev;
+               t->tqent_next = &tq->tq_task;
+               t->tqent_prev = tq->tq_task.tqent_prev;
        }
-       t->task_next->task_prev = t;
-       t->task_prev->task_next = t;
-       t->task_func = func;
-       t->task_arg = arg;
+       t->tqent_next->tqent_prev = t;
+       t->tqent_prev->tqent_next = t;
+       t->tqent_func = func;
+       t->tqent_arg = arg;
+
+       ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
        cv_signal(&tq->tq_dispatch_cv);
        mutex_exit(&tq->tq_lock);
        return (1);
 }
 
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+       return t->tqent_next == NULL;
+}
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+       t->tqent_next = NULL;
+       t->tqent_prev = NULL;
+       t->tqent_func = NULL;
+       t->tqent_arg = NULL;
+       t->tqent_flags = 0;
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+       ASSERT(func != NULL);
+       ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
+       /*
+        * Mark it as a prealloc'd task.  This is important
+        * to ensure that we don't free it later.
+        */
+       t->tqent_flags |= TQENT_FLAG_PREALLOC;
+       /*
+        * Enqueue the task to the underlying queue.
+        */
+       mutex_enter(&tq->tq_lock);
+
+       if (flags & TQ_FRONT) {
+               t->tqent_next = tq->tq_task.tqent_next;
+               t->tqent_prev = &tq->tq_task;
+       } else {
+               t->tqent_next = &tq->tq_task;
+               t->tqent_prev = tq->tq_task.tqent_prev;
+       }
+       t->tqent_next->tqent_prev = t;
+       t->tqent_prev->tqent_next = t;
+       t->tqent_func = func;
+       t->tqent_arg = arg;
+       cv_signal(&tq->tq_dispatch_cv);
+       mutex_exit(&tq->tq_lock);
+}
+
 void
 taskq_wait(taskq_t *tq)
 {
        mutex_enter(&tq->tq_lock);
-       while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0)
+       while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
                cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
        mutex_exit(&tq->tq_lock);
 }
@@ -158,27 +209,32 @@ static void
 taskq_thread(void *arg)
 {
        taskq_t *tq = arg;
-       task_t *t;
+       taskq_ent_t *t;
+       boolean_t prealloc;
 
        mutex_enter(&tq->tq_lock);
        while (tq->tq_flags & TASKQ_ACTIVE) {
-               if ((t = tq->tq_task.task_next) == &tq->tq_task) {
+               if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
                        if (--tq->tq_active == 0)
                                cv_broadcast(&tq->tq_wait_cv);
                        cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
                        tq->tq_active++;
                        continue;
                }
-               t->task_prev->task_next = t->task_next;
-               t->task_next->task_prev = t->task_prev;
+               t->tqent_prev->tqent_next = t->tqent_next;
+               t->tqent_next->tqent_prev = t->tqent_prev;
+               t->tqent_next = NULL;
+               t->tqent_prev = NULL;
+               prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
                mutex_exit(&tq->tq_lock);
 
                rw_enter(&tq->tq_threadlock, RW_READER);
-               t->task_func(t->task_arg);
+               t->tqent_func(t->tqent_arg);
                rw_exit(&tq->tq_threadlock);
 
                mutex_enter(&tq->tq_lock);
-               task_free(tq, t);
+               if (!prealloc)
+                       task_free(tq, t);
        }
        tq->tq_nthreads--;
        cv_broadcast(&tq->tq_wait_cv);
@@ -217,8 +273,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
        tq->tq_nthreads = nthreads;
        tq->tq_minalloc = minalloc;
        tq->tq_maxalloc = maxalloc;
-       tq->tq_task.task_next = &tq->tq_task;
-       tq->tq_task.task_prev = &tq->tq_task;
+       tq->tq_task.tqent_next = &tq->tq_task;
+       tq->tq_task.tqent_prev = &tq->tq_task;
        tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP);
 
        if (flags & TASKQ_PREPOPULATE) {
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 868a0d9..0b96497 100644
@@ -22,6 +22,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 /*
  * This file contains all the routines used when modifying on-disk SPA state.
@@ -665,7 +668,7 @@ spa_create_zio_taskqs(spa_t *spa)
                        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
                        enum zti_modes mode = ztip->zti_mode;
                        uint_t value = ztip->zti_value;
-                       uint_t flags = TASKQ_PREPOPULATE;
+                       uint_t flags = 0;
                        char name[32];
 
                        if (t == ZIO_TYPE_WRITE)
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 6b03be6..c96442d 100644
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -570,6 +571,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
                zio_add_child(pio, zio);
        }
 
+       taskq_init_ent(&zio->io_tqent);
+
        return (zio);
 }
 
@@ -1073,7 +1076,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
        spa_t *spa = zio->io_spa;
        zio_type_t t = zio->io_type;
-       int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0);
+       int flags = (cutinline ? TQ_FRONT : 0);
 
        /*
         * If we're a config writer or a probe, the normal issue and
@@ -1098,8 +1101,14 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 
        ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
-       while (taskq_dispatch(spa->spa_zio_taskq[t][q],
-           (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */
+       /*
+        * NB: We are assuming that the zio can only be dispatched
+        * to a single taskq at a time.  It would be a grievous error
+        * to dispatch the zio to another taskq at the same time.
+        */
+       ASSERT(taskq_empty_ent(&zio->io_tqent));
+       taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
+           (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
 }
 
 static boolean_t
@@ -2947,9 +2956,11 @@ zio_done(zio_t *zio)
                         * Reexecution is potentially a huge amount of work.
                         * Hand it off to the otherwise-unused claim taskq.
                         */
-                       (void) taskq_dispatch(
+                       ASSERT(taskq_empty_ent(&zio->io_tqent));
+                       (void) taskq_dispatch_ent(
                            zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-                           (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+                           (task_func_t *)zio_reexecute, zio, 0,
+                           &zio->io_tqent);
                }
                return (ZIO_PIPELINE_STOP);
        }