# ZPOOL fault verification test script.
#
# The current suite of fault tests should not be thought of as an
# exhaustive list of failure modes.  Rather, it is simply a starting point
# which tries to cover the bulk of the 'easy', and hopefully common,
# failure modes.
#
# Additional tests should be added to the current suite as new interesting
# failure modes are observed.  Additional failure modes I'd like to see
# tests for include, but are not limited to:
#
#	* Slow but successful IO.
#	* SCSI sense codes generated as zevents.
#
# The current infrastructure uses the 'mdadm' faulty device and the
# 'scsi_debug' simulated SCSI devices.  The idea is to inject the errors
# below the ZFS stack in order to validate all of the error paths.  More
# targeted failure testing should be added using the 'zinject' command
# line utility.
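#
# For reference, the same class of fault can be reproduced by hand with a
# faulty md device (a sketch only; /dev/md0 and /dev/loop0 are illustrative
# and /dev/loop0 is assumed to be a free loopback device backed by a
# scratch file):
#
#   mdadm --build /dev/md0 --level=faulty --raid-devices=1 /dev/loop0
#   mdadm /dev/md0 --grow --level=faulty --layout=write-transient
#   ...issue IO and observe the injected errors...
#   mdadm /dev/md0 --grow --level=faulty --layout=clear
#
# Injection above the block layer is done with zinject instead, e.g.
# 'zinject -d <vdev> -e io <pool>' to return EIO from a single vdev.
#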
# Requires the following packages:
basedir="$(dirname "$0")"

SCRIPT_COMMON=common.sh
if [ -f "${basedir}/${SCRIPT_COMMON}" ]; then
    . "${basedir}/${SCRIPT_COMMON}"
else
    echo "Missing helper script ${SCRIPT_COMMON}" && exit 1
fi

ZPOOL fault verification tests

-c      Cleanup md+lo+file devices at start
-t <#>  Run listed tests
-s <#>  Skip listed tests

while getopts 'hvct:s:?' OPTION; do
if [ $(id -u) != 0 ]; then
    die "Must run as root"
fi

# Perform pre-cleanup if requested.
if [ ${CLEANUP} ]; then
    rm -f /tmp/zpool.cache.*
fi

# Check if we need to skip all md based tests.
MD_PARTITIONABLE=0
check_md_partitionable && MD_PARTITIONABLE=1
if [ ${MD_PARTITIONABLE} -eq 0 ]; then
    echo "Skipping tests 1-7 which require partitionable md devices"
fi

# Check if we need to skip all the scsi_debug tests.
SCSI_DEBUG=0
${INFOMOD} scsi_debug &>/dev/null && SCSI_DEBUG=1
if [ ${SCSI_DEBUG} -eq 0 ]; then
    echo "Skipping tests 8-9 which require the scsi_debug module"
fi

if [ ${MD_PARTITIONABLE} -eq 0 ] || [ ${SCSI_DEBUG} -eq 0 ]; then
    echo
fi

printf "%40s%s\t%s\t%s\t%s\t%s\n" "" "raid0" "raid10" "raidz" "raidz2" "raidz3"

pass_nonewline() {
    echo -n -e "${COLOR_GREEN}Pass${COLOR_RESET}\t"
}

skip_nonewline() {
    echo -n -e "${COLOR_BROWN}Skip${COLOR_RESET}\t"
}

nth_zpool_vdev() {
    local POOL_NAME=$1
    local DEVICE_TYPE=$2
    local DEVICE_NTH=$3

    ${ZPOOL} status ${POOL_NAME} | grep ${DEVICE_TYPE} | \
        head -n${DEVICE_NTH} | tail -n1 | ${AWK} "{ print \$1 }"
}

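# Usage example (illustrative; the actual vdev names depend on how the
# pool was constructed): 'nth_zpool_vdev tank md 2' prints the name of
# the second md-backed vdev listed by 'zpool status', e.g. 'md1'.
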
vdev_status() {
    local POOL_NAME=$1
    local VDEV_NAME=$2

    ${ZPOOL} status ${POOL_NAME} | ${AWK} "/${VDEV_NAME}/ { print \$2 }"
}

# Required format is x.yz[KMGTP]
expand_numeric_suffix() {
    local VALUE=$1

    VALUE=`echo "${VALUE/%K/*1000}"`
    VALUE=`echo "${VALUE/%M/*1000000}"`
    VALUE=`echo "${VALUE/%G/*1000000000}"`
    VALUE=`echo "${VALUE/%T/*1000000000000}"`
    VALUE=`echo "${VALUE/%P/*1000000000000000}"`
    VALUE=`echo "${VALUE}" | bc | cut -d'.' -f1`

    echo "${VALUE}"
}

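# Usage example: 'expand_numeric_suffix 1.2M' prints 1200000, which lets
# the abbreviated counts reported by 'zpool status' be compared with
# ordinary integer tests.
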
vdev_read_errors() {
    local POOL_NAME=$1
    local VDEV_NAME=$2

    local VDEV_ERRORS=`${ZPOOL} status ${POOL_NAME} |
        ${AWK} "/${VDEV_NAME}/ { print \\$3 }"`

    expand_numeric_suffix ${VDEV_ERRORS}
}

vdev_write_errors() {
    local POOL_NAME=$1
    local VDEV_NAME=$2

    local VDEV_ERRORS=`${ZPOOL} status ${POOL_NAME} |
        ${AWK} "/${VDEV_NAME}/ { print \\$4 }"`

    expand_numeric_suffix ${VDEV_ERRORS}
}

vdev_cksum_errors() {
    local POOL_NAME=$1
    local VDEV_NAME=$2

    local VDEV_ERRORS=`${ZPOOL} status ${POOL_NAME} |
        ${AWK} "/${VDEV_NAME}/ { print \\$5 }"`

    expand_numeric_suffix ${VDEV_ERRORS}
}

zpool_state() {
    local POOL_NAME=$1

    ${ZPOOL} status ${POOL_NAME} | ${AWK} "/state/ { print \$2; exit }"
}

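# Usage example: 'zpool_state tank' prints the pool state reported by
# 'zpool status', e.g. ONLINE or DEGRADED.
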
zpool_event() {
    local EVENT_NAME=$1
    local EVENT_KEY=$2

    SCRIPT1="BEGIN {RS=\"\"; FS=\"\n\"} /${EVENT_NAME}/ { print \$0; exit }"
    SCRIPT2="BEGIN {FS=\"=\"} /${EVENT_KEY}/ { print \$2; exit }"

    ${ZPOOL} events -vH | ${AWK} "${SCRIPT1}" | ${AWK} "${SCRIPT2}"
}

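# Usage example: 'zpool_event "zfs.io" "zio_err"' prints the zio_err
# value (e.g. 0x5 for EIO) from the first matching event record.
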
zpool_scan_errors() {
    local POOL_NAME=$1

    ${ZPOOL} status ${POOL_NAME} | ${AWK} "/scan: scrub/ { print \$8 }"
    ${ZPOOL} status ${POOL_NAME} | ${AWK} "/scan: resilver/ { print \$7 }"
}

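# The awk field numbers above pick the error count out of status lines
# of the form (illustrative):
#
#   scan: scrub repaired 0 in 0h2m with 0 errors on <date>
#   scan: resilvered 64K in 0h0m with 0 errors on <date>
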
pattern_create() {
    local PATTERN_BLOCK_SIZE=$1
    local PATTERN_BLOCK_COUNT=$2
    local PATTERN_NAME=`mktemp -p /tmp zpool.pattern.XXXXXXXX`

    dd if=/dev/urandom of=${PATTERN_NAME} bs=${PATTERN_BLOCK_SIZE} \
        count=${PATTERN_BLOCK_COUNT} &>/dev/null
    echo ${PATTERN_NAME}
}

pattern_write() {
    local PATTERN_NAME=$1
    local PATTERN_BLOCK_SIZE=$2
    local PATTERN_BLOCK_COUNT=$3
    local DEVICE_NAME=$4

    dd if=${PATTERN_NAME} of=${DEVICE_NAME} bs=${PATTERN_BLOCK_SIZE} \
        count=${PATTERN_BLOCK_COUNT} oflag=direct &>/dev/null
}

pattern_write_bg() {
    local PATTERN_NAME=$1
    local PATTERN_BLOCK_SIZE=$2
    local PATTERN_BLOCK_COUNT=$3
    local DEVICE_NAME=$4

    dd if=${PATTERN_NAME} of=${DEVICE_NAME} bs=${PATTERN_BLOCK_SIZE} \
        count=${PATTERN_BLOCK_COUNT} oflag=direct &>/dev/null &
}

pattern_verify() {
    local PATTERN_NAME=$1
    local PATTERN_BLOCK_SIZE=$2
    local PATTERN_BLOCK_COUNT=$3
    local DEVICE_NAME=$4
    local DEVICE_FILE=`mktemp -p /tmp zpool.pattern.XXXXXXXX`

    dd if=${DEVICE_NAME} of=${DEVICE_FILE} bs=${PATTERN_BLOCK_SIZE} \
        count=${PATTERN_BLOCK_COUNT} iflag=direct &>/dev/null
    cmp -s ${PATTERN_NAME} ${DEVICE_FILE}
    local RC=$?
    rm -f ${DEVICE_FILE}
    return ${RC}
}

pattern_remove() {
    local PATTERN_NAME=$1

    rm -f ${PATTERN_NAME}
}

fault_set_md() {
    local VDEV_FAULTY=$1
    local FAULT_TYPE=$2

    ${MDADM} /dev/${VDEV_FAULTY} --grow --level=faulty \
        --layout=${FAULT_TYPE} >/dev/null
}

fault_clear_md() {
    local VDEV_FAULTY=$1

    # Clear all failure injection.
    ${MDADM} /dev/${VDEV_FAULTY} --grow --level=faulty \
        --layout=clear >/dev/null || return $?
    ${MDADM} /dev/${VDEV_FAULTY} --grow --level=faulty \
        --layout=flush >/dev/null || return $?
}

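# For reference, the md(4) 'faulty' personality accepts the fault layouts
# write-transient, read-transient, write-persistent, read-persistent,
# write-all, and read-fixable, plus the control layouts 'clear' and
# 'flush' used above to remove any injected faults.
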
fault_set_sd() {
    local OPTS=$1
    local NTH=$2

    echo ${OPTS} >/sys/bus/pseudo/drivers/scsi_debug/opts
    echo ${NTH} >/sys/bus/pseudo/drivers/scsi_debug/every_nth
}

fault_clear_sd() {
    echo 0 >/sys/bus/pseudo/drivers/scsi_debug/every_nth
    echo 0 >/sys/bus/pseudo/drivers/scsi_debug/opts
}

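# The scsi_debug 'opts' value is a bitmask; per the module documentation
# 1=noise, 2=medium error, 4=timeout, and 8=recovered error, applied to
# every 'every_nth' command.  For example, to time out every 50th command:
#
#   echo 4  >/sys/bus/pseudo/drivers/scsi_debug/opts
#   echo 50 >/sys/bus/pseudo/drivers/scsi_debug/every_nth
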
test_setup() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local ZVOL_NAME=$3
    local TMP_CACHE=$4

    ${ZFS_SH} zfs="spa_config_path=${TMP_CACHE}" || fail 1
    ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c ${POOL_CONFIG} || fail 2
    ${ZFS} create -V 64M ${POOL_NAME}/${ZVOL_NAME} || fail 3

    # Trigger udev and re-read the partition table to ensure all of
    # this IO is out of the way before we begin injecting failures.
    udev_trigger || fail 4
    ${BLOCKDEV} --rereadpt /dev/${POOL_NAME}/${ZVOL_NAME} || fail 5
}

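# (udev_trigger is provided by common.sh; it is assumed to be roughly
# equivalent to 'udevadm trigger --subsystem-match=block' followed by
# 'udevadm settle', ensuring the zvol device nodes exist before use.)
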
test_cleanup() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local ZVOL_NAME=$3
    local TMP_CACHE=$4

    ${ZFS} destroy ${POOL_NAME}/${ZVOL_NAME} || fail 101
    ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c ${POOL_CONFIG} -d || fail 102
    ${ZFS_SH} -u || fail 103
    rm -f ${TMP_CACHE} || fail 104
}

test_write_soft() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Set a soft write failure for the first vdev device.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md 1`
    fault_set_md ${VDEV_FAULTY} write-transient

    # The application must not observe an error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12
    fault_clear_md ${VDEV_FAULTY}

    # Soft errors will not be logged to 'zpool status'.
    local WRITE_ERRORS=`vdev_write_errors ${POOL_NAME} ${VDEV_FAULTY}`
    test ${WRITE_ERRORS} -eq 0 || fail 13

    # Soft errors will still generate an EIO (5) event.
    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 14

    # Verify the known pattern.
    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 15
    pattern_remove ${TMP_PATTERN} || fail 16

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Soft write error.
test_1() {
    test_write_soft tank lo-faulty-raid0 0
    test_write_soft tank lo-faulty-raid10 1
    test_write_soft tank lo-faulty-raidz 1
    test_write_soft tank lo-faulty-raidz2 1
    test_write_soft tank lo-faulty-raidz3 1
}
run_test 1 "soft write error"

test_write_hard() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Set a hard write failure for the first vdev device.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md 1`
    fault_set_md ${VDEV_FAULTY} write-persistent

    # The application must not observe an error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12
    fault_clear_md ${VDEV_FAULTY}

    local WRITE_ERRORS=`vdev_write_errors ${POOL_NAME} ${VDEV_FAULTY}`
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        # For redundant configurations hard errors will not be
        # logged to 'zpool status' but will generate EIO events.
        test ${WRITE_ERRORS} -eq 0 || fail 21
        test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 22
    else
        # For non-redundant configurations hard errors will be
        # logged to 'zpool status' and generate EIO events.  They
        # will also trigger a scrub of the impacted sectors.
        test ${WRITE_ERRORS} -gt 0 || fail 31
        test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 32
        test `zpool_event "zfs.resilver.start" "ena"` != "" || fail 33
        test `zpool_event "zfs.resilver.finish" "ena"` != "" || fail 34
        test `zpool_scan_errors ${POOL_NAME}` -eq 0 || fail 35
    fi

    # Verify the known pattern.
    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 41
    pattern_remove ${TMP_PATTERN} || fail 42

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Hard write error.
test_2() {
    test_write_hard tank lo-faulty-raid0 0
    test_write_hard tank lo-faulty-raid10 1
    test_write_hard tank lo-faulty-raidz 1
    test_write_hard tank lo-faulty-raidz2 1
    test_write_hard tank lo-faulty-raidz3 1
}
run_test 2 "hard write error"

test_write_all() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Set all write failures for the first vdev device.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md 1`
    fault_set_md ${VDEV_FAULTY} write-all

    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        # The application must not observe an error.
        pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12
    else
        # The application is expected to hang in the background until
        # the faulty device is repaired and 'zpool clear' is run.
        pattern_write_bg ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 13
    fi

    fault_clear_md ${VDEV_FAULTY}

    local WRITE_ERRORS=`vdev_write_errors ${POOL_NAME} ${VDEV_FAULTY}`
    local VDEV_STATUS=`vdev_status ${POOL_NAME} ${VDEV_FAULTY}`
    local POOL_STATE=`zpool_state ${POOL_NAME}`

    # For all configurations write errors are logged to 'zpool status',
    # and EIO events are generated.  However, only a redundant config
    # will cause the vdev to be FAULTED and the pool DEGRADED.  In a
    # non-redundant config the IO will hang until 'zpool clear' is run.
    test ${WRITE_ERRORS} -gt 0 || fail 14
    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 15

    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        test "${VDEV_STATUS}" = "FAULTED" || fail 21
        test "${POOL_STATE}" = "DEGRADED" || fail 22
    else
        BLOCKED=`ps a | grep "${ZVOL_DEVICE}" | grep -c -v "grep"`
        ${ZPOOL} clear ${POOL_NAME} || fail 31
        test ${BLOCKED} -eq 1 || fail 32
    fi

    # Verify the known pattern.
    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 41
    pattern_remove ${TMP_PATTERN} || fail 42

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# All write errors.
test_3() {
    test_write_all tank lo-faulty-raid0 0
    test_write_all tank lo-faulty-raid10 1
    test_write_all tank lo-faulty-raidz 1
    test_write_all tank lo-faulty-raidz2 1
    test_write_all tank lo-faulty-raidz3 1
}
run_test 3 "all write errors"

test_read_soft() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Create a pattern to be verified during a read error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12

    # Set a soft read failure for all the vdevs to ensure we hit one.
    for (( i=1; i<=4; i++ )); do
        fault_set_md `nth_zpool_vdev ${POOL_NAME} md $i` read-transient
    done

    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 13
    pattern_remove ${TMP_PATTERN} || fail 14

    # Clear all failure injection and sum the read errors.
    local READ_ERRORS=0
    for (( i=1; i<=4; i++ )); do
        local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md $i`
        local VDEV_ERRORS=`vdev_read_errors ${POOL_NAME} ${VDEV_FAULTY}`
        let READ_ERRORS=${READ_ERRORS}+${VDEV_ERRORS}
        fault_clear_md ${VDEV_FAULTY}
    done

    # Soft errors will not be logged to 'zpool status'.
    test ${READ_ERRORS} -eq 0 || fail 15

    # Soft errors will still generate an EIO (5) event.
    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 16

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Soft read error.
test_4() {
    test_read_soft tank lo-faulty-raid0 0
    test_read_soft tank lo-faulty-raid10 1
    test_read_soft tank lo-faulty-raidz 1
    test_read_soft tank lo-faulty-raidz2 1
    test_read_soft tank lo-faulty-raidz3 1
}
run_test 4 "soft read error"

test_read_hard() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Create a pattern to be verified during a read error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12

    # Set a hard read failure for the fourth vdev.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md 4`
    fault_set_md ${VDEV_FAULTY} read-persistent

    # For a redundant pool there must be no IO error; for a non-redundant
    # pool we expect permanent damage and an IO error during verify, unless
    # we get exceptionally lucky and have only damaged redundant metadata.
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 21
        local READ_ERRORS=`vdev_read_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${READ_ERRORS} -eq 0 || fail 22
    else
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE}
        ${ZPOOL} scrub ${POOL_NAME} || fail 32
        local READ_ERRORS=`vdev_read_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${READ_ERRORS} -gt 0 || fail 33
        ${ZPOOL} status -v ${POOL_NAME} | \
            grep -A8 "Permanent errors" | \
            grep -q "${POOL_NAME}" || fail 34
    fi

    pattern_remove ${TMP_PATTERN} || fail 41

    # Clear all failure injection.
    fault_clear_md ${VDEV_FAULTY}

    # Hard errors will generate an EIO (5) event.
    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 42

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Hard read error.
test_5() {
    test_read_hard tank lo-faulty-raid0 0
    test_read_hard tank lo-faulty-raid10 1
    test_read_hard tank lo-faulty-raidz 1
    test_read_hard tank lo-faulty-raidz2 1
    test_read_hard tank lo-faulty-raidz3 1
}
run_test 5 "hard read error"

# Fixable read error.
test_read_fixable() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Create a pattern to be verified during a read error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12

    # Set a fixable read failure for the fourth vdev.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md 4`
    fault_set_md ${VDEV_FAULTY} read-fixable

    # For a redundant pool there must be no IO error; for a non-redundant
    # pool we expect permanent damage and an IO error during verify, unless
    # we get exceptionally lucky and have only damaged redundant metadata.
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 21
        local READ_ERRORS=`vdev_read_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${READ_ERRORS} -eq 0 || fail 22
    else
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE}
        ${ZPOOL} scrub ${POOL_NAME} || fail 32
        local READ_ERRORS=`vdev_read_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${READ_ERRORS} -gt 0 || fail 33
        ${ZPOOL} status -v ${POOL_NAME} | \
            grep -A8 "Permanent errors" | \
            grep -q "${POOL_NAME}" || fail 34
    fi

    pattern_remove ${TMP_PATTERN} || fail 41

    # Clear all failure injection.
    fault_clear_md ${VDEV_FAULTY}

    # Hard errors will generate an EIO (5) event.
    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 42

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Read errors fixable with a write.
test_6() {
    test_read_fixable tank lo-faulty-raid0 0
    test_read_fixable tank lo-faulty-raid10 1
    test_read_fixable tank lo-faulty-raidz 1
    test_read_fixable tank lo-faulty-raidz2 1
    test_read_fixable tank lo-faulty-raidz3 1
}
run_test 6 "fixable read error"

test_cksum() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local VDEV_DAMAGE="$4"
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${MD_PARTITIONABLE} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Create a pattern to be verified.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12

    # Verify the pattern and that no vdev has cksum errors.
    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 13
    for (( i=1; i<=4; i++ )); do
        VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md ${i}`
        CKSUM_ERRORS=`vdev_cksum_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${CKSUM_ERRORS} -eq 0 || fail 14
    done

    # Corrupt the bulk of a vdev with random garbage; we damage as many
    # vdevs as we have levels of redundancy.  For example, for a raidz3
    # configuration we can trash 3 vdevs and still expect correct data.
    # This improves the odds that we read one of the damaged vdevs.
    for VDEV in ${VDEV_DAMAGE}; do
        VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} md $VDEV`
        pattern_write /dev/urandom 1M 64 /dev/${VDEV_FAULTY}p1
    done

    # Verify the pattern is still correct.  For non-redundant pools
    # expect failure, and for redundant pools success due to the pool's
    # redundancy.
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 16
    else
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} && fail 17
    fi

    CKSUM_ERRORS=`vdev_cksum_errors ${POOL_NAME} ${VDEV_FAULTY}`
    test ${CKSUM_ERRORS} -gt 0 || fail 18
    STATUS=`vdev_status ${POOL_NAME} ${VDEV_FAULTY}`
    test "${STATUS}" = "ONLINE" || fail 19

    # The checksum errors must be logged as an event.  On Linux ZFS
    # reports ECKSUM as EBADE, which is decimal 52, hence 0x34.
    local CKSUM_ERRORS=`zpool_event "zfs.checksum" "zio_err"`
    test ${CKSUM_ERRORS} = "0x34" || test ${CKSUM_ERRORS} = "0x0" || fail 20

    # Verify permanent errors for non-redundant pools, and for redundant
    # pools trigger a scrub and check that all checksums have been fixed.
    if [ ${POOL_REDUNDANT} -eq 1 ]; then
        # Scrub the checksum errors and clear the faults.
        ${ZPOOL} scrub ${POOL_NAME} || fail 21
        ${ZPOOL} clear ${POOL_NAME} || fail 22

        # Re-verify the pattern for fixed checksums.
        pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 23
        CKSUM_ERRORS=`vdev_cksum_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${CKSUM_ERRORS} -eq 0 || fail 24

        # Re-verify the entire pool for fixed checksums.
        ${ZPOOL} scrub ${POOL_NAME} || fail 25
        CKSUM_ERRORS=`vdev_cksum_errors ${POOL_NAME} ${VDEV_FAULTY}`
        test ${CKSUM_ERRORS} -eq 0 || fail 26
    else
        ${ZPOOL} status -v ${POOL_NAME} | \
            grep -A8 "Permanent errors" | \
            grep -q "${POOL_NAME}/${ZVOL_NAME}" || fail 31
        ${ZPOOL} clear ${POOL_NAME} || fail 32
    fi

    pattern_remove ${TMP_PATTERN} || fail 41

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

# Silent data corruption.
test_7() {
    test_cksum tank lo-faulty-raid0 0 "1"
    test_cksum tank lo-faulty-raid10 1 "1 3"
    test_cksum tank lo-faulty-raidz 1 "4"
    test_cksum tank lo-faulty-raidz2 1 "3 4"
    test_cksum tank lo-faulty-raidz3 1 "2 3 4"
}
run_test 7 "silent data corruption"

# Soft write timeout at the scsi device layer.
test_write_timeout_soft() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local POOL_NTH=$4
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${SCSI_DEBUG} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    # Set timeout (0x4) for every nth command.
    fault_set_sd 4 ${POOL_NTH}

    # The application must not observe an error.
    local TMP_PATTERN=`pattern_create 1M 8` || fail 11
    pattern_write ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 12
    fault_clear_sd

    # Intermittent write timeouts, even with FAILFAST set, may not cause
    # an EIO (5) event.  This is because how FAILFAST is handled depends
    # a lot on the low level driver and the exact nature of the failure.
    # We will however see a 'zfs.delay' event logged due to the timeout.
    VDEV_DELAY=`zpool_event "zfs.delay" "zio_delay"`
    test `printf "%d" ${VDEV_DELAY}` -ge 30000 || fail 13

    # Verify the known pattern.
    pattern_verify ${TMP_PATTERN} 1M 8 ${ZVOL_DEVICE} || fail 14
    pattern_remove ${TMP_PATTERN} || fail 15

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

test_8() {
    test_write_timeout_soft tank scsi_debug-raid0 0 50
    test_write_timeout_soft tank scsi_debug-raid10 1 100
    test_write_timeout_soft tank scsi_debug-raidz 1 75
    test_write_timeout_soft tank scsi_debug-raidz2 1 150
    test_write_timeout_soft tank scsi_debug-raidz3 1 300
}
run_test 8 "soft write timeout"

# Persistent write timeout at the scsi device layer.
test_write_timeout_hard() {
    local POOL_NAME=$1
    local POOL_CONFIG=$2
    local POOL_REDUNDANT=$3
    local POOL_NTH=$4
    local ZVOL_NAME="zvol"
    local ZVOL_DEVICE="/dev/${POOL_NAME}/${ZVOL_NAME}"

    if [ ${SCSI_DEBUG} -eq 0 ]; then
        skip_nonewline
        return
    fi

    local TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX`
    test_setup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}

    local TMP_PATTERN1=`pattern_create 1M 8`
    local TMP_PATTERN2=`pattern_create 1M 8`
    local TMP_PATTERN3=`pattern_create 1M 8`

    # Create three partitions, each of which gets a unique pattern.  The
    # first pattern is written before the failure, the second pattern
    # during the failure, and the third pattern while the vdev is
    # degraded.  All three patterns are verified while the vdev is
    # degraded and then again once it is brought back online.
    ${PARTED} -s ${ZVOL_DEVICE} mklabel gpt || fail 11
    ${PARTED} -s ${ZVOL_DEVICE} mkpart primary 1M 16M || fail 12
    ${PARTED} -s ${ZVOL_DEVICE} mkpart primary 16M 32M || fail 13
    ${PARTED} -s ${ZVOL_DEVICE} mkpart primary 32M 48M || fail 14

    wait_udev ${ZVOL_DEVICE}1 30
    wait_udev ${ZVOL_DEVICE}2 30
    wait_udev ${ZVOL_DEVICE}3 30

    # Before the failure.
    pattern_write ${TMP_PATTERN1} 1M 8 ${ZVOL_DEVICE}1 || fail 15

    # Get the faulty vdev name.
    local VDEV_FAULTY=`nth_zpool_vdev ${POOL_NAME} sd 1`

    # Set timeout (0x4) for every nth command.
    fault_set_sd 4 ${POOL_NTH}

    # During the failure.
    pattern_write ${TMP_PATTERN2} 1M 8 ${ZVOL_DEVICE}2 || fail 21

    # Expect write errors to be logged to 'zpool status'.
    local WRITE_ERRORS=`vdev_write_errors ${POOL_NAME} ${VDEV_FAULTY}`
    test ${WRITE_ERRORS} -gt 0 || fail 22

    local VDEV_STATUS=`vdev_status ${POOL_NAME} ${VDEV_FAULTY}`
    test "${VDEV_STATUS}" = "UNAVAIL" || fail 23

    # Clear the fault injection and remove the device from /dev/.
    fault_clear_sd
    rm -f /dev/${VDEV_FAULTY}[0-9]

    # Verify the first two patterns and write out the third.
    pattern_write ${TMP_PATTERN3} 1M 8 ${ZVOL_DEVICE}3 || fail 31
    pattern_verify ${TMP_PATTERN1} 1M 8 ${ZVOL_DEVICE}1 || fail 32
    pattern_verify ${TMP_PATTERN2} 1M 8 ${ZVOL_DEVICE}2 || fail 33
    pattern_verify ${TMP_PATTERN3} 1M 8 ${ZVOL_DEVICE}3 || fail 34

    # Bring the device back online by rescanning for it.  It must appear
    # in lsscsi and be available to dd before allowing ZFS to bring it
    # online.  This is not required but provides additional sanity.
    local RESCAN=1
    while [ ${RESCAN} -eq 1 ]; do
        scsi_rescan
        wait_udev /dev/${VDEV_FAULTY} 30
        if [ `${LSSCSI} | grep -c "/dev/${VDEV_FAULTY}"` -eq 0 ]; then
            continue
        fi
        dd if=/dev/${VDEV_FAULTY} of=/dev/null bs=8M count=1 &>/dev/null
        if [ $? -ne 0 ]; then
            continue
        fi
        RESCAN=0
    done

    # Bring the device back online.  We expect it to be automatically
    # resilvered without error and we should see minimally the zfs.io,
    # zfs.statechange (VDEV_STATE_HEALTHY (0x7)), and zfs.resilver.*
    # events.
    ${ZPOOL} online ${POOL_NAME} ${VDEV_FAULTY}1 || fail 51

    test `zpool_event "zfs.io" "zio_err"` = "0x5" || fail 52
    test `zpool_event "zfs.statechange" "vdev_state"` = "0x7" || fail 53
    test `zpool_event "zfs.resilver.start" "ena"` != "" || fail 54
    test `zpool_event "zfs.resilver.finish" "ena"` != "" || fail 55
    test `zpool_scan_errors ${POOL_NAME}` -eq 0 || fail 56

    local VDEV_STATUS=`vdev_status ${POOL_NAME} ${VDEV_FAULTY}`
    test "${VDEV_STATUS}" = "ONLINE" || fail 57

    # Verify the known patterns.
    pattern_verify ${TMP_PATTERN1} 1M 8 ${ZVOL_DEVICE}1 || fail 61
    pattern_verify ${TMP_PATTERN2} 1M 8 ${ZVOL_DEVICE}2 || fail 62
    pattern_verify ${TMP_PATTERN3} 1M 8 ${ZVOL_DEVICE}3 || fail 63
    pattern_remove ${TMP_PATTERN1} || fail 64
    pattern_remove ${TMP_PATTERN2} || fail 65
    pattern_remove ${TMP_PATTERN3} || fail 66

    test_cleanup ${POOL_NAME} ${POOL_CONFIG} ${ZVOL_NAME} ${TMP_CACHE}
    pass_nonewline
}

test_9() {
    skip_nonewline # Skip the non-redundant config.
    test_write_timeout_hard tank scsi_debug-raid10 1 -50
    test_write_timeout_hard tank scsi_debug-raidz 1 -50
    test_write_timeout_hard tank scsi_debug-raidz2 1 -50
    test_write_timeout_hard tank scsi_debug-raidz3 1 -50
}
run_test 9 "hard write timeout"