module/zfs/vdev_disk.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  23  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  24  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  25  * LLNL-CODE-403049.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_disk.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/zio.h>
  34 #include <sys/sunldi.h>
  35
  36 /*
  37  * Virtual device vector for disks.
  38  */
  39 typedef struct dio_request {
  40         struct completion       dr_comp;        /* Completion for sync IO */
  41         atomic_t                dr_ref;         /* References */
  42         zio_t                   *dr_zio;        /* Parent ZIO */
  43         int                     dr_rw;          /* Read/Write */
  44         int                     dr_error;       /* Bio error */
  45         int                     dr_bio_count;   /* Count of bio's */
  46         struct bio              *dr_bio[0];     /* Attached bio's */
  47 } dio_request_t;
  48
  49
  50 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
  51 static fmode_t
  52 vdev_bdev_mode(int smode)
  53 {
  54         fmode_t mode = 0;
  55
  56         ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
  57
  58         if (smode & FREAD)
  59                 mode |= FMODE_READ;
  60
  61         if (smode & FWRITE)
  62                 mode |= FMODE_WRITE;
  63
  64         return mode;
  65 }
  66 #else
  67 static int
  68 vdev_bdev_mode(int smode)
  69 {
  70         int mode = 0;
  71
  72         ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
  73
  74         if ((smode & FREAD) && !(smode & FWRITE))
  75                 mode = MS_RDONLY;
  76
  77         return mode;
  78 }
  79 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
  80
  81 static uint64_t
  82 bdev_capacity(struct block_device *bdev)
  83 {
  84         struct hd_struct *part = bdev->bd_part;
  85
  86         /* The partition capacity referenced by the block device */
  87         if (part)
  88                return part->nr_sects;
  89
  90         /* Otherwise assume the full device capacity */
  91         return get_capacity(bdev->bd_disk);
  92 }
  93
  94 static int
  95 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
  96 {
  97         struct block_device *bdev;
  98         vdev_disk_t *vd;
  99         int mode, block_size;
 100
 101         /* Must have a pathname and it must be absolute. */
 102         if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 103                 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 104                 return EINVAL;
 105         }
 106
 107         vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
 108         if (vd == NULL)
 109                 return ENOMEM;
 110
 111         /*
 112          * Devices are always opened by the path provided at configuration
 113          * time.  This means that if the provided path is a udev by-id path
 114          * then drives may be recabled without an issue.  If the provided
 115          * path is a udev by-path path then the physical location information
 116          * will be preserved.  This can be critical for more complicated
 117          * configurations where drives are located in specific physical
 118          * locations to maximize the systems tolerence to component failure.
 119          * Alternately you can provide your own udev rule to flexibly map
 120          * the drives as you see fit.  It is not advised that you use the
 121          * /dev/[hd]d devices which may be reorder due to probing order.
 122          * Devices in the wrong locations will be detected by the higher
 123          * level vdev validation.
 124          */
 125         mode = spa_mode(v->vdev_spa);
 126         bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
 127         if (IS_ERR(bdev)) {
 128                 kmem_free(vd, sizeof(vdev_disk_t));
 129                 return -PTR_ERR(bdev);
 130         }
 131
 132         v->vdev_tsd = vd;
 133         vd->vd_bdev = bdev;
 134         block_size =  vdev_bdev_block_size(bdev);
 135
 136         /* Check if this is a whole device.  When bdev->bd_contains ==
 137          * bdev we have a whole device and not simply a partition. */
 138         v->vdev_wholedisk = !!(bdev->bd_contains == bdev);
 139
 140         /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 141         v->vdev_nowritecache = B_FALSE;
 142
 143         /* Physical volume size in bytes */
 144         *psize = bdev_capacity(bdev) * block_size;
 145
 146         /* Based on the minimum sector size set the block size */
 147         *ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 148
 149         return 0;
 150 }
 151
 152 static void
 153 vdev_disk_close(vdev_t *v)
 154 {
 155         vdev_disk_t *vd = v->vdev_tsd;
 156
 157         if (vd == NULL)
 158                 return;
 159
 160         if (vd->vd_bdev != NULL)
 161                 vdev_bdev_close(vd->vd_bdev,
 162                                 vdev_bdev_mode(spa_mode(v->vdev_spa)));
 163
 164         kmem_free(vd, sizeof(vdev_disk_t));
 165         v->vdev_tsd = NULL;
 166 }
 167
 168 static dio_request_t *
 169 vdev_disk_dio_alloc(int bio_count)
 170 {
 171         dio_request_t *dr;
 172         int i;
 173
 174         dr = kmem_zalloc(sizeof(dio_request_t) +
 175                          sizeof(struct bio *) * bio_count, KM_SLEEP);
 176         if (dr) {
 177                 init_completion(&dr->dr_comp);
 178                 atomic_set(&dr->dr_ref, 0);
 179                 dr->dr_bio_count = bio_count;
 180                 dr->dr_error = 0;
 181
 182                 for (i = 0; i < dr->dr_bio_count; i++)
 183                         dr->dr_bio[i] = NULL;
 184         }
 185
 186         return dr;
 187 }
 188
 189 static void
 190 vdev_disk_dio_free(dio_request_t *dr)
 191 {
 192         int i;
 193
 194         for (i = 0; i < dr->dr_bio_count; i++)
 195                 if (dr->dr_bio[i])
 196                         bio_put(dr->dr_bio[i]);
 197
 198         kmem_free(dr, sizeof(dio_request_t) +
 199                   sizeof(struct bio *) * dr->dr_bio_count);
 200 }
 201
 202 static void
 203 vdev_disk_dio_get(dio_request_t *dr)
 204 {
 205         atomic_inc(&dr->dr_ref);
 206 }
 207
 208 static int
 209 vdev_disk_dio_put(dio_request_t *dr)
 210 {
 211         int rc = atomic_dec_return(&dr->dr_ref);
 212
 213         /*
 214          * Free the dio_request when the last reference is dropped and
 215          * ensure zio_interpret is called only once with the correct zio
 216          */
 217         if (rc == 0) {
 218                 zio_t *zio = dr->dr_zio;
 219                 int error = dr->dr_error;
 220
 221                 vdev_disk_dio_free(dr);
 222
 223                 if (zio) {
 224                         zio->io_error = error;
 225                         zio_interrupt(zio);
 226                 }
 227         }
 228
 229         return rc;
 230 }
 231
 232 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 233 {
 234         dio_request_t *dr = bio->bi_private;
 235         int rc;
 236
 237         /* Fatal error but print some useful debugging before asserting */
 238         if (dr == NULL)
 239                 PANIC("dr == NULL, bio->bi_private == NULL\n"
 240                     "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
 241                     "bi_idx: %d, bi_size: %d, bi_end_io: %p, bi_cnt: %d\n",
 242                     bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
 243                     bio->bi_idx, bio->bi_size, bio->bi_end_io,
 244                     atomic_read(&bio->bi_cnt));
 245
 246 #ifndef HAVE_2ARGS_BIO_END_IO_T
 247         if (bio->bi_size)
 248                 return 1;
 249 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 250
 251         if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
 252                 error = EIO;
 253
 254         if (dr->dr_error == 0)
 255                 dr->dr_error = error;
 256
 257         /* Drop reference aquired by __vdev_disk_physio */
 258         rc = vdev_disk_dio_put(dr);
 259
 260         /* Wake up synchronous waiter this is the last outstanding bio */
 261         if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
 262                 complete(&dr->dr_comp);
 263
 264         BIO_END_IO_RETURN(0);
 265 }
 266
 267 static inline unsigned long
 268 bio_nr_pages(void *bio_ptr, unsigned int bio_size)
 269 {
 270         return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >>
 271                 PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT));
 272 }
 273
 274 static unsigned int
 275 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
 276 {
 277         unsigned int offset, size, i;
 278         struct page *page;
 279
 280         offset = offset_in_page(bio_ptr);
 281         for (i = 0; i < bio->bi_max_vecs; i++) {
 282                 size = PAGE_SIZE - offset;
 283
 284                 if (bio_size <= 0)
 285                         break;
 286
 287                 if (size > bio_size)
 288                         size = bio_size;
 289
 290                 if (kmem_virt(bio_ptr))
 291                         page = vmalloc_to_page(bio_ptr);
 292                 else
 293                         page = virt_to_page(bio_ptr);
 294
 295                 if (bio_add_page(bio, page, size, offset) != size)
 296                         break;
 297
 298                 bio_ptr  += size;
 299                 bio_size -= size;
 300                 offset = 0;
 301         }
 302
 303         return bio_size;
 304 }
 305
 306 static int
 307 __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
 308                    size_t kbuf_size, uint64_t kbuf_offset, int flags)
 309 {
 310         dio_request_t *dr;
 311         caddr_t bio_ptr;
 312         uint64_t bio_offset;
 313         int bio_size, bio_count = 16;
 314         int i = 0, error = 0, block_size;
 315
 316 retry:
 317         dr = vdev_disk_dio_alloc(bio_count);
 318         if (dr == NULL)
 319                 return ENOMEM;
 320
 321         dr->dr_zio = zio;
 322         dr->dr_rw = flags;
 323         block_size = vdev_bdev_block_size(bdev);
 324
 325 #ifdef BIO_RW_FAILFAST
 326         if (flags & (1 << BIO_RW_FAILFAST))
 327                 dr->dr_rw |= 1 << BIO_RW_FAILFAST;
 328 #endif /* BIO_RW_FAILFAST */
 329
 330         /*
 331          * When the IO size exceeds the maximum bio size for the request
 332          * queue we are forced to break the IO in multiple bio's and wait
 333          * for them all to complete.  Ideally, all pool users will set
 334          * their volume block size to match the maximum request size and
 335          * the common case will be one bio per vdev IO request.
 336          */
 337         bio_ptr    = kbuf_ptr;
 338         bio_offset = kbuf_offset;
 339         bio_size   = kbuf_size;
 340         for (i = 0; i <= dr->dr_bio_count; i++) {
 341
 342                 /* Finished constructing bio's for given buffer */
 343                 if (bio_size <= 0)
 344                         break;
 345
 346                 /*
 347                  * By default only 'bio_count' bio's per dio are allowed.
 348                  * However, if we find ourselves in a situation where more
 349                  * are needed we allocate a larger dio and warn the user.
 350                  */
 351                 if (dr->dr_bio_count == i) {
 352                         vdev_disk_dio_free(dr);
 353                         bio_count *= 2;
 354                         printk("WARNING: Resized bio's/dio to %d\n",bio_count);
 355                         goto retry;
 356                 }
 357
 358                 dr->dr_bio[i] = bio_alloc(GFP_NOIO,
 359                                           bio_nr_pages(bio_ptr, bio_size));
 360                 if (dr->dr_bio[i] == NULL) {
 361                         vdev_disk_dio_free(dr);
 362                         return ENOMEM;
 363                 }
 364
 365                 /* Matching put called by vdev_disk_physio_completion */
 366                 vdev_disk_dio_get(dr);
 367
 368                 dr->dr_bio[i]->bi_bdev = bdev;
 369                 dr->dr_bio[i]->bi_sector = bio_offset / block_size;
 370                 dr->dr_bio[i]->bi_rw = dr->dr_rw;
 371                 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 372                 dr->dr_bio[i]->bi_private = dr;
 373
 374                 /* Remaining size is returned to become the new size */
 375                 bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);
 376
 377                 /* Advance in buffer and construct another bio if needed */
 378                 bio_ptr    += dr->dr_bio[i]->bi_size;
 379                 bio_offset += dr->dr_bio[i]->bi_size;
 380         }
 381
 382         /* Extra reference to protect dio_request during submit_bio */
 383         vdev_disk_dio_get(dr);
 384
 385         /* Submit all bio's associated with this dio */
 386         for (i = 0; i < dr->dr_bio_count; i++)
 387                 if (dr->dr_bio[i])
 388                         submit_bio(dr->dr_rw, dr->dr_bio[i]);
 389
 390         /*
 391          * On synchronous blocking requests we wait for all bio the completion
 392          * callbacks to run.  We will be woken when the last callback runs
 393          * for this dio.  We are responsible for putting the last dio_request
 394          * reference will in turn put back the last bio references.  The
 395          * only synchronous consumer is vdev_disk_read_rootlabel() all other
 396          * IO originating from vdev_disk_io_start() is asynchronous.
 397          */
 398         if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
 399                 wait_for_completion(&dr->dr_comp);
 400                 error = dr->dr_error;
 401                 ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
 402         }
 403
 404         (void)vdev_disk_dio_put(dr);
 405
 406         return error;
 407 }
 408
 409 int
 410 vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
 411                  size_t size, uint64_t offset, int flags)
 412 {
 413         return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);
 414 }
 415
 416 /* 2.6.24 API change */
 417 #ifdef HAVE_BIO_EMPTY_BARRIER
 418 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 419 {
 420         zio_t *zio = bio->bi_private;
 421
 422         zio->io_error = -rc;
 423         if (rc && (rc == -EOPNOTSUPP))
 424                 zio->io_vd->vdev_nowritecache = B_TRUE;
 425
 426         bio_put(bio);
 427         zio_interrupt(zio);
 428
 429         BIO_END_IO_RETURN(0);
 430 }
 431
 432 static int
 433 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 434 {
 435         struct request_queue *q;
 436         struct bio *bio;
 437
 438         q = bdev_get_queue(bdev);
 439         if (!q)
 440                 return ENXIO;
 441
 442         bio = bio_alloc(GFP_KERNEL, 0);
 443         if (!bio)
 444                 return ENOMEM;
 445
 446         bio->bi_end_io = vdev_disk_io_flush_completion;
 447         bio->bi_private = zio;
 448         bio->bi_bdev = bdev;
 449         submit_bio(WRITE_BARRIER, bio);
 450
 451         return 0;
 452 }
 453 #else
 454 static int
 455 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 456 {
 457         return ENOTSUP;
 458 }
 459 #endif /* HAVE_BIO_EMPTY_BARRIER */
 460
 461 static int
 462 vdev_disk_io_start(zio_t *zio)
 463 {
 464         vdev_t *v = zio->io_vd;
 465         vdev_disk_t *vd = v->vdev_tsd;
 466         int flags, error;
 467
 468         switch (zio->io_type) {
 469         case ZIO_TYPE_IOCTL:
 470
 471                 if (!vdev_readable(v)) {
 472                         zio->io_error = ENXIO;
 473                         return ZIO_PIPELINE_CONTINUE;
 474                 }
 475
 476                 switch (zio->io_cmd) {
 477                 case DKIOCFLUSHWRITECACHE:
 478
 479                         if (zfs_nocacheflush)
 480                                 break;
 481
 482                         if (v->vdev_nowritecache) {
 483                                 zio->io_error = ENOTSUP;
 484                                 break;
 485                         }
 486
 487                         error = vdev_disk_io_flush(vd->vd_bdev, zio);
 488                         if (error == 0)
 489                                 return ZIO_PIPELINE_STOP;
 490
 491                         zio->io_error = error;
 492                         if (error == ENOTSUP)
 493                                 v->vdev_nowritecache = B_TRUE;
 494
 495                         break;
 496
 497                 default:
 498                         zio->io_error = ENOTSUP;
 499                 }
 500
 501                 return ZIO_PIPELINE_CONTINUE;
 502
 503         case ZIO_TYPE_WRITE:
 504                 flags = WRITE;
 505                 break;
 506
 507         case ZIO_TYPE_READ:
 508                 flags = READ;
 509                 break;
 510
 511         default:
 512                 zio->io_error = ENOTSUP;
 513                 return ZIO_PIPELINE_CONTINUE;
 514         }
 515
 516 #ifdef BIO_RW_FAILFAST
 517         if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))
 518                 flags |= (1 << BIO_RW_FAILFAST);
 519 #endif /* BIO_RW_FAILFAST */
 520
 521         error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
 522                                    zio->io_size, zio->io_offset, flags);
 523         if (error) {
 524                 zio->io_error = error;
 525                 return ZIO_PIPELINE_CONTINUE;
 526         }
 527
 528         return ZIO_PIPELINE_STOP;
 529 }
 530
 531 static void
 532 vdev_disk_io_done(zio_t *zio)
 533 {
 534         /*
 535          * If the device returned EIO, we revalidate the media.  If it is
 536          * determined the media has changed this triggers the asynchronous
 537          * removal of the device from the configuration.
 538          */
 539         if (zio->io_error == EIO) {
 540                 vdev_t *v = zio->io_vd;
 541                 vdev_disk_t *vd = v->vdev_tsd;
 542
 543                 if (check_disk_change(vd->vd_bdev)) {
 544                         vdev_bdev_invalidate(vd->vd_bdev);
 545                         v->vdev_remove_wanted = B_TRUE;
 546                         spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 547                 }
 548         }
 549 }
 550
 551 static void
 552 vdev_disk_hold(vdev_t *vd)
 553 {
 554         ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 555
 556         /* We must have a pathname, and it must be absolute. */
 557         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
 558                 return;
 559
 560         /*
 561          * Only prefetch path and devid info if the device has
 562          * never been opened.
 563          */
 564         if (vd->vdev_tsd != NULL)
 565                 return;
 566
 567         /* XXX: Implement me as a vnode lookup for the device */
 568         vd->vdev_name_vp = NULL;
 569         vd->vdev_devid_vp = NULL;
 570 }
 571
 572 static void
 573 vdev_disk_rele(vdev_t *vd)
 574 {
 575         ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 576
 577         /* XXX: Implement me as a vnode rele for the device */
 578 }
 579
 580 vdev_ops_t vdev_disk_ops = {    /* operations vector for disk vdevs; entries are positional */
 581         vdev_disk_open,         /* open the backing block device */
 582         vdev_disk_close,        /* close it and free the private state */
 583         vdev_default_asize,     /* asize handler, defined outside this file */
 584         vdev_disk_io_start,     /* issue reads, writes and flush ioctls */
 585         vdev_disk_io_done,      /* post-IO hook, checks for media change on EIO */
 586         NULL,                   /* slot left unset; NOTE(review): presumably the state-change op, confirm against vdev_ops_t */
 587         vdev_disk_hold,         /* hold hook, currently a placeholder */
 588         vdev_disk_rele,         /* release hook, currently a placeholder */
 589         VDEV_TYPE_DISK,         /* name of this vdev type */
 590         B_TRUE                  /* leaf vdev */
 591 };
 592
 593 /*
 594  * Given the root disk device devid or pathname, read the label from
 595  * the device, and construct a configuration nvlist.
 596  */
 597 int
 598 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 599 {
 600         struct block_device *bdev;
 601         vdev_label_t *label;
 602         uint64_t s, size;
 603         int i;
 604
 605         bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL);
 606         if (IS_ERR(bdev))
 607                 return -PTR_ERR(bdev);
 608
 609         s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
 610         if (s == 0) {
 611                 vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 612                 return EIO;
 613         }
 614
 615         size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
 616         label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
 617
 618         for (i = 0; i < VDEV_LABELS; i++) {
 619                 uint64_t offset, state, txg = 0;
 620
 621                 /* read vdev label */
 622                 offset = vdev_label_offset(size, i, 0);
 623                 if (vdev_disk_physio(bdev, (caddr_t)label,
 624                     VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
 625                         continue;
 626
 627                 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
 628                     sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
 629                         *config = NULL;
 630                         continue;
 631                 }
 632
 633                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
 634                     &state) != 0 || state >= POOL_STATE_DESTROYED) {
 635                         nvlist_free(*config);
 636                         *config = NULL;
 637                         continue;
 638                 }
 639
 640                 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
 641                     &txg) != 0 || txg == 0) {
 642                         nvlist_free(*config);
 643                         *config = NULL;
 644                         continue;
 645                 }
 646
 647                 break;
 648         }
 649
 650         vmem_free(label, sizeof(vdev_label_t));
 651         vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 652
 653         return 0;
 654 }