fxr.watson.org: linux-2.6 sys/block/ll_rw

FreeBSD/Linux Kernel Cross Reference
sys/block/ll_rw_blk.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10

1 /* 2 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 8 */ 9 10 /* 11 * This handles all read/write requests to block devices 12 */ 13 #include <linux/kernel.h> 14 #include <linux/module.h> 15 #include <linux/backing-dev.h> 16 #include <linux/bio.h> 17 #include <linux/blkdev.h> 18 #include <linux/highmem.h> 19 #include <linux/mm.h> 20 #include <linux/kernel_stat.h> 21 #include <linux/string.h> 22 #include <linux/init.h> 23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 24 #include <linux/completion.h> 25 #include <linux/slab.h> 26 #include <linux/swap.h> 27 #include <linux/writeback.h> 28 #include <linux/task_io_accounting_ops.h> 29 #include <linux/interrupt.h> 30 #include <linux/cpu.h> 31 #include <linux/blktrace_api.h> 32 #include <linux/fault-inject.h> 33 #include <linux/scatterlist.h> 34 35 /* 36 * for max sense size 37 */ 38 #include <scsi/scsi_cmnd.h> 39 40 static void blk_unplug_work(struct work_struct *work); 41 static void blk_unplug_timeout(unsigned long data); 42 static void drive_stat_acct(struct request *rq, int new_io); 43 static void init_request_from_bio(struct request *req, struct bio *bio); 44 static int __make_request(struct request_queue *q, struct bio *bio); 45 static struct io_context *current_io_context(gfp_t gfp_flags, int node); 46 static void blk_recalc_rq_segments(struct request *rq); 47 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 48 struct bio *bio); 49 50 /* 51 * For the allocated request tables 52 */ 53 static struct kmem_cache *request_cachep; 54 55 /* 56 * For queue allocation 57 */ 58 static struct kmem_cache *requestq_cachep; 
59 60 /* 61 * For io context allocations 62 */ 63 static struct kmem_cache *iocontext_cachep; 64 65 /* 66 * Controlling structure to kblockd 67 */ 68 static struct workqueue_struct *kblockd_workqueue; 69 70 unsigned long blk_max_low_pfn, blk_max_pfn; 71 72 EXPORT_SYMBOL(blk_max_low_pfn); 73 EXPORT_SYMBOL(blk_max_pfn); 74 75 static DEFINE_PER_CPU(struct list_head, blk_cpu_done); 76 77 /* Amount of time in which a process may batch requests */ 78 #define BLK_BATCH_TIME (HZ/50UL) 79 80 /* Number of requests a "batching" process may submit */ 81 #define BLK_BATCH_REQ 32 82 83 /* 84 * Return the threshold (number of used requests) at which the queue is 85 * considered to be congested. It include a little hysteresis to keep the 86 * context switch rate down. 87 */ 88 static inline int queue_congestion_on_threshold(struct request_queue *q) 89 { 90 return q->nr_congestion_on; 91 } 92 93 /* 94 * The threshold at which a queue is considered to be uncongested 95 */ 96 static inline int queue_congestion_off_threshold(struct request_queue *q) 97 { 98 return q->nr_congestion_off; 99 } 100 101 static void blk_queue_congestion_threshold(struct request_queue *q) 102 { 103 int nr; 104 105 nr = q->nr_requests - (q->nr_requests / 8) + 1; 106 if (nr > q->nr_requests) 107 nr = q->nr_requests; 108 q->nr_congestion_on = nr; 109 110 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 111 if (nr < 1) 112 nr = 1; 113 q->nr_congestion_off = nr; 114 } 115 116 /** 117 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 118 * @bdev: device 119 * 120 * Locates the passed device's request queue and returns the address of its 121 * backing_dev_info 122 * 123 * Will return NULL if the request queue cannot be located. 
124 */ 125 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 126 { 127 struct backing_dev_info *ret = NULL; 128 struct request_queue *q = bdev_get_queue(bdev); 129 130 if (q) 131 ret = &q->backing_dev_info; 132 return ret; 133 } 134 EXPORT_SYMBOL(blk_get_backing_dev_info); 135 136 /** 137 * blk_queue_prep_rq - set a prepare_request function for queue 138 * @q: queue 139 * @pfn: prepare_request function 140 * 141 * It's possible for a queue to register a prepare_request callback which 142 * is invoked before the request is handed to the request_fn. The goal of 143 * the function is to prepare a request for I/O, it can be used to build a 144 * cdb from the request data for instance. 145 * 146 */ 147 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 148 { 149 q->prep_rq_fn = pfn; 150 } 151 152 EXPORT_SYMBOL(blk_queue_prep_rq); 153 154 /** 155 * blk_queue_merge_bvec - set a merge_bvec function for queue 156 * @q: queue 157 * @mbfn: merge_bvec_fn 158 * 159 * Usually queues have static limitations on the max sectors or segments that 160 * we can put in a request. Stacking drivers may have some settings that 161 * are dynamic, and thus we have to query the queue whether it is ok to 162 * add a new bio_vec to a bio at a given offset or not. If the block device 163 * has such limitations, it needs to register a merge_bvec_fn to control 164 * the size of bio's sent to it. Note that a block device *must* allow a 165 * single page to be added to an empty bio. The block device driver may want 166 * to use the bio_split() function to deal with these bio's. By default 167 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 168 * honored. 
169 */ 170 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 171 { 172 q->merge_bvec_fn = mbfn; 173 } 174 175 EXPORT_SYMBOL(blk_queue_merge_bvec); 176 177 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 178 { 179 q->softirq_done_fn = fn; 180 } 181 182 EXPORT_SYMBOL(blk_queue_softirq_done); 183 184 /** 185 * blk_queue_make_request - define an alternate make_request function for a device 186 * @q: the request queue for the device to be affected 187 * @mfn: the alternate make_request function 188 * 189 * Description: 190 * The normal way for &struct bios to be passed to a device 191 * driver is for them to be collected into requests on a request 192 * queue, and then to allow the device driver to select requests 193 * off that queue when it is ready. This works well for many block 194 * devices. However some block devices (typically virtual devices 195 * such as md or lvm) do not benefit from the processing on the 196 * request queue, and are served best by having the requests passed 197 * directly to them. This can be achieved by providing a function 198 * to blk_queue_make_request(). 199 * 200 * Caveat: 201 * The driver that does this *must* be able to deal appropriately 202 * with buffers in "highmemory". This can be accomplished by either calling 203 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 204 * blk_queue_bounce() to create a buffer in normal memory. 
205 **/ 206 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) 207 { 208 /* 209 * set defaults 210 */ 211 q->nr_requests = BLKDEV_MAX_RQ; 212 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 213 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 214 q->make_request_fn = mfn; 215 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 216 q->backing_dev_info.state = 0; 217 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 218 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 219 blk_queue_hardsect_size(q, 512); 220 blk_queue_dma_alignment(q, 511); 221 blk_queue_congestion_threshold(q); 222 q->nr_batching = BLK_BATCH_REQ; 223 224 q->unplug_thresh = 4; /* hmm */ 225 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 226 if (q->unplug_delay == 0) 227 q->unplug_delay = 1; 228 229 INIT_WORK(&q->unplug_work, blk_unplug_work); 230 231 q->unplug_timer.function = blk_unplug_timeout; 232 q->unplug_timer.data = (unsigned long)q; 233 234 /* 235 * by default assume old behaviour and bounce for any highmem page 236 */ 237 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 238 } 239 240 EXPORT_SYMBOL(blk_queue_make_request); 241 242 static void rq_init(struct request_queue *q, struct request *rq) 243 { 244 INIT_LIST_HEAD(&rq->queuelist); 245 INIT_LIST_HEAD(&rq->donelist); 246 247 rq->errors = 0; 248 rq->bio = rq->biotail = NULL; 249 INIT_HLIST_NODE(&rq->hash); 250 RB_CLEAR_NODE(&rq->rb_node); 251 rq->ioprio = 0; 252 rq->buffer = NULL; 253 rq->ref_count = 1; 254 rq->q = q; 255 rq->special = NULL; 256 rq->data_len = 0; 257 rq->data = NULL; 258 rq->nr_phys_segments = 0; 259 rq->sense = NULL; 260 rq->end_io = NULL; 261 rq->end_io_data = NULL; 262 rq->completion_data = NULL; 263 rq->next_rq = NULL; 264 } 265 266 /** 267 * blk_queue_ordered - does this queue support ordered writes 268 * @q: the request queue 269 * @ordered: one of QUEUE_ORDERED_* 270 * @prepare_flush_fn: rq setup helper for cache flush ordered writes 271 * 272 * Description: 273 * 
For journalled file systems, doing ordered writes on a commit 274 * block instead of explicitly doing wait_on_buffer (which is bad 275 * for performance) can be a big win. Block drivers supporting this 276 * feature should call this function and indicate so. 277 * 278 **/ 279 int blk_queue_ordered(struct request_queue *q, unsigned ordered, 280 prepare_flush_fn *prepare_flush_fn) 281 { 282 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && 283 prepare_flush_fn == NULL) { 284 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); 285 return -EINVAL; 286 } 287 288 if (ordered != QUEUE_ORDERED_NONE && 289 ordered != QUEUE_ORDERED_DRAIN && 290 ordered != QUEUE_ORDERED_DRAIN_FLUSH && 291 ordered != QUEUE_ORDERED_DRAIN_FUA && 292 ordered != QUEUE_ORDERED_TAG && 293 ordered != QUEUE_ORDERED_TAG_FLUSH && 294 ordered != QUEUE_ORDERED_TAG_FUA) { 295 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); 296 return -EINVAL; 297 } 298 299 q->ordered = ordered; 300 q->next_ordered = ordered; 301 q->prepare_flush_fn = prepare_flush_fn; 302 303 return 0; 304 } 305 306 EXPORT_SYMBOL(blk_queue_ordered); 307 308 /* 309 * Cache flushing for ordered writes handling 310 */ 311 inline unsigned blk_ordered_cur_seq(struct request_queue *q) 312 { 313 if (!q->ordseq) 314 return 0; 315 return 1 << ffz(q->ordseq); 316 } 317 318 unsigned blk_ordered_req_seq(struct request *rq) 319 { 320 struct request_queue *q = rq->q; 321 322 BUG_ON(q->ordseq == 0); 323 324 if (rq == &q->pre_flush_rq) 325 return QUEUE_ORDSEQ_PREFLUSH; 326 if (rq == &q->bar_rq) 327 return QUEUE_ORDSEQ_BAR; 328 if (rq == &q->post_flush_rq) 329 return QUEUE_ORDSEQ_POSTFLUSH; 330 331 /* 332 * !fs requests don't need to follow barrier ordering. Always 333 * put them at the front. This fixes the following deadlock. 
334 * 335 * http://thread.gmane.org/gmane.linux.kernel/537473 336 */ 337 if (!blk_fs_request(rq)) 338 return QUEUE_ORDSEQ_DRAIN; 339 340 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 341 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 342 return QUEUE_ORDSEQ_DRAIN; 343 else 344 return QUEUE_ORDSEQ_DONE; 345 } 346 347 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 348 { 349 struct request *rq; 350 int uptodate; 351 352 if (error && !q->orderr) 353 q->orderr = error; 354 355 BUG_ON(q->ordseq & seq); 356 q->ordseq |= seq; 357 358 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 359 return; 360 361 /* 362 * Okay, sequence complete. 363 */ 364 uptodate = 1; 365 if (q->orderr) 366 uptodate = q->orderr; 367 368 q->ordseq = 0; 369 rq = q->orig_bar_rq; 370 371 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 372 end_that_request_last(rq, uptodate); 373 } 374 375 static void pre_flush_end_io(struct request *rq, int error) 376 { 377 elv_completed_request(rq->q, rq); 378 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 379 } 380 381 static void bar_end_io(struct request *rq, int error) 382 { 383 elv_completed_request(rq->q, rq); 384 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 385 } 386 387 static void post_flush_end_io(struct request *rq, int error) 388 { 389 elv_completed_request(rq->q, rq); 390 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 391 } 392 393 static void queue_flush(struct request_queue *q, unsigned which) 394 { 395 struct request *rq; 396 rq_end_io_fn *end_io; 397 398 if (which == QUEUE_ORDERED_PREFLUSH) { 399 rq = &q->pre_flush_rq; 400 end_io = pre_flush_end_io; 401 } else { 402 rq = &q->post_flush_rq; 403 end_io = post_flush_end_io; 404 } 405 406 rq->cmd_flags = REQ_HARDBARRIER; 407 rq_init(q, rq); 408 rq->elevator_private = NULL; 409 rq->elevator_private2 = NULL; 410 rq->rq_disk = q->bar_rq.rq_disk; 411 rq->end_io = end_io; 412 q->prepare_flush_fn(q, rq); 413 414 
elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 415 } 416 417 static inline struct request *start_ordered(struct request_queue *q, 418 struct request *rq) 419 { 420 q->orderr = 0; 421 q->ordered = q->next_ordered; 422 q->ordseq |= QUEUE_ORDSEQ_STARTED; 423 424 /* 425 * Prep proxy barrier request. 426 */ 427 blkdev_dequeue_request(rq); 428 q->orig_bar_rq = rq; 429 rq = &q->bar_rq; 430 rq->cmd_flags = 0; 431 rq_init(q, rq); 432 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 433 rq->cmd_flags |= REQ_RW; 434 if (q->ordered & QUEUE_ORDERED_FUA) 435 rq->cmd_flags |= REQ_FUA; 436 rq->elevator_private = NULL; 437 rq->elevator_private2 = NULL; 438 init_request_from_bio(rq, q->orig_bar_rq->bio); 439 rq->end_io = bar_end_io; 440 441 /* 442 * Queue ordered sequence. As we stack them at the head, we 443 * need to queue in reverse order. Note that we rely on that 444 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 445 * request gets inbetween ordered sequence. If this request is 446 * an empty barrier, we don't need to do a postflush ever since 447 * there will be no data written between the pre and post flush. 448 * Hence a single flush will suffice. 
449 */ 450 if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) 451 queue_flush(q, QUEUE_ORDERED_POSTFLUSH); 452 else 453 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; 454 455 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 456 457 if (q->ordered & QUEUE_ORDERED_PREFLUSH) { 458 queue_flush(q, QUEUE_ORDERED_PREFLUSH); 459 rq = &q->pre_flush_rq; 460 } else 461 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; 462 463 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) 464 q->ordseq |= QUEUE_ORDSEQ_DRAIN; 465 else 466 rq = NULL; 467 468 return rq; 469 } 470 471 int blk_do_ordered(struct request_queue *q, struct request **rqp) 472 { 473 struct request *rq = *rqp; 474 const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 475 476 if (!q->ordseq) { 477 if (!is_barrier) 478 return 1; 479 480 if (q->next_ordered != QUEUE_ORDERED_NONE) { 481 *rqp = start_ordered(q, rq); 482 return 1; 483 } else { 484 /* 485 * This can happen when the queue switches to 486 * ORDERED_NONE while this request is on it. 487 */ 488 blkdev_dequeue_request(rq); 489 end_that_request_first(rq, -EOPNOTSUPP, 490 rq->hard_nr_sectors); 491 end_that_request_last(rq, -EOPNOTSUPP); 492 *rqp = NULL; 493 return 0; 494 } 495 } 496 497 /* 498 * Ordered sequence in progress 499 */ 500 501 /* Special requests are not subject to ordering rules. */ 502 if (!blk_fs_request(rq) && 503 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 504 return 1; 505 506 if (q->ordered & QUEUE_ORDERED_TAG) { 507 /* Ordered by tag. Blocking the next barrier is enough. */ 508 if (is_barrier && rq != &q->bar_rq) 509 *rqp = NULL; 510 } else { 511 /* Ordered by draining. Wait for turn. 
*/ 512 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 513 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 514 *rqp = NULL; 515 } 516 517 return 1; 518 } 519 520 static void req_bio_endio(struct request *rq, struct bio *bio, 521 unsigned int nbytes, int error) 522 { 523 struct request_queue *q = rq->q; 524 525 if (&q->bar_rq != rq) { 526 if (error) 527 clear_bit(BIO_UPTODATE, &bio->bi_flags); 528 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 529 error = -EIO; 530 531 if (unlikely(nbytes > bio->bi_size)) { 532 printk("%s: want %u bytes done, only %u left\n", 533 __FUNCTION__, nbytes, bio->bi_size); 534 nbytes = bio->bi_size; 535 } 536 537 bio->bi_size -= nbytes; 538 bio->bi_sector += (nbytes >> 9); 539 if (bio->bi_size == 0) 540 bio_endio(bio, error); 541 } else { 542 543 /* 544 * Okay, this is the barrier request in progress, just 545 * record the error; 546 */ 547 if (error && !q->orderr) 548 q->orderr = error; 549 } 550 } 551 552 /** 553 * blk_queue_bounce_limit - set bounce buffer limit for queue 554 * @q: the request queue for the device 555 * @dma_addr: bus address limit 556 * 557 * Description: 558 * Different hardware can have different requirements as to what pages 559 * it can do I/O directly to. A low level driver can call 560 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 561 * buffers for doing I/O to pages residing above @page. 562 **/ 563 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 564 { 565 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 566 int dma = 0; 567 568 q->bounce_gfp = GFP_NOIO; 569 #if BITS_PER_LONG == 64 570 /* Assume anything <= 4GB can be handled by IOMMU. 571 Actually some IOMMUs can handle everything, but I don't 572 know of a way to test this here. 
*/ 573 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 574 dma = 1; 575 q->bounce_pfn = max_low_pfn; 576 #else 577 if (bounce_pfn < blk_max_low_pfn) 578 dma = 1; 579 q->bounce_pfn = bounce_pfn; 580 #endif 581 if (dma) { 582 init_emergency_isa_pool(); 583 q->bounce_gfp = GFP_NOIO | GFP_DMA; 584 q->bounce_pfn = bounce_pfn; 585 } 586 } 587 588 EXPORT_SYMBOL(blk_queue_bounce_limit); 589 590 /** 591 * blk_queue_max_sectors - set max sectors for a request for this queue 592 * @q: the request queue for the device 593 * @max_sectors: max sectors in the usual 512b unit 594 * 595 * Description: 596 * Enables a low level driver to set an upper limit on the size of 597 * received requests. 598 **/ 599 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 600 { 601 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 602 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 603 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 604 } 605 606 if (BLK_DEF_MAX_SECTORS > max_sectors) 607 q->max_hw_sectors = q->max_sectors = max_sectors; 608 else { 609 q->max_sectors = BLK_DEF_MAX_SECTORS; 610 q->max_hw_sectors = max_sectors; 611 } 612 } 613 614 EXPORT_SYMBOL(blk_queue_max_sectors); 615 616 /** 617 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 618 * @q: the request queue for the device 619 * @max_segments: max number of segments 620 * 621 * Description: 622 * Enables a low level driver to set an upper limit on the number of 623 * physical data segments in a request. This would be the largest sized 624 * scatter list the driver could handle. 
625 **/ 626 void blk_queue_max_phys_segments(struct request_queue *q, 627 unsigned short max_segments) 628 { 629 if (!max_segments) { 630 max_segments = 1; 631 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 632 } 633 634 q->max_phys_segments = max_segments; 635 } 636 637 EXPORT_SYMBOL(blk_queue_max_phys_segments); 638 639 /** 640 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 641 * @q: the request queue for the device 642 * @max_segments: max number of segments 643 * 644 * Description: 645 * Enables a low level driver to set an upper limit on the number of 646 * hw data segments in a request. This would be the largest number of 647 * address/length pairs the host adapter can actually give as once 648 * to the device. 649 **/ 650 void blk_queue_max_hw_segments(struct request_queue *q, 651 unsigned short max_segments) 652 { 653 if (!max_segments) { 654 max_segments = 1; 655 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 656 } 657 658 q->max_hw_segments = max_segments; 659 } 660 661 EXPORT_SYMBOL(blk_queue_max_hw_segments); 662 663 /** 664 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 665 * @q: the request queue for the device 666 * @max_size: max size of segment in bytes 667 * 668 * Description: 669 * Enables a low level driver to set an upper limit on the size of a 670 * coalesced segment 671 **/ 672 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 673 { 674 if (max_size < PAGE_CACHE_SIZE) { 675 max_size = PAGE_CACHE_SIZE; 676 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 677 } 678 679 q->max_segment_size = max_size; 680 } 681 682 EXPORT_SYMBOL(blk_queue_max_segment_size); 683 684 /** 685 * blk_queue_hardsect_size - set hardware sector size for the queue 686 * @q: the request queue for the device 687 * @size: the hardware sector size, in bytes 688 * 689 * Description: 690 * This should typically be set to the lowest possible sector size 
691 * that the hardware can operate on (possible without reverting to 692 * even internal read-modify-write operations). Usually the default 693 * of 512 covers most hardware. 694 **/ 695 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) 696 { 697 q->hardsect_size = size; 698 } 699 700 EXPORT_SYMBOL(blk_queue_hardsect_size); 701 702 /* 703 * Returns the minimum that is _not_ zero, unless both are zero. 704 */ 705 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 706 707 /** 708 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 709 * @t: the stacking driver (top) 710 * @b: the underlying device (bottom) 711 **/ 712 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 713 { 714 /* zero is "infinity" */ 715 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); 716 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); 717 718 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 719 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 720 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 721 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 722 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 723 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); 724 } 725 726 EXPORT_SYMBOL(blk_queue_stack_limits); 727 728 /** 729 * blk_queue_segment_boundary - set boundary rules for segment merging 730 * @q: the request queue for the device 731 * @mask: the memory boundary mask 732 **/ 733 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 734 { 735 if (mask < PAGE_CACHE_SIZE - 1) { 736 mask = PAGE_CACHE_SIZE - 1; 737 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 738 } 739 740 q->seg_boundary_mask = mask; 741 } 742 743 EXPORT_SYMBOL(blk_queue_segment_boundary); 744 745 /** 746 * blk_queue_dma_alignment - set dma length and memory alignment 747 * @q: the request queue for 
the device 748 * @mask: alignment mask 749 * 750 * description: 751 * set required memory and length aligment for direct dma transactions. 752 * this is used when buiding direct io requests for the queue. 753 * 754 **/ 755 void blk_queue_dma_alignment(struct request_queue *q, int mask) 756 { 757 q->dma_alignment = mask; 758 } 759 760 EXPORT_SYMBOL(blk_queue_dma_alignment); 761 762 /** 763 * blk_queue_update_dma_alignment - update dma length and memory alignment 764 * @q: the request queue for the device 765 * @mask: alignment mask 766 * 767 * description: 768 * update required memory and length aligment for direct dma transactions. 769 * If the requested alignment is larger than the current alignment, then 770 * the current queue alignment is updated to the new value, otherwise it 771 * is left alone. The design of this is to allow multiple objects 772 * (driver, device, transport etc) to set their respective 773 * alignments without having them interfere. 774 * 775 **/ 776 void blk_queue_update_dma_alignment(struct request_queue *q, int mask) 777 { 778 BUG_ON(mask > PAGE_SIZE); 779 780 if (mask > q->dma_alignment) 781 q->dma_alignment = mask; 782 } 783 784 EXPORT_SYMBOL(blk_queue_update_dma_alignment); 785 786 /** 787 * blk_queue_find_tag - find a request by its tag and queue 788 * @q: The request queue for the device 789 * @tag: The tag of the request 790 * 791 * Notes: 792 * Should be used when a device returns a tag and you want to match 793 * it with a request. 794 * 795 * no locks need be held. 796 **/ 797 struct request *blk_queue_find_tag(struct request_queue *q, int tag) 798 { 799 return blk_map_queue_find_tag(q->queue_tags, tag); 800 } 801 802 EXPORT_SYMBOL(blk_queue_find_tag); 803 804 /** 805 * __blk_free_tags - release a given set of tag maintenance info 806 * @bqt: the tag map to free 807 * 808 * Tries to free the specified @bqt@. 
Returns true if it was 809 * actually freed and false if there are still references using it 810 */ 811 static int __blk_free_tags(struct blk_queue_tag *bqt) 812 { 813 int retval; 814 815 retval = atomic_dec_and_test(&bqt->refcnt); 816 if (retval) { 817 BUG_ON(bqt->busy); 818 819 kfree(bqt->tag_index); 820 bqt->tag_index = NULL; 821 822 kfree(bqt->tag_map); 823 bqt->tag_map = NULL; 824 825 kfree(bqt); 826 827 } 828 829 return retval; 830 } 831 832 /** 833 * __blk_queue_free_tags - release tag maintenance info 834 * @q: the request queue for the device 835 * 836 * Notes: 837 * blk_cleanup_queue() will take care of calling this function, if tagging 838 * has been used. So there's no need to call this directly. 839 **/ 840 static void __blk_queue_free_tags(struct request_queue *q) 841 { 842 struct blk_queue_tag *bqt = q->queue_tags; 843 844 if (!bqt) 845 return; 846 847 __blk_free_tags(bqt); 848 849 q->queue_tags = NULL; 850 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 851 } 852 853 854 /** 855 * blk_free_tags - release a given set of tag maintenance info 856 * @bqt: the tag map to free 857 * 858 * For externally managed @bqt@ frees the map. Callers of this 859 * function must guarantee to have released all the queues that 860 * might have been using this tag map. 861 */ 862 void blk_free_tags(struct blk_queue_tag *bqt) 863 { 864 if (unlikely(!__blk_free_tags(bqt))) 865 BUG(); 866 } 867 EXPORT_SYMBOL(blk_free_tags); 868 869 /** 870 * blk_queue_free_tags - release tag maintenance info 871 * @q: the request queue for the device 872 * 873 * Notes: 874 * This is used to disabled tagged queuing to a device, yet leave 875 * queue in function. 
876 **/ 877 void blk_queue_free_tags(struct request_queue *q) 878 { 879 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 880 } 881 882 EXPORT_SYMBOL(blk_queue_free_tags); 883 884 static int 885 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) 886 { 887 struct request **tag_index; 888 unsigned long *tag_map; 889 int nr_ulongs; 890 891 if (q && depth > q->nr_requests * 2) { 892 depth = q->nr_requests * 2; 893 printk(KERN_ERR "%s: adjusted depth to %d\n", 894 __FUNCTION__, depth); 895 } 896 897 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); 898 if (!tag_index) 899 goto fail; 900 901 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 902 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 903 if (!tag_map) 904 goto fail; 905 906 tags->real_max_depth = depth; 907 tags->max_depth = depth; 908 tags->tag_index = tag_index; 909 tags->tag_map = tag_map; 910 911 return 0; 912 fail: 913 kfree(tag_index); 914 return -ENOMEM; 915 } 916 917 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 918 int depth) 919 { 920 struct blk_queue_tag *tags; 921 922 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 923 if (!tags) 924 goto fail; 925 926 if (init_tag_map(q, tags, depth)) 927 goto fail; 928 929 tags->busy = 0; 930 atomic_set(&tags->refcnt, 1); 931 return tags; 932 fail: 933 kfree(tags); 934 return NULL; 935 } 936 937 /** 938 * blk_init_tags - initialize the tag info for an external tag map 939 * @depth: the maximum queue depth supported 940 * @tags: the tag to use 941 **/ 942 struct blk_queue_tag *blk_init_tags(int depth) 943 { 944 return __blk_queue_init_tags(NULL, depth); 945 } 946 EXPORT_SYMBOL(blk_init_tags); 947 948 /** 949 * blk_queue_init_tags - initialize the queue tag info 950 * @q: the request queue for the device 951 * @depth: the maximum queue depth supported 952 * @tags: the tag to use 953 **/ 954 int blk_queue_init_tags(struct request_queue *q, int depth, 955 struct 
blk_queue_tag *tags) 956 { 957 int rc; 958 959 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 960 961 if (!tags && !q->queue_tags) { 962 tags = __blk_queue_init_tags(q, depth); 963 964 if (!tags) 965 goto fail; 966 } else if (q->queue_tags) { 967 if ((rc = blk_queue_resize_tags(q, depth))) 968 return rc; 969 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 970 return 0; 971 } else 972 atomic_inc(&tags->refcnt); 973 974 /* 975 * assign it, all done 976 */ 977 q->queue_tags = tags; 978 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 979 INIT_LIST_HEAD(&q->tag_busy_list); 980 return 0; 981 fail: 982 kfree(tags); 983 return -ENOMEM; 984 } 985 986 EXPORT_SYMBOL(blk_queue_init_tags); 987 988 /** 989 * blk_queue_resize_tags - change the queueing depth 990 * @q: the request queue for the device 991 * @new_depth: the new max command queueing depth 992 * 993 * Notes: 994 * Must be called with the queue lock held. 995 **/ 996 int blk_queue_resize_tags(struct request_queue *q, int new_depth) 997 { 998 struct blk_queue_tag *bqt = q->queue_tags; 999 struct request **tag_index; 1000 unsigned long *tag_map; 1001 int max_depth, nr_ulongs; 1002 1003 if (!bqt) 1004 return -ENXIO; 1005 1006 /* 1007 * if we already have large enough real_max_depth. just 1008 * adjust max_depth. *NOTE* as requests with tag value 1009 * between new_depth and real_max_depth can be in-flight, tag 1010 * map can not be shrunk blindly here. 
1011 */ 1012 if (new_depth <= bqt->real_max_depth) { 1013 bqt->max_depth = new_depth; 1014 return 0; 1015 } 1016 1017 /* 1018 * Currently cannot replace a shared tag map with a new 1019 * one, so error out if this is the case 1020 */ 1021 if (atomic_read(&bqt->refcnt) != 1) 1022 return -EBUSY; 1023 1024 /* 1025 * save the old state info, so we can copy it back 1026 */ 1027 tag_index = bqt->tag_index; 1028 tag_map = bqt->tag_map; 1029 max_depth = bqt->real_max_depth; 1030 1031 if (init_tag_map(q, bqt, new_depth)) 1032 return -ENOMEM; 1033 1034 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 1035 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 1036 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 1037 1038 kfree(tag_index); 1039 kfree(tag_map); 1040 return 0; 1041 } 1042 1043 EXPORT_SYMBOL(blk_queue_resize_tags); 1044 1045 /** 1046 * blk_queue_end_tag - end tag operations for a request 1047 * @q: the request queue for the device 1048 * @rq: the request that has completed 1049 * 1050 * Description: 1051 * Typically called when end_that_request_first() returns 0, meaning 1052 * all transfers have been done for a request. It's important to call 1053 * this function before end_that_request_last(), as that will put the 1054 * request back on the free list thus corrupting the internal tag list. 1055 * 1056 * Notes: 1057 * queue lock must be held. 1058 **/ 1059 void blk_queue_end_tag(struct request_queue *q, struct request *rq) 1060 { 1061 struct blk_queue_tag *bqt = q->queue_tags; 1062 int tag = rq->tag; 1063 1064 BUG_ON(tag == -1); 1065 1066 if (unlikely(tag >= bqt->real_max_depth)) 1067 /* 1068 * This can happen after tag depth has been reduced. 1069 * FIXME: how about a warning or info message here? 
1070 */ 1071 return; 1072 1073 list_del_init(&rq->queuelist); 1074 rq->cmd_flags &= ~REQ_QUEUED; 1075 rq->tag = -1; 1076 1077 if (unlikely(bqt->tag_index[tag] == NULL)) 1078 printk(KERN_ERR "%s: tag %d is missing\n", 1079 __FUNCTION__, tag); 1080 1081 bqt->tag_index[tag] = NULL; 1082 1083 if (unlikely(!test_bit(tag, bqt->tag_map))) { 1084 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 1085 __FUNCTION__, tag); 1086 return; 1087 } 1088 /* 1089 * The tag_map bit acts as a lock for tag_index[bit], so we need 1090 * unlock memory barrier semantics. 1091 */ 1092 clear_bit_unlock(tag, bqt->tag_map); 1093 bqt->busy--; 1094 } 1095 1096 EXPORT_SYMBOL(blk_queue_end_tag); 1097 1098 /** 1099 * blk_queue_start_tag - find a free tag and assign it 1100 * @q: the request queue for the device 1101 * @rq: the block request that needs tagging 1102 * 1103 * Description: 1104 * This can either be used as a stand-alone helper, or possibly be 1105 * assigned as the queue &prep_rq_fn (in which case &struct request 1106 * automagically gets a tag assigned). Note that this function 1107 * assumes that any type of request can be queued! if this is not 1108 * true for your device, you must check the request type before 1109 * calling this function. The request will also be removed from 1110 * the request queue, so it's the drivers responsibility to readd 1111 * it if it should need to be restarted for some reason. 1112 * 1113 * Notes: 1114 * queue lock must be held. 1115 **/ 1116 int blk_queue_start_tag(struct request_queue *q, struct request *rq) 1117 { 1118 struct blk_queue_tag *bqt = q->queue_tags; 1119 int tag; 1120 1121 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1122 printk(KERN_ERR 1123 "%s: request %p for device [%s] already tagged %d", 1124 __FUNCTION__, rq, 1125 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1126 BUG(); 1127 } 1128 1129 /* 1130 * Protect against shared tag maps, as we may not have exclusive 1131 * access to the tag map. 
1132 */ 1133 do { 1134 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1135 if (tag >= bqt->max_depth) 1136 return 1; 1137 1138 } while (test_and_set_bit_lock(tag, bqt->tag_map)); 1139 /* 1140 * We need lock ordering semantics given by test_and_set_bit_lock. 1141 * See blk_queue_end_tag for details. 1142 */ 1143 1144 rq->cmd_flags |= REQ_QUEUED; 1145 rq->tag = tag; 1146 bqt->tag_index[tag] = rq; 1147 blkdev_dequeue_request(rq); 1148 list_add(&rq->queuelist, &q->tag_busy_list); 1149 bqt->busy++; 1150 return 0; 1151 } 1152 1153 EXPORT_SYMBOL(blk_queue_start_tag); 1154 1155 /** 1156 * blk_queue_invalidate_tags - invalidate all pending tags 1157 * @q: the request queue for the device 1158 * 1159 * Description: 1160 * Hardware conditions may dictate a need to stop all pending requests. 1161 * In this case, we will safely clear the block side of the tag queue and 1162 * readd all requests to the request queue in the right order. 1163 * 1164 * Notes: 1165 * queue lock must be held. 1166 **/ 1167 void blk_queue_invalidate_tags(struct request_queue *q) 1168 { 1169 struct list_head *tmp, *n; 1170 1171 list_for_each_safe(tmp, n, &q->tag_busy_list) 1172 blk_requeue_request(q, list_entry_rq(tmp)); 1173 } 1174 1175 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1176 1177 void blk_dump_rq_flags(struct request *rq, char *msg) 1178 { 1179 int bit; 1180 1181 printk("%s: dev %s: type=%x, flags=%x\n", msg, 1182 rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->cmd_type, 1183 rq->cmd_flags); 1184 1185 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1186 rq->nr_sectors, 1187 rq->current_nr_sectors); 1188 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1189 1190 if (blk_pc_request(rq)) { 1191 printk("cdb: "); 1192 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1193 printk("%02x ", rq->cmd[bit]); 1194 printk("\n"); 1195 } 1196 } 1197 1198 EXPORT_SYMBOL(blk_dump_rq_flags); 1199 1200 void blk_recount_segments(struct request_queue *q, struct bio *bio) 1201 { 1202 struct request rq; 1203 struct bio *nxt = bio->bi_next; 1204 rq.q = q; 1205 rq.bio = rq.biotail = bio; 1206 bio->bi_next = NULL; 1207 blk_recalc_rq_segments(&rq); 1208 bio->bi_next = nxt; 1209 bio->bi_phys_segments = rq.nr_phys_segments; 1210 bio->bi_hw_segments = rq.nr_hw_segments; 1211 bio->bi_flags |= (1 << BIO_SEG_VALID); 1212 } 1213 EXPORT_SYMBOL(blk_recount_segments); 1214 1215 static void blk_recalc_rq_segments(struct request *rq) 1216 { 1217 int nr_phys_segs; 1218 int nr_hw_segs; 1219 unsigned int phys_size; 1220 unsigned int hw_size; 1221 struct bio_vec *bv, *bvprv = NULL; 1222 int seg_size; 1223 int hw_seg_size; 1224 int cluster; 1225 struct req_iterator iter; 1226 int high, highprv = 1; 1227 struct request_queue *q = rq->q; 1228 1229 if (!rq->bio) 1230 return; 1231 1232 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1233 hw_seg_size = seg_size = 0; 1234 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 1235 rq_for_each_segment(bv, rq, iter) { 1236 /* 1237 * the trick here is making sure that a high page is never 1238 * considered part of another segment, since that might 1239 * change with the bounce page. 
1240 */ 1241 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 1242 if (high || highprv) 1243 goto new_hw_segment; 1244 if (cluster) { 1245 if (seg_size + bv->bv_len > q->max_segment_size) 1246 goto new_segment; 1247 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1248 goto new_segment; 1249 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1250 goto new_segment; 1251 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1252 goto new_hw_segment; 1253 1254 seg_size += bv->bv_len; 1255 hw_seg_size += bv->bv_len; 1256 bvprv = bv; 1257 continue; 1258 } 1259 new_segment: 1260 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1261 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1262 hw_seg_size += bv->bv_len; 1263 else { 1264 new_hw_segment: 1265 if (nr_hw_segs == 1 && 1266 hw_seg_size > rq->bio->bi_hw_front_size) 1267 rq->bio->bi_hw_front_size = hw_seg_size; 1268 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1269 nr_hw_segs++; 1270 } 1271 1272 nr_phys_segs++; 1273 bvprv = bv; 1274 seg_size = bv->bv_len; 1275 highprv = high; 1276 } 1277 1278 if (nr_hw_segs == 1 && 1279 hw_seg_size > rq->bio->bi_hw_front_size) 1280 rq->bio->bi_hw_front_size = hw_seg_size; 1281 if (hw_seg_size > rq->biotail->bi_hw_back_size) 1282 rq->biotail->bi_hw_back_size = hw_seg_size; 1283 rq->nr_phys_segments = nr_phys_segs; 1284 rq->nr_hw_segments = nr_hw_segs; 1285 } 1286 1287 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 1288 struct bio *nxt) 1289 { 1290 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1291 return 0; 1292 1293 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1294 return 0; 1295 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1296 return 0; 1297 1298 /* 1299 * bio and nxt are contigous in memory, check if the queue allows 1300 * these two to be merged into one 1301 */ 1302 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1303 return 1; 1304 1305 return 0; 1306 } 1307 1308 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, 1309 struct bio 
*nxt) 1310 { 1311 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1312 blk_recount_segments(q, bio); 1313 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1314 blk_recount_segments(q, nxt); 1315 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1316 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) 1317 return 0; 1318 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) 1319 return 0; 1320 1321 return 1; 1322 } 1323 1324 /* 1325 * map a request to scatterlist, return number of sg entries setup. Caller 1326 * must make sure sg can hold rq->nr_phys_segments entries 1327 */ 1328 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 1329 struct scatterlist *sglist) 1330 { 1331 struct bio_vec *bvec, *bvprv; 1332 struct req_iterator iter; 1333 struct scatterlist *sg; 1334 int nsegs, cluster; 1335 1336 nsegs = 0; 1337 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1338 1339 /* 1340 * for each bio in rq 1341 */ 1342 bvprv = NULL; 1343 sg = NULL; 1344 rq_for_each_segment(bvec, rq, iter) { 1345 int nbytes = bvec->bv_len; 1346 1347 if (bvprv && cluster) { 1348 if (sg->length + nbytes > q->max_segment_size) 1349 goto new_segment; 1350 1351 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1352 goto new_segment; 1353 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1354 goto new_segment; 1355 1356 sg->length += nbytes; 1357 } else { 1358 new_segment: 1359 if (!sg) 1360 sg = sglist; 1361 else { 1362 /* 1363 * If the driver previously mapped a shorter 1364 * list, we could see a termination bit 1365 * prematurely unless it fully inits the sg 1366 * table on each mapping. We KNOW that there 1367 * must be more entries here or the driver 1368 * would be buggy, so force clear the 1369 * termination bit to avoid doing a full 1370 * sg_init_table() in drivers for each command. 
1371 */ 1372 sg->page_link &= ~0x02; 1373 sg = sg_next(sg); 1374 } 1375 1376 sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); 1377 nsegs++; 1378 } 1379 bvprv = bvec; 1380 } /* segments in rq */ 1381 1382 if (sg) 1383 sg_mark_end(sg); 1384 1385 return nsegs; 1386 } 1387 1388 EXPORT_SYMBOL(blk_rq_map_sg); 1389 1390 /* 1391 * the standard queue merge functions, can be overridden with device 1392 * specific ones if so desired 1393 */ 1394 1395 static inline int ll_new_mergeable(struct request_queue *q, 1396 struct request *req, 1397 struct bio *bio) 1398 { 1399 int nr_phys_segs = bio_phys_segments(q, bio); 1400 1401 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1402 req->cmd_flags |= REQ_NOMERGE; 1403 if (req == q->last_merge) 1404 q->last_merge = NULL; 1405 return 0; 1406 } 1407 1408 /* 1409 * A hw segment is just getting larger, bump just the phys 1410 * counter. 1411 */ 1412 req->nr_phys_segments += nr_phys_segs; 1413 return 1; 1414 } 1415 1416 static inline int ll_new_hw_segment(struct request_queue *q, 1417 struct request *req, 1418 struct bio *bio) 1419 { 1420 int nr_hw_segs = bio_hw_segments(q, bio); 1421 int nr_phys_segs = bio_phys_segments(q, bio); 1422 1423 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1424 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1425 req->cmd_flags |= REQ_NOMERGE; 1426 if (req == q->last_merge) 1427 q->last_merge = NULL; 1428 return 0; 1429 } 1430 1431 /* 1432 * This will form the start of a new hw segment. Bump both 1433 * counters. 
1434 */ 1435 req->nr_hw_segments += nr_hw_segs; 1436 req->nr_phys_segments += nr_phys_segs; 1437 return 1; 1438 } 1439 1440 static int ll_back_merge_fn(struct request_queue *q, struct request *req, 1441 struct bio *bio) 1442 { 1443 unsigned short max_sectors; 1444 int len; 1445 1446 if (unlikely(blk_pc_request(req))) 1447 max_sectors = q->max_hw_sectors; 1448 else 1449 max_sectors = q->max_sectors; 1450 1451 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1452 req->cmd_flags |= REQ_NOMERGE; 1453 if (req == q->last_merge) 1454 q->last_merge = NULL; 1455 return 0; 1456 } 1457 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1458 blk_recount_segments(q, req->biotail); 1459 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1460 blk_recount_segments(q, bio); 1461 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1462 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1463 !BIOVEC_VIRT_OVERSIZE(len)) { 1464 int mergeable = ll_new_mergeable(q, req, bio); 1465 1466 if (mergeable) { 1467 if (req->nr_hw_segments == 1) 1468 req->bio->bi_hw_front_size = len; 1469 if (bio->bi_hw_segments == 1) 1470 bio->bi_hw_back_size = len; 1471 } 1472 return mergeable; 1473 } 1474 1475 return ll_new_hw_segment(q, req, bio); 1476 } 1477 1478 static int ll_front_merge_fn(struct request_queue *q, struct request *req, 1479 struct bio *bio) 1480 { 1481 unsigned short max_sectors; 1482 int len; 1483 1484 if (unlikely(blk_pc_request(req))) 1485 max_sectors = q->max_hw_sectors; 1486 else 1487 max_sectors = q->max_sectors; 1488 1489 1490 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1491 req->cmd_flags |= REQ_NOMERGE; 1492 if (req == q->last_merge) 1493 q->last_merge = NULL; 1494 return 0; 1495 } 1496 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1497 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1498 blk_recount_segments(q, bio); 1499 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1500 blk_recount_segments(q, req->bio); 
1501 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1502 !BIOVEC_VIRT_OVERSIZE(len)) { 1503 int mergeable = ll_new_mergeable(q, req, bio); 1504 1505 if (mergeable) { 1506 if (bio->bi_hw_segments == 1) 1507 bio->bi_hw_front_size = len; 1508 if (req->nr_hw_segments == 1) 1509 req->biotail->bi_hw_back_size = len; 1510 } 1511 return mergeable; 1512 } 1513 1514 return ll_new_hw_segment(q, req, bio); 1515 } 1516 1517 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 1518 struct request *next) 1519 { 1520 int total_phys_segments; 1521 int total_hw_segments; 1522 1523 /* 1524 * First check if the either of the requests are re-queued 1525 * requests. Can't merge them if they are. 1526 */ 1527 if (req->special || next->special) 1528 return 0; 1529 1530 /* 1531 * Will it become too large? 1532 */ 1533 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1534 return 0; 1535 1536 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1537 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1538 total_phys_segments--; 1539 1540 if (total_phys_segments > q->max_phys_segments) 1541 return 0; 1542 1543 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1544 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1545 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1546 /* 1547 * propagate the combined length to the end of the requests 1548 */ 1549 if (req->nr_hw_segments == 1) 1550 req->bio->bi_hw_front_size = len; 1551 if (next->nr_hw_segments == 1) 1552 next->biotail->bi_hw_back_size = len; 1553 total_hw_segments--; 1554 } 1555 1556 if (total_hw_segments > q->max_hw_segments) 1557 return 0; 1558 1559 /* Merge is OK... 
*/ 1560 req->nr_phys_segments = total_phys_segments; 1561 req->nr_hw_segments = total_hw_segments; 1562 return 1; 1563 } 1564 1565 /* 1566 * "plug" the device if there are no outstanding requests: this will 1567 * force the transfer to start only after we have put all the requests 1568 * on the list. 1569 * 1570 * This is called with interrupts off and no requests on the queue and 1571 * with the queue lock held. 1572 */ 1573 void blk_plug_device(struct request_queue *q) 1574 { 1575 WARN_ON(!irqs_disabled()); 1576 1577 /* 1578 * don't plug a stopped queue, it must be paired with blk_start_queue() 1579 * which will restart the queueing 1580 */ 1581 if (blk_queue_stopped(q)) 1582 return; 1583 1584 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { 1585 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1586 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 1587 } 1588 } 1589 1590 EXPORT_SYMBOL(blk_plug_device); 1591 1592 /* 1593 * remove the queue from the plugged list, if present. called with 1594 * queue lock held and interrupts disabled. 1595 */ 1596 int blk_remove_plug(struct request_queue *q) 1597 { 1598 WARN_ON(!irqs_disabled()); 1599 1600 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1601 return 0; 1602 1603 del_timer(&q->unplug_timer); 1604 return 1; 1605 } 1606 1607 EXPORT_SYMBOL(blk_remove_plug); 1608 1609 /* 1610 * remove the plug and let it rip.. 1611 */ 1612 void __generic_unplug_device(struct request_queue *q) 1613 { 1614 if (unlikely(blk_queue_stopped(q))) 1615 return; 1616 1617 if (!blk_remove_plug(q)) 1618 return; 1619 1620 q->request_fn(q); 1621 } 1622 EXPORT_SYMBOL(__generic_unplug_device); 1623 1624 /** 1625 * generic_unplug_device - fire a request queue 1626 * @q: The &struct request_queue in question 1627 * 1628 * Description: 1629 * Linux uses plugging to build bigger requests queues before letting 1630 * the device have at them. 
If a queue is plugged, the I/O scheduler 1631 * is still adding and merging requests on the queue. Once the queue 1632 * gets unplugged, the request_fn defined for the queue is invoked and 1633 * transfers started. 1634 **/ 1635 void generic_unplug_device(struct request_queue *q) 1636 { 1637 spin_lock_irq(q->queue_lock); 1638 __generic_unplug_device(q); 1639 spin_unlock_irq(q->queue_lock); 1640 } 1641 EXPORT_SYMBOL(generic_unplug_device); 1642 1643 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1644 struct page *page) 1645 { 1646 struct request_queue *q = bdi->unplug_io_data; 1647 1648 blk_unplug(q); 1649 } 1650 1651 static void blk_unplug_work(struct work_struct *work) 1652 { 1653 struct request_queue *q = 1654 container_of(work, struct request_queue, unplug_work); 1655 1656 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1657 q->rq.count[READ] + q->rq.count[WRITE]); 1658 1659 q->unplug_fn(q); 1660 } 1661 1662 static void blk_unplug_timeout(unsigned long data) 1663 { 1664 struct request_queue *q = (struct request_queue *)data; 1665 1666 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 1667 q->rq.count[READ] + q->rq.count[WRITE]); 1668 1669 kblockd_schedule_work(&q->unplug_work); 1670 } 1671 1672 void blk_unplug(struct request_queue *q) 1673 { 1674 /* 1675 * devices don't necessarily have an ->unplug_fn defined 1676 */ 1677 if (q->unplug_fn) { 1678 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1679 q->rq.count[READ] + q->rq.count[WRITE]); 1680 1681 q->unplug_fn(q); 1682 } 1683 } 1684 EXPORT_SYMBOL(blk_unplug); 1685 1686 /** 1687 * blk_start_queue - restart a previously stopped queue 1688 * @q: The &struct request_queue in question 1689 * 1690 * Description: 1691 * blk_start_queue() will clear the stop flag on the queue, and call 1692 * the request_fn for the queue if it was in a stopped state when 1693 * entered. Also see blk_stop_queue(). Queue lock must be held. 
1694 **/ 1695 void blk_start_queue(struct request_queue *q) 1696 { 1697 WARN_ON(!irqs_disabled()); 1698 1699 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1700 1701 /* 1702 * one level of recursion is ok and is much faster than kicking 1703 * the unplug handling 1704 */ 1705 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1706 q->request_fn(q); 1707 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1708 } else { 1709 blk_plug_device(q); 1710 kblockd_schedule_work(&q->unplug_work); 1711 } 1712 } 1713 1714 EXPORT_SYMBOL(blk_start_queue); 1715 1716 /** 1717 * blk_stop_queue - stop a queue 1718 * @q: The &struct request_queue in question 1719 * 1720 * Description: 1721 * The Linux block layer assumes that a block driver will consume all 1722 * entries on the request queue when the request_fn strategy is called. 1723 * Often this will not happen, because of hardware limitations (queue 1724 * depth settings). If a device driver gets a 'queue full' response, 1725 * or if it simply chooses not to queue more I/O at one point, it can 1726 * call this function to prevent the request_fn from being called until 1727 * the driver has signalled it's ready to go again. This happens by calling 1728 * blk_start_queue() to restart queue operations. Queue lock must be held. 1729 **/ 1730 void blk_stop_queue(struct request_queue *q) 1731 { 1732 blk_remove_plug(q); 1733 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1734 } 1735 EXPORT_SYMBOL(blk_stop_queue); 1736 1737 /** 1738 * blk_sync_queue - cancel any pending callbacks on a queue 1739 * @q: the queue 1740 * 1741 * Description: 1742 * The block layer may perform asynchronous callback activity 1743 * on a queue, such as calling the unplug function after a timeout. 1744 * A block device may call blk_sync_queue to ensure that any 1745 * such activity is cancelled, thus allowing it to release resources 1746 * that the callbacks might use. 
The caller must already have made sure 1747 * that its ->make_request_fn will not re-add plugging prior to calling 1748 * this function. 1749 * 1750 */ 1751 void blk_sync_queue(struct request_queue *q) 1752 { 1753 del_timer_sync(&q->unplug_timer); 1754 kblockd_flush_work(&q->unplug_work); 1755 } 1756 EXPORT_SYMBOL(blk_sync_queue); 1757 1758 /** 1759 * blk_run_queue - run a single device queue 1760 * @q: The queue to run 1761 */ 1762 void blk_run_queue(struct request_queue *q) 1763 { 1764 unsigned long flags; 1765 1766 spin_lock_irqsave(q->queue_lock, flags); 1767 blk_remove_plug(q); 1768 1769 /* 1770 * Only recurse once to avoid overrunning the stack, let the unplug 1771 * handling reinvoke the handler shortly if we already got there. 1772 */ 1773 if (!elv_queue_empty(q)) { 1774 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1775 q->request_fn(q); 1776 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1777 } else { 1778 blk_plug_device(q); 1779 kblockd_schedule_work(&q->unplug_work); 1780 } 1781 } 1782 1783 spin_unlock_irqrestore(q->queue_lock, flags); 1784 } 1785 EXPORT_SYMBOL(blk_run_queue); 1786 1787 /** 1788 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed 1789 * @kobj: the kobj belonging of the request queue to be released 1790 * 1791 * Description: 1792 * blk_cleanup_queue is the pair to blk_init_queue() or 1793 * blk_queue_make_request(). It should be called when a request queue is 1794 * being released; typically when a block device is being de-registered. 1795 * Currently, its primary task it to free all the &struct request 1796 * structures that were allocated to the queue and the queue itself. 1797 * 1798 * Caveat: 1799 * Hopefully the low level driver will have finished any 1800 * outstanding requests first... 
1801 **/ 1802 static void blk_release_queue(struct kobject *kobj) 1803 { 1804 struct request_queue *q = 1805 container_of(kobj, struct request_queue, kobj); 1806 struct request_list *rl = &q->rq; 1807 1808 blk_sync_queue(q); 1809 1810 if (rl->rq_pool) 1811 mempool_destroy(rl->rq_pool); 1812 1813 if (q->queue_tags) 1814 __blk_queue_free_tags(q); 1815 1816 blk_trace_shutdown(q); 1817 1818 bdi_destroy(&q->backing_dev_info); 1819 kmem_cache_free(requestq_cachep, q); 1820 } 1821 1822 void blk_put_queue(struct request_queue *q) 1823 { 1824 kobject_put(&q->kobj); 1825 } 1826 EXPORT_SYMBOL(blk_put_queue); 1827 1828 void blk_cleanup_queue(struct request_queue * q) 1829 { 1830 mutex_lock(&q->sysfs_lock); 1831 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); 1832 mutex_unlock(&q->sysfs_lock); 1833 1834 if (q->elevator) 1835 elevator_exit(q->elevator); 1836 1837 blk_put_queue(q); 1838 } 1839 1840 EXPORT_SYMBOL(blk_cleanup_queue); 1841 1842 static int blk_init_free_list(struct request_queue *q) 1843 { 1844 struct request_list *rl = &q->rq; 1845 1846 rl->count[READ] = rl->count[WRITE] = 0; 1847 rl->starved[READ] = rl->starved[WRITE] = 0; 1848 rl->elvpriv = 0; 1849 init_waitqueue_head(&rl->wait[READ]); 1850 init_waitqueue_head(&rl->wait[WRITE]); 1851 1852 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1853 mempool_free_slab, request_cachep, q->node); 1854 1855 if (!rl->rq_pool) 1856 return -ENOMEM; 1857 1858 return 0; 1859 } 1860 1861 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 1862 { 1863 return blk_alloc_queue_node(gfp_mask, -1); 1864 } 1865 EXPORT_SYMBOL(blk_alloc_queue); 1866 1867 static struct kobj_type queue_ktype; 1868 1869 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1870 { 1871 struct request_queue *q; 1872 int err; 1873 1874 q = kmem_cache_alloc_node(requestq_cachep, 1875 gfp_mask | __GFP_ZERO, node_id); 1876 if (!q) 1877 return NULL; 1878 1879 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1880 
q->backing_dev_info.unplug_io_data = q; 1881 err = bdi_init(&q->backing_dev_info); 1882 if (err) { 1883 kmem_cache_free(requestq_cachep, q); 1884 return NULL; 1885 } 1886 1887 init_timer(&q->unplug_timer); 1888 1889 kobject_init(&q->kobj, &queue_ktype); 1890 1891 mutex_init(&q->sysfs_lock); 1892 1893 return q; 1894 } 1895 EXPORT_SYMBOL(blk_alloc_queue_node); 1896 1897 /** 1898 * blk_init_queue - prepare a request queue for use with a block device 1899 * @rfn: The function to be called to process requests that have been 1900 * placed on the queue. 1901 * @lock: Request queue spin lock 1902 * 1903 * Description: 1904 * If a block device wishes to use the standard request handling procedures, 1905 * which sorts requests and coalesces adjacent requests, then it must 1906 * call blk_init_queue(). The function @rfn will be called when there 1907 * are requests on the queue that need to be processed. If the device 1908 * supports plugging, then @rfn may not be called immediately when requests 1909 * are available on the queue, but may be called at some time later instead. 1910 * Plugged queues are generally unplugged when a buffer belonging to one 1911 * of the requests on the queue is needed, or due to memory pressure. 1912 * 1913 * @rfn is not required, or even expected, to remove all requests off the 1914 * queue, but only as many as it can handle at a time. If it does leave 1915 * requests on the queue, it is responsible for arranging that the requests 1916 * get dealt with eventually. 1917 * 1918 * The queue spin lock must be held while manipulating the requests on the 1919 * request queue; this lock will be taken also from interrupt context, so irq 1920 * disabling is needed for it. 1921 * 1922 * Function returns a pointer to the initialized request queue, or NULL if 1923 * it didn't succeed. 1924 * 1925 * Note: 1926 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1927 * when the block device is deactivated (such as at module unload). 
1928 **/ 1929 1930 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1931 { 1932 return blk_init_queue_node(rfn, lock, -1); 1933 } 1934 EXPORT_SYMBOL(blk_init_queue); 1935 1936 struct request_queue * 1937 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1938 { 1939 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1940 1941 if (!q) 1942 return NULL; 1943 1944 q->node = node_id; 1945 if (blk_init_free_list(q)) { 1946 kmem_cache_free(requestq_cachep, q); 1947 return NULL; 1948 } 1949 1950 /* 1951 * if caller didn't supply a lock, they get per-queue locking with 1952 * our embedded lock 1953 */ 1954 if (!lock) { 1955 spin_lock_init(&q->__queue_lock); 1956 lock = &q->__queue_lock; 1957 } 1958 1959 q->request_fn = rfn; 1960 q->prep_rq_fn = NULL; 1961 q->unplug_fn = generic_unplug_device; 1962 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1963 q->queue_lock = lock; 1964 1965 blk_queue_segment_boundary(q, 0xffffffff); 1966 1967 blk_queue_make_request(q, __make_request); 1968 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1969 1970 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1971 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1972 1973 q->sg_reserved_size = INT_MAX; 1974 1975 /* 1976 * all done 1977 */ 1978 if (!elevator_init(q, NULL)) { 1979 blk_queue_congestion_threshold(q); 1980 return q; 1981 } 1982 1983 blk_put_queue(q); 1984 return NULL; 1985 } 1986 EXPORT_SYMBOL(blk_init_queue_node); 1987 1988 int blk_get_queue(struct request_queue *q) 1989 { 1990 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1991 kobject_get(&q->kobj); 1992 return 0; 1993 } 1994 1995 return 1; 1996 } 1997 1998 EXPORT_SYMBOL(blk_get_queue); 1999 2000 static inline void blk_free_request(struct request_queue *q, struct request *rq) 2001 { 2002 if (rq->cmd_flags & REQ_ELVPRIV) 2003 elv_put_request(q, rq); 2004 mempool_free(rq, q->rq.rq_pool); 2005 } 2006 2007 static struct request * 2008 blk_alloc_request(struct 
request_queue *q, int rw, int priv, gfp_t gfp_mask) 2009 { 2010 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 2011 2012 if (!rq) 2013 return NULL; 2014 2015 /* 2016 * first three bits are identical in rq->cmd_flags and bio->bi_rw, 2017 * see bio.h and blkdev.h 2018 */ 2019 rq->cmd_flags = rw | REQ_ALLOCED; 2020 2021 if (priv) { 2022 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 2023 mempool_free(rq, q->rq.rq_pool); 2024 return NULL; 2025 } 2026 rq->cmd_flags |= REQ_ELVPRIV; 2027 } 2028 2029 return rq; 2030 } 2031 2032 /* 2033 * ioc_batching returns true if the ioc is a valid batching request and 2034 * should be given priority access to a request. 2035 */ 2036 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 2037 { 2038 if (!ioc) 2039 return 0; 2040 2041 /* 2042 * Make sure the process is able to allocate at least 1 request 2043 * even if the batch times out, otherwise we could theoretically 2044 * lose wakeups. 2045 */ 2046 return ioc->nr_batch_requests == q->nr_batching || 2047 (ioc->nr_batch_requests > 0 2048 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 2049 } 2050 2051 /* 2052 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 2053 * will cause the process to be a "batcher" on all queues in the system. This 2054 * is the behaviour we want though - once it gets a wakeup it should be given 2055 * a nice run. 
2056 */ 2057 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 2058 { 2059 if (!ioc || ioc_batching(q, ioc)) 2060 return; 2061 2062 ioc->nr_batch_requests = q->nr_batching; 2063 ioc->last_waited = jiffies; 2064 } 2065 2066 static void __freed_request(struct request_queue *q, int rw) 2067 { 2068 struct request_list *rl = &q->rq; 2069 2070 if (rl->count[rw] < queue_congestion_off_threshold(q)) 2071 blk_clear_queue_congested(q, rw); 2072 2073 if (rl->count[rw] + 1 <= q->nr_requests) { 2074 if (waitqueue_active(&rl->wait[rw])) 2075 wake_up(&rl->wait[rw]); 2076 2077 blk_clear_queue_full(q, rw); 2078 } 2079 } 2080 2081 /* 2082 * A request has just been released. Account for it, update the full and 2083 * congestion status, wake up any waiters. Called under q->queue_lock. 2084 */ 2085 static void freed_request(struct request_queue *q, int rw, int priv) 2086 { 2087 struct request_list *rl = &q->rq; 2088 2089 rl->count[rw]--; 2090 if (priv) 2091 rl->elvpriv--; 2092 2093 __freed_request(q, rw); 2094 2095 if (unlikely(rl->starved[rw ^ 1])) 2096 __freed_request(q, rw ^ 1); 2097 } 2098 2099 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 2100 /* 2101 * Get a free request, queue_lock must be held. 2102 * Returns NULL on failure, with queue_lock held. 2103 * Returns !NULL on success, with queue_lock *not held*. 
 */
static struct request *get_request(struct request_queue *q, int rw_flags,
				   struct bio *bio, gfp_t gfp_mask)
{
	struct request *rq = NULL;
	struct request_list *rl = &q->rq;
	struct io_context *ioc = NULL;
	/* bit 0 of rw_flags selects the per-direction slots of rl */
	const int rw = rw_flags & 0x01;
	int may_queue, priv;

	/* ask the elevator first; a "no" is treated like a starved allocation */
	may_queue = elv_may_queue(q, rw_flags);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[rw]+1 >= q->nr_requests) {
			ioc = current_io_context(GFP_ATOMIC, q->node);
			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_queue_full(q, rw)) {
				ioc_set_batching(q, ioc);
				blk_set_queue_full(q, rw);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */
					goto out;
				}
			}
		}
		blk_set_queue_congested(q, rw);
	}

	/*
	 * Only allow batching queuers to allocate up to 50% over the defined
	 * limit of requests, otherwise we could have thousands of requests
	 * allocated with any setting of ->nr_requests
	 */
	if (rl->count[rw] >= (3 * q->nr_requests / 2))
		goto out;

	/* account for the request before dropping the lock to allocate it */
	rl->count[rw]++;
	rl->starved[rw] = 0;

	/* no elevator-private data while QUEUE_FLAG_ELVSWITCH is set */
	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
	if (priv)
		rl->elvpriv++;

	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
	if (unlikely(!rq)) {
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
		spin_lock_irq(q->queue_lock);
		freed_request(q, rw, priv);

		/*
		 * in the very unlikely event that allocation failed and no
		 * requests for this direction were pending, mark us starved
		 * so that freeing of a request in the other direction will
		 * notice us. another possible fix would be to split the
		 * rq mempool into READ and WRITE
		 */
rq_starved:
		if (unlikely(rl->count[rw] == 0))
			rl->starved[rw] = 1;

		/* rq is still NULL here; queue_lock is held on this path */
		goto out;
	}

	/*
	 * ioc may be NULL here, and ioc_batching will be false. That's
	 * OK, if the queue is under the request limit then requests need
	 * not count toward the nr_batch_requests limit. There will always
	 * be some limit enforced by BLK_BATCH_TIME.
	 */
	if (ioc_batching(q, ioc))
		ioc->nr_batch_requests--;

	rq_init(q, rq);

	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
out:
	return rq;
}

/*
 * No available requests for this queue, unplug the device and wait for some
 * requests to become available.
 *
 * Called with q->queue_lock held, and returns with it unlocked.
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
					struct bio *bio)
{
	/* bit 0 of rw_flags selects the per-direction wait queue */
	const int rw = rw_flags & 0x01;
	struct request *rq;

	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct request_list *rl = &q->rq;

		/*
		 * Get on the wait queue before retrying, so a request freed
		 * between the retry and io_schedule() still wakes us.
		 */
		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
				TASK_UNINTERRUPTIBLE);

		rq = get_request(q, rw_flags, bio, GFP_NOIO);

		if (!rq) {
			struct io_context *ioc;

			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

			/* get the queued I/O moving, then sleep unlocked */
			__generic_unplug_device(q);
			spin_unlock_irq(q->queue_lock);
			io_schedule();

			/*
			 * After sleeping, we become a "batching" process and
			 * will be able to allocate at least one request, and
			 * up to a big batch of them for a small period of time.
			 * See ioc_batching, ioc_set_batching
			 */
			ioc = current_io_context(GFP_NOIO, q->node);
			ioc_set_batching(q, ioc);

			/* get_request() must be called with the lock held */
			spin_lock_irq(q->queue_lock);
		}
		finish_wait(&rl->wait[rw], &wait);
	}

	return rq;
}

/*
 * blk_get_request - allocate a request for direction @rw (READ or WRITE)
 *
 * Takes q->queue_lock itself.  With __GFP_WAIT set in @gfp_mask the
 * allocation goes through get_request_wait(); otherwise a single
 * get_request() attempt is made and NULL is returned if it fails.
 * q->queue_lock is not held on return in either case.
 */
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	struct request *rq;

	BUG_ON(rw != READ && rw != WRITE);

	spin_lock_irq(q->queue_lock);
	if (gfp_mask & __GFP_WAIT) {
		rq = get_request_wait(q, rw, NULL);
	} else {
		rq = get_request(q, rw, NULL, gfp_mask);
		/* a failed get_request() returns with the lock still held */
		if (!rq)
			spin_unlock_irq(q->queue_lock);
	}
	/* q->queue_lock is unlocked at this point */

	return rq;
}
EXPORT_SYMBOL(blk_get_request);

/**
 * blk_start_queueing - initiate dispatch of requests to device
 * @q:		request queue to kick into gear
 *
 * This is basically a helper to remove the need to know whether a queue
 * is plugged or not if someone just wants to initiate dispatch of requests
 * for this queue.
2279 * 2280 * The queue lock must be held with interrupts disabled. 2281 */ 2282 void blk_start_queueing(struct request_queue *q) 2283 { 2284 if (!blk_queue_plugged(q)) 2285 q->request_fn(q); 2286 else 2287 __generic_unplug_device(q); 2288 } 2289 EXPORT_SYMBOL(blk_start_queueing); 2290 2291 /** 2292 * blk_requeue_request - put a request back on queue 2293 * @q: request queue where request should be inserted 2294 * @rq: request to be inserted 2295 * 2296 * Description: 2297 * Drivers often keep queueing requests until the hardware cannot accept 2298 * more, when that condition happens we need to put the request back 2299 * on the queue. Must be called with queue lock held. 2300 */ 2301 void blk_requeue_request(struct request_queue *q, struct request *rq) 2302 { 2303 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 2304 2305 if (blk_rq_tagged(rq)) 2306 blk_queue_end_tag(q, rq); 2307 2308 elv_requeue_request(q, rq); 2309 } 2310 2311 EXPORT_SYMBOL(blk_requeue_request); 2312 2313 /** 2314 * blk_insert_request - insert a special request in to a request queue 2315 * @q: request queue where request should be inserted 2316 * @rq: request to be inserted 2317 * @at_head: insert request at head or tail of queue 2318 * @data: private data 2319 * 2320 * Description: 2321 * Many block devices need to execute commands asynchronously, so they don't 2322 * block the whole kernel from preemption during request execution. This is 2323 * accomplished normally by inserting aritficial requests tagged as 2324 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2325 * scheduled for actual execution by the request queue. 2326 * 2327 * We have the option of inserting the head or the tail of the queue. 2328 * Typically we use the tail for new ioctls and so forth. We use the head 2329 * of the queue for things like a QUEUE_FULL message from a device, or a 2330 * host that is unable to accept a particular command. 
2331 */ 2332 void blk_insert_request(struct request_queue *q, struct request *rq, 2333 int at_head, void *data) 2334 { 2335 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2336 unsigned long flags; 2337 2338 /* 2339 * tell I/O scheduler that this isn't a regular read/write (ie it 2340 * must not attempt merges on this) and that it acts as a soft 2341 * barrier 2342 */ 2343 rq->cmd_type = REQ_TYPE_SPECIAL; 2344 rq->cmd_flags |= REQ_SOFTBARRIER; 2345 2346 rq->special = data; 2347 2348 spin_lock_irqsave(q->queue_lock, flags); 2349 2350 /* 2351 * If command is tagged, release the tag 2352 */ 2353 if (blk_rq_tagged(rq)) 2354 blk_queue_end_tag(q, rq); 2355 2356 drive_stat_acct(rq, 1); 2357 __elv_add_request(q, rq, where, 0); 2358 blk_start_queueing(q); 2359 spin_unlock_irqrestore(q->queue_lock, flags); 2360 } 2361 2362 EXPORT_SYMBOL(blk_insert_request); 2363 2364 static int __blk_rq_unmap_user(struct bio *bio) 2365 { 2366 int ret = 0; 2367 2368 if (bio) { 2369 if (bio_flagged(bio, BIO_USER_MAPPED)) 2370 bio_unmap_user(bio); 2371 else 2372 ret = bio_uncopy_user(bio); 2373 } 2374 2375 return ret; 2376 } 2377 2378 int blk_rq_append_bio(struct request_queue *q, struct request *rq, 2379 struct bio *bio) 2380 { 2381 if (!rq->bio) 2382 blk_rq_bio_prep(q, rq, bio); 2383 else if (!ll_back_merge_fn(q, rq, bio)) 2384 return -EINVAL; 2385 else { 2386 rq->biotail->bi_next = bio; 2387 rq->biotail = bio; 2388 2389 rq->data_len += bio->bi_size; 2390 } 2391 return 0; 2392 } 2393 EXPORT_SYMBOL(blk_rq_append_bio); 2394 2395 static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 2396 void __user *ubuf, unsigned int len) 2397 { 2398 unsigned long uaddr; 2399 struct bio *bio, *orig_bio; 2400 int reading, ret; 2401 2402 reading = rq_data_dir(rq) == READ; 2403 2404 /* 2405 * if alignment requirement is satisfied, map in user pages for 2406 * direct dma. 
else, set up kernel bounce buffers 2407 */ 2408 uaddr = (unsigned long) ubuf; 2409 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2410 bio = bio_map_user(q, NULL, uaddr, len, reading); 2411 else 2412 bio = bio_copy_user(q, uaddr, len, reading); 2413 2414 if (IS_ERR(bio)) 2415 return PTR_ERR(bio); 2416 2417 orig_bio = bio; 2418 blk_queue_bounce(q, &bio); 2419 2420 /* 2421 * We link the bounce buffer in and could have to traverse it 2422 * later so we have to get a ref to prevent it from being freed 2423 */ 2424 bio_get(bio); 2425 2426 ret = blk_rq_append_bio(q, rq, bio); 2427 if (!ret) 2428 return bio->bi_size; 2429 2430 /* if it was boucned we must call the end io function */ 2431 bio_endio(bio, 0); 2432 __blk_rq_unmap_user(orig_bio); 2433 bio_put(bio); 2434 return ret; 2435 } 2436 2437 /** 2438 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2439 * @q: request queue where request should be inserted 2440 * @rq: request structure to fill 2441 * @ubuf: the user buffer 2442 * @len: length of user data 2443 * 2444 * Description: 2445 * Data will be mapped directly for zero copy io, if possible. Otherwise 2446 * a kernel bounce buffer is used. 2447 * 2448 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2449 * still in process context. 2450 * 2451 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2452 * before being submitted to the device, as pages mapped may be out of 2453 * reach. It's the callers responsibility to make sure this happens. The 2454 * original bio must be passed back in to blk_rq_unmap_user() for proper 2455 * unmapping. 
2456 */ 2457 int blk_rq_map_user(struct request_queue *q, struct request *rq, 2458 void __user *ubuf, unsigned long len) 2459 { 2460 unsigned long bytes_read = 0; 2461 struct bio *bio = NULL; 2462 int ret; 2463 2464 if (len > (q->max_hw_sectors << 9)) 2465 return -EINVAL; 2466 if (!len || !ubuf) 2467 return -EINVAL; 2468 2469 while (bytes_read != len) { 2470 unsigned long map_len, end, start; 2471 2472 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 2473 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 2474 >> PAGE_SHIFT; 2475 start = (unsigned long)ubuf >> PAGE_SHIFT; 2476 2477 /* 2478 * A bad offset could cause us to require BIO_MAX_PAGES + 1 2479 * pages. If this happens we just lower the requested 2480 * mapping len by a page so that we can fit 2481 */ 2482 if (end - start > BIO_MAX_PAGES) 2483 map_len -= PAGE_SIZE; 2484 2485 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 2486 if (ret < 0) 2487 goto unmap_rq; 2488 if (!bio) 2489 bio = rq->bio; 2490 bytes_read += ret; 2491 ubuf += ret; 2492 } 2493 2494 rq->buffer = rq->data = NULL; 2495 return 0; 2496 unmap_rq: 2497 blk_rq_unmap_user(bio); 2498 return ret; 2499 } 2500 2501 EXPORT_SYMBOL(blk_rq_map_user); 2502 2503 /** 2504 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2505 * @q: request queue where request should be inserted 2506 * @rq: request to map data to 2507 * @iov: pointer to the iovec 2508 * @iov_count: number of elements in the iovec 2509 * @len: I/O byte count 2510 * 2511 * Description: 2512 * Data will be mapped directly for zero copy io, if possible. Otherwise 2513 * a kernel bounce buffer is used. 2514 * 2515 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2516 * still in process context. 2517 * 2518 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2519 * before being submitted to the device, as pages mapped may be out of 2520 * reach. It's the callers responsibility to make sure this happens. 
The 2521 * original bio must be passed back in to blk_rq_unmap_user() for proper 2522 * unmapping. 2523 */ 2524 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 2525 struct sg_iovec *iov, int iov_count, unsigned int len) 2526 { 2527 struct bio *bio; 2528 2529 if (!iov || iov_count <= 0) 2530 return -EINVAL; 2531 2532 /* we don't allow misaligned data like bio_map_user() does. If the 2533 * user is using sg, they're expected to know the alignment constraints 2534 * and respect them accordingly */ 2535 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2536 if (IS_ERR(bio)) 2537 return PTR_ERR(bio); 2538 2539 if (bio->bi_size != len) { 2540 bio_endio(bio, 0); 2541 bio_unmap_user(bio); 2542 return -EINVAL; 2543 } 2544 2545 bio_get(bio); 2546 blk_rq_bio_prep(q, rq, bio); 2547 rq->buffer = rq->data = NULL; 2548 return 0; 2549 } 2550 2551 EXPORT_SYMBOL(blk_rq_map_user_iov); 2552 2553 /** 2554 * blk_rq_unmap_user - unmap a request with user data 2555 * @bio: start of bio list 2556 * 2557 * Description: 2558 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 2559 * supply the original rq->bio from the blk_rq_map_user() return, since 2560 * the io completion may have changed rq->bio. 
2561 */ 2562 int blk_rq_unmap_user(struct bio *bio) 2563 { 2564 struct bio *mapped_bio; 2565 int ret = 0, ret2; 2566 2567 while (bio) { 2568 mapped_bio = bio; 2569 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 2570 mapped_bio = bio->bi_private; 2571 2572 ret2 = __blk_rq_unmap_user(mapped_bio); 2573 if (ret2 && !ret) 2574 ret = ret2; 2575 2576 mapped_bio = bio; 2577 bio = bio->bi_next; 2578 bio_put(mapped_bio); 2579 } 2580 2581 return ret; 2582 } 2583 2584 EXPORT_SYMBOL(blk_rq_unmap_user); 2585 2586 /** 2587 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2588 * @q: request queue where request should be inserted 2589 * @rq: request to fill 2590 * @kbuf: the kernel buffer 2591 * @len: length of user data 2592 * @gfp_mask: memory allocation flags 2593 */ 2594 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 2595 unsigned int len, gfp_t gfp_mask) 2596 { 2597 struct bio *bio; 2598 2599 if (len > (q->max_hw_sectors << 9)) 2600 return -EINVAL; 2601 if (!len || !kbuf) 2602 return -EINVAL; 2603 2604 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2605 if (IS_ERR(bio)) 2606 return PTR_ERR(bio); 2607 2608 if (rq_data_dir(rq) == WRITE) 2609 bio->bi_rw |= (1 << BIO_RW); 2610 2611 blk_rq_bio_prep(q, rq, bio); 2612 blk_queue_bounce(q, &rq->bio); 2613 rq->buffer = rq->data = NULL; 2614 return 0; 2615 } 2616 2617 EXPORT_SYMBOL(blk_rq_map_kern); 2618 2619 /** 2620 * blk_execute_rq_nowait - insert a request into queue for execution 2621 * @q: queue to insert the request in 2622 * @bd_disk: matching gendisk 2623 * @rq: request to insert 2624 * @at_head: insert request at head or tail of queue 2625 * @done: I/O completion handler 2626 * 2627 * Description: 2628 * Insert a fully prepared request at the back of the io scheduler queue 2629 * for execution. Don't wait for completion. 
2630 */ 2631 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 2632 struct request *rq, int at_head, 2633 rq_end_io_fn *done) 2634 { 2635 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2636 2637 rq->rq_disk = bd_disk; 2638 rq->cmd_flags |= REQ_NOMERGE; 2639 rq->end_io = done; 2640 WARN_ON(irqs_disabled()); 2641 spin_lock_irq(q->queue_lock); 2642 __elv_add_request(q, rq, where, 1); 2643 __generic_unplug_device(q); 2644 spin_unlock_irq(q->queue_lock); 2645 } 2646 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2647 2648 /** 2649 * blk_execute_rq - insert a request into queue for execution 2650 * @q: queue to insert the request in 2651 * @bd_disk: matching gendisk 2652 * @rq: request to insert 2653 * @at_head: insert request at head or tail of queue 2654 * 2655 * Description: 2656 * Insert a fully prepared request at the back of the io scheduler queue 2657 * for execution and wait for completion. 2658 */ 2659 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 2660 struct request *rq, int at_head) 2661 { 2662 DECLARE_COMPLETION_ONSTACK(wait); 2663 char sense[SCSI_SENSE_BUFFERSIZE]; 2664 int err = 0; 2665 2666 /* 2667 * we need an extra reference to the request, so we can look at 2668 * it after io completion 2669 */ 2670 rq->ref_count++; 2671 2672 if (!rq->sense) { 2673 memset(sense, 0, sizeof(sense)); 2674 rq->sense = sense; 2675 rq->sense_len = 0; 2676 } 2677 2678 rq->end_io_data = &wait; 2679 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2680 wait_for_completion(&wait); 2681 2682 if (rq->errors) 2683 err = -EIO; 2684 2685 return err; 2686 } 2687 2688 EXPORT_SYMBOL(blk_execute_rq); 2689 2690 static void bio_end_empty_barrier(struct bio *bio, int err) 2691 { 2692 if (err) 2693 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2694 2695 complete(bio->bi_private); 2696 } 2697 2698 /** 2699 * blkdev_issue_flush - queue a flush 2700 * @bdev: blockdev to issue flush for 2701 * @error_sector: error sector 
2702 * 2703 * Description: 2704 * Issue a flush for the block device in question. Caller can supply 2705 * room for storing the error offset in case of a flush error, if they 2706 * wish to. Caller must run wait_for_completion() on its own. 2707 */ 2708 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2709 { 2710 DECLARE_COMPLETION_ONSTACK(wait); 2711 struct request_queue *q; 2712 struct bio *bio; 2713 int ret; 2714 2715 if (bdev->bd_disk == NULL) 2716 return -ENXIO; 2717 2718 q = bdev_get_queue(bdev); 2719 if (!q) 2720 return -ENXIO; 2721 2722 bio = bio_alloc(GFP_KERNEL, 0); 2723 if (!bio) 2724 return -ENOMEM; 2725 2726 bio->bi_end_io = bio_end_empty_barrier; 2727 bio->bi_private = &wait; 2728 bio->bi_bdev = bdev; 2729 submit_bio(1 << BIO_RW_BARRIER, bio); 2730 2731 wait_for_completion(&wait); 2732 2733 /* 2734 * The driver must store the error location in ->bi_sector, if 2735 * it supports it. For non-stacked drivers, this should be copied 2736 * from rq->sector. 2737 */ 2738 if (error_sector) 2739 *error_sector = bio->bi_sector; 2740 2741 ret = 0; 2742 if (!bio_flagged(bio, BIO_UPTODATE)) 2743 ret = -EIO; 2744 2745 bio_put(bio); 2746 return ret; 2747 } 2748 2749 EXPORT_SYMBOL(blkdev_issue_flush); 2750 2751 static void drive_stat_acct(struct request *rq, int new_io) 2752 { 2753 int rw = rq_data_dir(rq); 2754 2755 if (!blk_fs_request(rq) || !rq->rq_disk) 2756 return; 2757 2758 if (!new_io) { 2759 __disk_stat_inc(rq->rq_disk, merges[rw]); 2760 } else { 2761 disk_round_stats(rq->rq_disk); 2762 rq->rq_disk->in_flight++; 2763 } 2764 } 2765 2766 /* 2767 * add-request adds a request to the linked list. 2768 * queue lock is held and interrupts disabled, as we muck with the 2769 * request queue list. 
2770 */ 2771 static inline void add_request(struct request_queue * q, struct request * req) 2772 { 2773 drive_stat_acct(req, 1); 2774 2775 /* 2776 * elevator indicated where it wants this request to be 2777 * inserted at elevator_merge time 2778 */ 2779 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2780 } 2781 2782 /* 2783 * disk_round_stats() - Round off the performance stats on a struct 2784 * disk_stats. 2785 * 2786 * The average IO queue length and utilisation statistics are maintained 2787 * by observing the current state of the queue length and the amount of 2788 * time it has been in this state for. 2789 * 2790 * Normally, that accounting is done on IO completion, but that can result 2791 * in more than a second's worth of IO being accounted for within any one 2792 * second, leading to >100% utilisation. To deal with that, we call this 2793 * function to do a round-off before returning the results when reading 2794 * /proc/diskstats. This accounts immediately for all queue usage up to 2795 * the current jiffies and restarts the counters again. 2796 */ 2797 void disk_round_stats(struct gendisk *disk) 2798 { 2799 unsigned long now = jiffies; 2800 2801 if (now == disk->stamp) 2802 return; 2803 2804 if (disk->in_flight) { 2805 __disk_stat_add(disk, time_in_queue, 2806 disk->in_flight * (now - disk->stamp)); 2807 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2808 } 2809 disk->stamp = now; 2810 } 2811 2812 EXPORT_SYMBOL_GPL(disk_round_stats); 2813 2814 /* 2815 * queue lock must be held 2816 */ 2817 void __blk_put_request(struct request_queue *q, struct request *req) 2818 { 2819 if (unlikely(!q)) 2820 return; 2821 if (unlikely(--req->ref_count)) 2822 return; 2823 2824 elv_completed_request(q, req); 2825 2826 /* 2827 * Request may not have originated from ll_rw_blk. 
if not, 2828 * it didn't come out of our reserved rq pools 2829 */ 2830 if (req->cmd_flags & REQ_ALLOCED) { 2831 int rw = rq_data_dir(req); 2832 int priv = req->cmd_flags & REQ_ELVPRIV; 2833 2834 BUG_ON(!list_empty(&req->queuelist)); 2835 BUG_ON(!hlist_unhashed(&req->hash)); 2836 2837 blk_free_request(q, req); 2838 freed_request(q, rw, priv); 2839 } 2840 } 2841 2842 EXPORT_SYMBOL_GPL(__blk_put_request); 2843 2844 void blk_put_request(struct request *req) 2845 { 2846 unsigned long flags; 2847 struct request_queue *q = req->q; 2848 2849 /* 2850 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2851 * following if (q) test. 2852 */ 2853 if (q) { 2854 spin_lock_irqsave(q->queue_lock, flags); 2855 __blk_put_request(q, req); 2856 spin_unlock_irqrestore(q->queue_lock, flags); 2857 } 2858 } 2859 2860 EXPORT_SYMBOL(blk_put_request); 2861 2862 /** 2863 * blk_end_sync_rq - executes a completion event on a request 2864 * @rq: request to complete 2865 * @error: end io status of the request 2866 */ 2867 void blk_end_sync_rq(struct request *rq, int error) 2868 { 2869 struct completion *waiting = rq->end_io_data; 2870 2871 rq->end_io_data = NULL; 2872 __blk_put_request(rq->q, rq); 2873 2874 /* 2875 * complete last, if this is a stack request the process (and thus 2876 * the rq pointer) could be invalid right after this complete() 2877 */ 2878 complete(waiting); 2879 } 2880 EXPORT_SYMBOL(blk_end_sync_rq); 2881 2882 /* 2883 * Has to be called with the request spinlock acquired 2884 */ 2885 static int attempt_merge(struct request_queue *q, struct request *req, 2886 struct request *next) 2887 { 2888 if (!rq_mergeable(req) || !rq_mergeable(next)) 2889 return 0; 2890 2891 /* 2892 * not contiguous 2893 */ 2894 if (req->sector + req->nr_sectors != next->sector) 2895 return 0; 2896 2897 if (rq_data_dir(req) != rq_data_dir(next) 2898 || req->rq_disk != next->rq_disk 2899 || next->special) 2900 return 0; 2901 2902 /* 2903 * If we are allowed to merge, then append bio list 2904 * from 
next to rq and release next. merge_requests_fn 2905 * will have updated segment counts, update sector 2906 * counts here. 2907 */ 2908 if (!ll_merge_requests_fn(q, req, next)) 2909 return 0; 2910 2911 /* 2912 * At this point we have either done a back merge 2913 * or front merge. We need the smaller start_time of 2914 * the merged requests to be the current request 2915 * for accounting purposes. 2916 */ 2917 if (time_after(req->start_time, next->start_time)) 2918 req->start_time = next->start_time; 2919 2920 req->biotail->bi_next = next->bio; 2921 req->biotail = next->biotail; 2922 2923 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2924 2925 elv_merge_requests(q, req, next); 2926 2927 if (req->rq_disk) { 2928 disk_round_stats(req->rq_disk); 2929 req->rq_disk->in_flight--; 2930 } 2931 2932 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2933 2934 __blk_put_request(q, next); 2935 return 1; 2936 } 2937 2938 static inline int attempt_back_merge(struct request_queue *q, 2939 struct request *rq) 2940 { 2941 struct request *next = elv_latter_request(q, rq); 2942 2943 if (next) 2944 return attempt_merge(q, rq, next); 2945 2946 return 0; 2947 } 2948 2949 static inline int attempt_front_merge(struct request_queue *q, 2950 struct request *rq) 2951 { 2952 struct request *prev = elv_former_request(q, rq); 2953 2954 if (prev) 2955 return attempt_merge(q, prev, rq); 2956 2957 return 0; 2958 } 2959 2960 static void init_request_from_bio(struct request *req, struct bio *bio) 2961 { 2962 req->cmd_type = REQ_TYPE_FS; 2963 2964 /* 2965 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2966 */ 2967 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2968 req->cmd_flags |= REQ_FAILFAST; 2969 2970 /* 2971 * REQ_BARRIER implies no merging, but lets make it explicit 2972 */ 2973 if (unlikely(bio_barrier(bio))) 2974 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2975 2976 if (bio_sync(bio)) 2977 req->cmd_flags |= REQ_RW_SYNC; 2978 if 
(bio_rw_meta(bio)) 2979 req->cmd_flags |= REQ_RW_META; 2980 2981 req->errors = 0; 2982 req->hard_sector = req->sector = bio->bi_sector; 2983 req->ioprio = bio_prio(bio); 2984 req->start_time = jiffies; 2985 blk_rq_bio_prep(req->q, req, bio); 2986 } 2987 2988 static int __make_request(struct request_queue *q, struct bio *bio) 2989 { 2990 struct request *req; 2991 int el_ret, nr_sectors, barrier, err; 2992 const unsigned short prio = bio_prio(bio); 2993 const int sync = bio_sync(bio); 2994 int rw_flags; 2995 2996 nr_sectors = bio_sectors(bio); 2997 2998 /* 2999 * low level driver can indicate that it wants pages above a 3000 * certain limit bounced to low memory (ie for highmem, or even 3001 * ISA dma in theory) 3002 */ 3003 blk_queue_bounce(q, &bio); 3004 3005 barrier = bio_barrier(bio); 3006 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 3007 err = -EOPNOTSUPP; 3008 goto end_io; 3009 } 3010 3011 spin_lock_irq(q->queue_lock); 3012 3013 if (unlikely(barrier) || elv_queue_empty(q)) 3014 goto get_rq; 3015 3016 el_ret = elv_merge(q, &req, bio); 3017 switch (el_ret) { 3018 case ELEVATOR_BACK_MERGE: 3019 BUG_ON(!rq_mergeable(req)); 3020 3021 if (!ll_back_merge_fn(q, req, bio)) 3022 break; 3023 3024 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 3025 3026 req->biotail->bi_next = bio; 3027 req->biotail = bio; 3028 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 3029 req->ioprio = ioprio_best(req->ioprio, prio); 3030 drive_stat_acct(req, 0); 3031 if (!attempt_back_merge(q, req)) 3032 elv_merged_request(q, req, el_ret); 3033 goto out; 3034 3035 case ELEVATOR_FRONT_MERGE: 3036 BUG_ON(!rq_mergeable(req)); 3037 3038 if (!ll_front_merge_fn(q, req, bio)) 3039 break; 3040 3041 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 3042 3043 bio->bi_next = req->bio; 3044 req->bio = bio; 3045 3046 /* 3047 * may not be valid. if the low level driver said 3048 * it didn't need a bounce buffer then it better 3049 * not touch req->buffer either... 
3050 */ 3051 req->buffer = bio_data(bio); 3052 req->current_nr_sectors = bio_cur_sectors(bio); 3053 req->hard_cur_sectors = req->current_nr_sectors; 3054 req->sector = req->hard_sector = bio->bi_sector; 3055 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 3056 req->ioprio = ioprio_best(req->ioprio, prio); 3057 drive_stat_acct(req, 0); 3058 if (!attempt_front_merge(q, req)) 3059 elv_merged_request(q, req, el_ret); 3060 goto out; 3061 3062 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 3063 default: 3064 ; 3065 } 3066 3067 get_rq: 3068 /* 3069 * This sync check and mask will be re-done in init_request_from_bio(), 3070 * but we need to set it earlier to expose the sync flag to the 3071 * rq allocator and io schedulers. 3072 */ 3073 rw_flags = bio_data_dir(bio); 3074 if (sync) 3075 rw_flags |= REQ_RW_SYNC; 3076 3077 /* 3078 * Grab a free request. This is might sleep but can not fail. 3079 * Returns with the queue unlocked. 3080 */ 3081 req = get_request_wait(q, rw_flags, bio); 3082 3083 /* 3084 * After dropping the lock and possibly sleeping here, our request 3085 * may now be mergeable after it had proven unmergeable (above). 3086 * We don't worry about that case for efficiency. It won't happen 3087 * often, and the elevators are able to handle it. 
3088 */ 3089 init_request_from_bio(req, bio); 3090 3091 spin_lock_irq(q->queue_lock); 3092 if (elv_queue_empty(q)) 3093 blk_plug_device(q); 3094 add_request(q, req); 3095 out: 3096 if (sync) 3097 __generic_unplug_device(q); 3098 3099 spin_unlock_irq(q->queue_lock); 3100 return 0; 3101 3102 end_io: 3103 bio_endio(bio, err); 3104 return 0; 3105 } 3106 3107 /* 3108 * If bio->bi_dev is a partition, remap the location 3109 */ 3110 static inline void blk_partition_remap(struct bio *bio) 3111 { 3112 struct block_device *bdev = bio->bi_bdev; 3113 3114 if (bio_sectors(bio) && bdev != bdev->bd_contains) { 3115 struct hd_struct *p = bdev->bd_part; 3116 const int rw = bio_data_dir(bio); 3117 3118 p->sectors[rw] += bio_sectors(bio); 3119 p->ios[rw]++; 3120 3121 bio->bi_sector += p->start_sect; 3122 bio->bi_bdev = bdev->bd_contains; 3123 3124 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, 3125 bdev->bd_dev, bio->bi_sector, 3126 bio->bi_sector - p->start_sect); 3127 } 3128 } 3129 3130 static void handle_bad_sector(struct bio *bio) 3131 { 3132 char b[BDEVNAME_SIZE]; 3133 3134 printk(KERN_INFO "attempt to access beyond end of device\n"); 3135 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 3136 bdevname(bio->bi_bdev, b), 3137 bio->bi_rw, 3138 (unsigned long long)bio->bi_sector + bio_sectors(bio), 3139 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 3140 3141 set_bit(BIO_EOF, &bio->bi_flags); 3142 } 3143 3144 #ifdef CONFIG_FAIL_MAKE_REQUEST 3145 3146 static DECLARE_FAULT_ATTR(fail_make_request); 3147 3148 static int __init setup_fail_make_request(char *str) 3149 { 3150 return setup_fault_attr(&fail_make_request, str); 3151 } 3152 __setup("fail_make_request=", setup_fail_make_request); 3153 3154 static int should_fail_request(struct bio *bio) 3155 { 3156 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 3157 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 3158 return should_fail(&fail_make_request, bio->bi_size); 3159 3160 return 0; 3161 } 3162 
3163 static int __init fail_make_request_debugfs(void) 3164 { 3165 return init_fault_attr_dentries(&fail_make_request, 3166 "fail_make_request"); 3167 } 3168 3169 late_initcall(fail_make_request_debugfs); 3170 3171 #else /* CONFIG_FAIL_MAKE_REQUEST */ 3172 3173 static inline int should_fail_request(struct bio *bio) 3174 { 3175 return 0; 3176 } 3177 3178 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 3179 3180 /* 3181 * Check whether this bio extends beyond the end of the device. 3182 */ 3183 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 3184 { 3185 sector_t maxsector; 3186 3187 if (!nr_sectors) 3188 return 0; 3189 3190 /* Test device or partition size, when known. */ 3191 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3192 if (maxsector) { 3193 sector_t sector = bio->bi_sector; 3194 3195 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 3196 /* 3197 * This may well happen - the kernel calls bread() 3198 * without checking the size of the device, e.g., when 3199 * mounting a device. 3200 */ 3201 handle_bad_sector(bio); 3202 return 1; 3203 } 3204 } 3205 3206 return 0; 3207 } 3208 3209 /** 3210 * generic_make_request: hand a buffer to its device driver for I/O 3211 * @bio: The bio describing the location in memory and on the device. 3212 * 3213 * generic_make_request() is used to make I/O requests of block 3214 * devices. It is passed a &struct bio, which describes the I/O that needs 3215 * to be done. 3216 * 3217 * generic_make_request() does not return any status. The 3218 * success/failure status of the request, along with notification of 3219 * completion, is delivered asynchronously through the bio->bi_end_io 3220 * function described (one day) else where. 
3221 * 3222 * The caller of generic_make_request must make sure that bi_io_vec 3223 * are set to describe the memory buffer, and that bi_dev and bi_sector are 3224 * set to describe the device address, and the 3225 * bi_end_io and optionally bi_private are set to describe how 3226 * completion notification should be signaled. 3227 * 3228 * generic_make_request and the drivers it calls may use bi_next if this 3229 * bio happens to be merged with someone else, and may change bi_dev and 3230 * bi_sector for remaps as it sees fit. So the values of these fields 3231 * should NOT be depended on after the call to generic_make_request. 3232 */ 3233 static inline void __generic_make_request(struct bio *bio) 3234 { 3235 struct request_queue *q; 3236 sector_t old_sector; 3237 int ret, nr_sectors = bio_sectors(bio); 3238 dev_t old_dev; 3239 int err = -EIO; 3240 3241 might_sleep(); 3242 3243 if (bio_check_eod(bio, nr_sectors)) 3244 goto end_io; 3245 3246 /* 3247 * Resolve the mapping until finished. (drivers are 3248 * still free to implement/resolve their own stacking 3249 * by explicitly returning 0) 3250 * 3251 * NOTE: we don't repeat the blk_size check for each new device. 3252 * Stacking drivers are expected to know what they are doing. 
3253 */ 3254 old_sector = -1; 3255 old_dev = 0; 3256 do { 3257 char b[BDEVNAME_SIZE]; 3258 3259 q = bdev_get_queue(bio->bi_bdev); 3260 if (!q) { 3261 printk(KERN_ERR 3262 "generic_make_request: Trying to access " 3263 "nonexistent block-device %s (%Lu)\n", 3264 bdevname(bio->bi_bdev, b), 3265 (long long) bio->bi_sector); 3266 end_io: 3267 bio_endio(bio, err); 3268 break; 3269 } 3270 3271 if (unlikely(nr_sectors > q->max_hw_sectors)) { 3272 printk("bio too big device %s (%u > %u)\n", 3273 bdevname(bio->bi_bdev, b), 3274 bio_sectors(bio), 3275 q->max_hw_sectors); 3276 goto end_io; 3277 } 3278 3279 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3280 goto end_io; 3281 3282 if (should_fail_request(bio)) 3283 goto end_io; 3284 3285 /* 3286 * If this device has partitions, remap block n 3287 * of partition p to block n+start(p) of the disk. 3288 */ 3289 blk_partition_remap(bio); 3290 3291 if (old_sector != -1) 3292 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 3293 old_sector); 3294 3295 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 3296 3297 old_sector = bio->bi_sector; 3298 old_dev = bio->bi_bdev->bd_dev; 3299 3300 if (bio_check_eod(bio, nr_sectors)) 3301 goto end_io; 3302 if (bio_empty_barrier(bio) && !q->prepare_flush_fn) { 3303 err = -EOPNOTSUPP; 3304 goto end_io; 3305 } 3306 3307 ret = q->make_request_fn(q, bio); 3308 } while (ret); 3309 } 3310 3311 /* 3312 * We only want one ->make_request_fn to be active at a time, 3313 * else stack usage with stacked devices could be a problem. 3314 * So use current->bio_{list,tail} to keep a list of requests 3315 * submited by a make_request_fn function. 3316 * current->bio_tail is also used as a flag to say if 3317 * generic_make_request is currently active in this task or not. 3318 * If it is NULL, then no make_request is active. 
If it is non-NULL, 3319 * then a make_request is active, and new requests should be added 3320 * at the tail 3321 */ 3322 void generic_make_request(struct bio *bio) 3323 { 3324 if (current->bio_tail) { 3325 /* make_request is active */ 3326 *(current->bio_tail) = bio; 3327 bio->bi_next = NULL; 3328 current->bio_tail = &bio->bi_next; 3329 return; 3330 } 3331 /* following loop may be a bit non-obvious, and so deserves some 3332 * explanation. 3333 * Before entering the loop, bio->bi_next is NULL (as all callers 3334 * ensure that) so we have a list with a single bio. 3335 * We pretend that we have just taken it off a longer list, so 3336 * we assign bio_list to the next (which is NULL) and bio_tail 3337 * to &bio_list, thus initialising the bio_list of new bios to be 3338 * added. __generic_make_request may indeed add some more bios 3339 * through a recursive call to generic_make_request. If it 3340 * did, we find a non-NULL value in bio_list and re-enter the loop 3341 * from the top. In this case we really did just take the bio 3342 * of the top of the list (no pretending) and so fixup bio_list and 3343 * bio_tail or bi_next, and call into __generic_make_request again. 3344 * 3345 * The loop was structured like this to make only one call to 3346 * __generic_make_request (which is important as it is large and 3347 * inlined) and to keep the structure simple. 
3348 */ 3349 BUG_ON(bio->bi_next); 3350 do { 3351 current->bio_list = bio->bi_next; 3352 if (bio->bi_next == NULL) 3353 current->bio_tail = &current->bio_list; 3354 else 3355 bio->bi_next = NULL; 3356 __generic_make_request(bio); 3357 bio = current->bio_list; 3358 } while (bio); 3359 current->bio_tail = NULL; /* deactivate */ 3360 } 3361 3362 EXPORT_SYMBOL(generic_make_request); 3363 3364 /** 3365 * submit_bio: submit a bio to the block device layer for I/O 3366 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3367 * @bio: The &struct bio which describes the I/O 3368 * 3369 * submit_bio() is very similar in purpose to generic_make_request(), and 3370 * uses that function to do most of the work. Both are fairly rough 3371 * interfaces, @bio must be presetup and ready for I/O. 3372 * 3373 */ 3374 void submit_bio(int rw, struct bio *bio) 3375 { 3376 int count = bio_sectors(bio); 3377 3378 bio->bi_rw |= rw; 3379 3380 /* 3381 * If it's a regular read/write or a barrier with data attached, 3382 * go through the normal accounting stuff before submission. 3383 */ 3384 if (!bio_empty_barrier(bio)) { 3385 3386 BIO_BUG_ON(!bio->bi_size); 3387 BIO_BUG_ON(!bio->bi_io_vec); 3388 3389 if (rw & WRITE) { 3390 count_vm_events(PGPGOUT, count); 3391 } else { 3392 task_io_account_read(bio->bi_size); 3393 count_vm_events(PGPGIN, count); 3394 } 3395 3396 if (unlikely(block_dump)) { 3397 char b[BDEVNAME_SIZE]; 3398 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3399 current->comm, task_pid_nr(current), 3400 (rw & WRITE) ? "WRITE" : "READ", 3401 (unsigned long long)bio->bi_sector, 3402 bdevname(bio->bi_bdev,b)); 3403 } 3404 } 3405 3406 generic_make_request(bio); 3407 } 3408 3409 EXPORT_SYMBOL(submit_bio); 3410 3411 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3412 { 3413 if (blk_fs_request(rq)) { 3414 rq->hard_sector += nsect; 3415 rq->hard_nr_sectors -= nsect; 3416 3417 /* 3418 * Move the I/O submission pointers ahead if required. 
3419 */ 3420 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3421 (rq->sector <= rq->hard_sector)) { 3422 rq->sector = rq->hard_sector; 3423 rq->nr_sectors = rq->hard_nr_sectors; 3424 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3425 rq->current_nr_sectors = rq->hard_cur_sectors; 3426 rq->buffer = bio_data(rq->bio); 3427 } 3428 3429 /* 3430 * if total number of sectors is less than the first segment 3431 * size, something has gone terribly wrong 3432 */ 3433 if (rq->nr_sectors < rq->current_nr_sectors) { 3434 printk("blk: request botched\n"); 3435 rq->nr_sectors = rq->current_nr_sectors; 3436 } 3437 } 3438 } 3439 3440 static int __end_that_request_first(struct request *req, int uptodate, 3441 int nr_bytes) 3442 { 3443 int total_bytes, bio_nbytes, error, next_idx = 0; 3444 struct bio *bio; 3445 3446 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 3447 3448 /* 3449 * extend uptodate bool to allow < 0 value to be direct io error 3450 */ 3451 error = 0; 3452 if (end_io_error(uptodate)) 3453 error = !uptodate ? -EIO : uptodate; 3454 3455 /* 3456 * for a REQ_BLOCK_PC request, we want to carry any eventual 3457 * sense key with us all the way through 3458 */ 3459 if (!blk_pc_request(req)) 3460 req->errors = 0; 3461 3462 if (!uptodate) { 3463 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) 3464 printk("end_request: I/O error, dev %s, sector %llu\n", 3465 req->rq_disk ? req->rq_disk->disk_name : "?", 3466 (unsigned long long)req->sector); 3467 } 3468 3469 if (blk_fs_request(req) && req->rq_disk) { 3470 const int rw = rq_data_dir(req); 3471 3472 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3473 } 3474 3475 total_bytes = bio_nbytes = 0; 3476 while ((bio = req->bio) != NULL) { 3477 int nbytes; 3478 3479 /* 3480 * For an empty barrier request, the low level driver must 3481 * store a potential error location in ->sector. We pass 3482 * that back up in ->bi_sector. 
3483 */ 3484 if (blk_empty_barrier(req)) 3485 bio->bi_sector = req->sector; 3486 3487 if (nr_bytes >= bio->bi_size) { 3488 req->bio = bio->bi_next; 3489 nbytes = bio->bi_size; 3490 req_bio_endio(req, bio, nbytes, error); 3491 next_idx = 0; 3492 bio_nbytes = 0; 3493 } else { 3494 int idx = bio->bi_idx + next_idx; 3495 3496 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3497 blk_dump_rq_flags(req, "__end_that"); 3498 printk("%s: bio idx %d >= vcnt %d\n", 3499 __FUNCTION__, 3500 bio->bi_idx, bio->bi_vcnt); 3501 break; 3502 } 3503 3504 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3505 BIO_BUG_ON(nbytes > bio->bi_size); 3506 3507 /* 3508 * not a complete bvec done 3509 */ 3510 if (unlikely(nbytes > nr_bytes)) { 3511 bio_nbytes += nr_bytes; 3512 total_bytes += nr_bytes; 3513 break; 3514 } 3515 3516 /* 3517 * advance to the next vector 3518 */ 3519 next_idx++; 3520 bio_nbytes += nbytes; 3521 } 3522 3523 total_bytes += nbytes; 3524 nr_bytes -= nbytes; 3525 3526 if ((bio = req->bio)) { 3527 /* 3528 * end more in this run, or just return 'not-done' 3529 */ 3530 if (unlikely(nr_bytes <= 0)) 3531 break; 3532 } 3533 } 3534 3535 /* 3536 * completely done 3537 */ 3538 if (!req->bio) 3539 return 0; 3540 3541 /* 3542 * if the request wasn't completed, update state 3543 */ 3544 if (bio_nbytes) { 3545 req_bio_endio(req, bio, bio_nbytes, error); 3546 bio->bi_idx += next_idx; 3547 bio_iovec(bio)->bv_offset += nr_bytes; 3548 bio_iovec(bio)->bv_len -= nr_bytes; 3549 } 3550 3551 blk_recalc_rq_sectors(req, total_bytes >> 9); 3552 blk_recalc_rq_segments(req); 3553 return 1; 3554 } 3555 3556 /** 3557 * end_that_request_first - end I/O on a request 3558 * @req: the request being processed 3559 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3560 * @nr_sectors: number of sectors to end I/O on 3561 * 3562 * Description: 3563 * Ends I/O on a number of sectors attached to @req, and sets it up 3564 * for the next range of segments (if any) in the cluster. 
3565 * 3566 * Return: 3567 * 0 - we are done with this request, call end_that_request_last() 3568 * 1 - still buffers pending for this request 3569 **/ 3570 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3571 { 3572 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3573 } 3574 3575 EXPORT_SYMBOL(end_that_request_first); 3576 3577 /** 3578 * end_that_request_chunk - end I/O on a request 3579 * @req: the request being processed 3580 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3581 * @nr_bytes: number of bytes to complete 3582 * 3583 * Description: 3584 * Ends I/O on a number of bytes attached to @req, and sets it up 3585 * for the next range of segments (if any). Like end_that_request_first(), 3586 * but deals with bytes instead of sectors. 3587 * 3588 * Return: 3589 * 0 - we are done with this request, call end_that_request_last() 3590 * 1 - still buffers pending for this request 3591 **/ 3592 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3593 { 3594 return __end_that_request_first(req, uptodate, nr_bytes); 3595 } 3596 3597 EXPORT_SYMBOL(end_that_request_chunk); 3598 3599 /* 3600 * splice the completion data to a local structure and hand off to 3601 * process_completion_queue() to complete the requests 3602 */ 3603 static void blk_done_softirq(struct softirq_action *h) 3604 { 3605 struct list_head *cpu_list, local_list; 3606 3607 local_irq_disable(); 3608 cpu_list = &__get_cpu_var(blk_cpu_done); 3609 list_replace_init(cpu_list, &local_list); 3610 local_irq_enable(); 3611 3612 while (!list_empty(&local_list)) { 3613 struct request *rq = list_entry(local_list.next, struct request, donelist); 3614 3615 list_del_init(&rq->donelist); 3616 rq->q->softirq_done_fn(rq); 3617 } 3618 } 3619 3620 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, 3621 void *hcpu) 3622 { 3623 /* 3624 * If a CPU goes away, splice its entries to the current CPU 
3625 * and trigger a run of the softirq 3626 */ 3627 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3628 int cpu = (unsigned long) hcpu; 3629 3630 local_irq_disable(); 3631 list_splice_init(&per_cpu(blk_cpu_done, cpu), 3632 &__get_cpu_var(blk_cpu_done)); 3633 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3634 local_irq_enable(); 3635 } 3636 3637 return NOTIFY_OK; 3638 } 3639 3640 3641 static struct notifier_block blk_cpu_notifier __cpuinitdata = { 3642 .notifier_call = blk_cpu_notify, 3643 }; 3644 3645 /** 3646 * blk_complete_request - end I/O on a request 3647 * @req: the request being processed 3648 * 3649 * Description: 3650 * Ends all I/O on a request. It does not handle partial completions, 3651 * unless the driver actually implements this in its completion callback 3652 * through requeueing. The actual completion happens out-of-order, 3653 * through a softirq handler. The user must have registered a completion 3654 * callback through blk_queue_softirq_done(). 3655 **/ 3656 3657 void blk_complete_request(struct request *req) 3658 { 3659 struct list_head *cpu_list; 3660 unsigned long flags; 3661 3662 BUG_ON(!req->q->softirq_done_fn); 3663 3664 local_irq_save(flags); 3665 3666 cpu_list = &__get_cpu_var(blk_cpu_done); 3667 list_add_tail(&req->donelist, cpu_list); 3668 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3669 3670 local_irq_restore(flags); 3671 } 3672 3673 EXPORT_SYMBOL(blk_complete_request); 3674 3675 /* 3676 * queue lock must be held 3677 */ 3678 void end_that_request_last(struct request *req, int uptodate) 3679 { 3680 struct gendisk *disk = req->rq_disk; 3681 int error; 3682 3683 /* 3684 * extend uptodate bool to allow < 0 value to be direct io error 3685 */ 3686 error = 0; 3687 if (end_io_error(uptodate)) 3688 error = !uptodate ? -EIO : uptodate; 3689 3690 if (unlikely(laptop_mode) && blk_fs_request(req)) 3691 laptop_io_completion(); 3692 3693 /* 3694 * Account IO completion. bar_rq isn't accounted as a normal 3695 * IO on queueing nor completion. 
Accounting the containing 3696 * request is enough. 3697 */ 3698 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 3699 unsigned long duration = jiffies - req->start_time; 3700 const int rw = rq_data_dir(req); 3701 3702 __disk_stat_inc(disk, ios[rw]); 3703 __disk_stat_add(disk, ticks[rw], duration); 3704 disk_round_stats(disk); 3705 disk->in_flight--; 3706 } 3707 if (req->end_io) 3708 req->end_io(req, error); 3709 else 3710 __blk_put_request(req->q, req); 3711 } 3712 3713 EXPORT_SYMBOL(end_that_request_last); 3714 3715 static inline void __end_request(struct request *rq, int uptodate, 3716 unsigned int nr_bytes, int dequeue) 3717 { 3718 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) { 3719 if (dequeue) 3720 blkdev_dequeue_request(rq); 3721 add_disk_randomness(rq->rq_disk); 3722 end_that_request_last(rq, uptodate); 3723 } 3724 } 3725 3726 static unsigned int rq_byte_size(struct request *rq) 3727 { 3728 if (blk_fs_request(rq)) 3729 return rq->hard_nr_sectors << 9; 3730 3731 return rq->data_len; 3732 } 3733 3734 /** 3735 * end_queued_request - end all I/O on a queued request 3736 * @rq: the request being processed 3737 * @uptodate: error value or 0/1 uptodate flag 3738 * 3739 * Description: 3740 * Ends all I/O on a request, and removes it from the block layer queues. 3741 * Not suitable for normal IO completion, unless the driver still has 3742 * the request attached to the block layer. 3743 * 3744 **/ 3745 void end_queued_request(struct request *rq, int uptodate) 3746 { 3747 __end_request(rq, uptodate, rq_byte_size(rq), 1); 3748 } 3749 EXPORT_SYMBOL(end_queued_request); 3750 3751 /** 3752 * end_dequeued_request - end all I/O on a dequeued request 3753 * @rq: the request being processed 3754 * @uptodate: error value or 0/1 uptodate flag 3755 * 3756 * Description: 3757 * Ends all I/O on a request. The request must already have been 3758 * dequeued using blkdev_dequeue_request(), as is normally the case 3759 * for most drivers. 
3760 * 3761 **/ 3762 void end_dequeued_request(struct request *rq, int uptodate) 3763 { 3764 __end_request(rq, uptodate, rq_byte_size(rq), 0); 3765 } 3766 EXPORT_SYMBOL(end_dequeued_request); 3767 3768 3769 /** 3770 * end_request - end I/O on the current segment of the request 3771 * @req: the request being processed 3772 * @uptodate: error value or 0/1 uptodate flag 3773 * 3774 * Description: 3775 * Ends I/O on the current segment of a request. If that is the only 3776 * remaining segment, the request is also completed and freed. 3777 * 3778 * This is a remnant of how older block drivers handled IO completions. 3779 * Modern drivers typically end IO on the full request in one go, unless 3780 * they have a residual value to account for. For that case this function 3781 * isn't really useful, unless the residual just happens to be the 3782 * full current segment. In other words, don't use this function in new 3783 * code. Either use end_request_completely(), or the 3784 * end_that_request_chunk() (along with end_that_request_last()) for 3785 * partial completions. 
3786 * 3787 **/ 3788 void end_request(struct request *req, int uptodate) 3789 { 3790 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1); 3791 } 3792 EXPORT_SYMBOL(end_request); 3793 3794 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 3795 struct bio *bio) 3796 { 3797 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 3798 rq->cmd_flags |= (bio->bi_rw & 3); 3799 3800 rq->nr_phys_segments = bio_phys_segments(q, bio); 3801 rq->nr_hw_segments = bio_hw_segments(q, bio); 3802 rq->current_nr_sectors = bio_cur_sectors(bio); 3803 rq->hard_cur_sectors = rq->current_nr_sectors; 3804 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3805 rq->buffer = bio_data(bio); 3806 rq->data_len = bio->bi_size; 3807 3808 rq->bio = rq->biotail = bio; 3809 3810 if (bio->bi_bdev) 3811 rq->rq_disk = bio->bi_bdev->bd_disk; 3812 } 3813 3814 int kblockd_schedule_work(struct work_struct *work) 3815 { 3816 return queue_work(kblockd_workqueue, work); 3817 } 3818 3819 EXPORT_SYMBOL(kblockd_schedule_work); 3820 3821 void kblockd_flush_work(struct work_struct *work) 3822 { 3823 cancel_work_sync(work); 3824 } 3825 EXPORT_SYMBOL(kblockd_flush_work); 3826 3827 int __init blk_dev_init(void) 3828 { 3829 int i; 3830 3831 kblockd_workqueue = create_workqueue("kblockd"); 3832 if (!kblockd_workqueue) 3833 panic("Failed to create kblockd\n"); 3834 3835 request_cachep = kmem_cache_create("blkdev_requests", 3836 sizeof(struct request), 0, SLAB_PANIC, NULL); 3837 3838 requestq_cachep = kmem_cache_create("blkdev_queue", 3839 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 3840 3841 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3842 sizeof(struct io_context), 0, SLAB_PANIC, NULL); 3843 3844 for_each_possible_cpu(i) 3845 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3846 3847 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3848 register_hotcpu_notifier(&blk_cpu_notifier); 3849 3850 blk_max_low_pfn = max_low_pfn - 1; 3851 blk_max_pfn = max_pfn - 1; 
3852 3853 return 0; 3854 } 3855 3856 /* 3857 * IO Context helper functions 3858 */ 3859 void put_io_context(struct io_context *ioc) 3860 { 3861 if (ioc == NULL) 3862 return; 3863 3864 BUG_ON(atomic_read(&ioc->refcount) == 0); 3865 3866 if (atomic_dec_and_test(&ioc->refcount)) { 3867 struct cfq_io_context *cic; 3868 3869 rcu_read_lock(); 3870 if (ioc->aic && ioc->aic->dtor) 3871 ioc->aic->dtor(ioc->aic); 3872 if (ioc->cic_root.rb_node != NULL) { 3873 struct rb_node *n = rb_first(&ioc->cic_root); 3874 3875 cic = rb_entry(n, struct cfq_io_context, rb_node); 3876 cic->dtor(ioc); 3877 } 3878 rcu_read_unlock(); 3879 3880 kmem_cache_free(iocontext_cachep, ioc); 3881 } 3882 } 3883 EXPORT_SYMBOL(put_io_context); 3884 3885 /* Called by the exitting task */ 3886 void exit_io_context(void) 3887 { 3888 struct io_context *ioc; 3889 struct cfq_io_context *cic; 3890 3891 task_lock(current); 3892 ioc = current->io_context; 3893 current->io_context = NULL; 3894 task_unlock(current); 3895 3896 ioc->task = NULL; 3897 if (ioc->aic && ioc->aic->exit) 3898 ioc->aic->exit(ioc->aic); 3899 if (ioc->cic_root.rb_node != NULL) { 3900 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); 3901 cic->exit(ioc); 3902 } 3903 3904 put_io_context(ioc); 3905 } 3906 3907 /* 3908 * If the current task has no IO context then create one and initialise it. 3909 * Otherwise, return its existing IO context. 3910 * 3911 * This returned IO context doesn't have a specifically elevated refcount, 3912 * but since the current task itself holds a reference, the context can be 3913 * used in general code, so long as it stays within `current` context. 
3914 */ 3915 static struct io_context *current_io_context(gfp_t gfp_flags, int node) 3916 { 3917 struct task_struct *tsk = current; 3918 struct io_context *ret; 3919 3920 ret = tsk->io_context; 3921 if (likely(ret)) 3922 return ret; 3923 3924 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 3925 if (ret) { 3926 atomic_set(&ret->refcount, 1); 3927 ret->task = current; 3928 ret->ioprio_changed = 0; 3929 ret->last_waited = jiffies; /* doesn't matter... */ 3930 ret->nr_batch_requests = 0; /* because this is 0 */ 3931 ret->aic = NULL; 3932 ret->cic_root.rb_node = NULL; 3933 ret->ioc_data = NULL; 3934 /* make sure set_task_ioprio() sees the settings above */ 3935 smp_wmb(); 3936 tsk->io_context = ret; 3937 } 3938 3939 return ret; 3940 } 3941 3942 /* 3943 * If the current task has no IO context then create one and initialise it. 3944 * If it does have a context, take a ref on it. 3945 * 3946 * This is always called in the context of the task which submitted the I/O. 3947 */ 3948 struct io_context *get_io_context(gfp_t gfp_flags, int node) 3949 { 3950 struct io_context *ret; 3951 ret = current_io_context(gfp_flags, node); 3952 if (likely(ret)) 3953 atomic_inc(&ret->refcount); 3954 return ret; 3955 } 3956 EXPORT_SYMBOL(get_io_context); 3957 3958 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3959 { 3960 struct io_context *src = *psrc; 3961 struct io_context *dst = *pdst; 3962 3963 if (src) { 3964 BUG_ON(atomic_read(&src->refcount) == 0); 3965 atomic_inc(&src->refcount); 3966 put_io_context(dst); 3967 *pdst = src; 3968 } 3969 } 3970 EXPORT_SYMBOL(copy_io_context); 3971 3972 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3973 { 3974 struct io_context *temp; 3975 temp = *ioc1; 3976 *ioc1 = *ioc2; 3977 *ioc2 = temp; 3978 } 3979 EXPORT_SYMBOL(swap_io_context); 3980 3981 /* 3982 * sysfs parts below 3983 */ 3984 struct queue_sysfs_entry { 3985 struct attribute attr; 3986 ssize_t (*show)(struct request_queue *, char 
*); 3987 ssize_t (*store)(struct request_queue *, const char *, size_t); 3988 }; 3989 3990 static ssize_t 3991 queue_var_show(unsigned int var, char *page) 3992 { 3993 return sprintf(page, "%d\n", var); 3994 } 3995 3996 static ssize_t 3997 queue_var_store(unsigned long *var, const char *page, size_t count) 3998 { 3999 char *p = (char *) page; 4000 4001 *var = simple_strtoul(p, &p, 10); 4002 return count; 4003 } 4004 4005 static ssize_t queue_requests_show(struct request_queue *q, char *page) 4006 { 4007 return queue_var_show(q->nr_requests, (page)); 4008 } 4009 4010 static ssize_t 4011 queue_requests_store(struct request_queue *q, const char *page, size_t count) 4012 { 4013 struct request_list *rl = &q->rq; 4014 unsigned long nr; 4015 int ret = queue_var_store(&nr, page, count); 4016 if (nr < BLKDEV_MIN_RQ) 4017 nr = BLKDEV_MIN_RQ; 4018 4019 spin_lock_irq(q->queue_lock); 4020 q->nr_requests = nr; 4021 blk_queue_congestion_threshold(q); 4022 4023 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 4024 blk_set_queue_congested(q, READ); 4025 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 4026 blk_clear_queue_congested(q, READ); 4027 4028 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 4029 blk_set_queue_congested(q, WRITE); 4030 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 4031 blk_clear_queue_congested(q, WRITE); 4032 4033 if (rl->count[READ] >= q->nr_requests) { 4034 blk_set_queue_full(q, READ); 4035 } else if (rl->count[READ]+1 <= q->nr_requests) { 4036 blk_clear_queue_full(q, READ); 4037 wake_up(&rl->wait[READ]); 4038 } 4039 4040 if (rl->count[WRITE] >= q->nr_requests) { 4041 blk_set_queue_full(q, WRITE); 4042 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 4043 blk_clear_queue_full(q, WRITE); 4044 wake_up(&rl->wait[WRITE]); 4045 } 4046 spin_unlock_irq(q->queue_lock); 4047 return ret; 4048 } 4049 4050 static ssize_t queue_ra_show(struct request_queue *q, char *page) 4051 { 4052 int ra_kb = 
q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 4053 4054 return queue_var_show(ra_kb, (page)); 4055 } 4056 4057 static ssize_t 4058 queue_ra_store(struct request_queue *q, const char *page, size_t count) 4059 { 4060 unsigned long ra_kb; 4061 ssize_t ret = queue_var_store(&ra_kb, page, count); 4062 4063 spin_lock_irq(q->queue_lock); 4064 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 4065 spin_unlock_irq(q->queue_lock); 4066 4067 return ret; 4068 } 4069 4070 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 4071 { 4072 int max_sectors_kb = q->max_sectors >> 1; 4073 4074 return queue_var_show(max_sectors_kb, (page)); 4075 } 4076 4077 static ssize_t 4078 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 4079 { 4080 unsigned long max_sectors_kb, 4081 max_hw_sectors_kb = q->max_hw_sectors >> 1, 4082 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 4083 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 4084 4085 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 4086 return -EINVAL; 4087 /* 4088 * Take the queue lock to update the readahead and max_sectors 4089 * values synchronously: 4090 */ 4091 spin_lock_irq(q->queue_lock); 4092 q->max_sectors = max_sectors_kb << 1; 4093 spin_unlock_irq(q->queue_lock); 4094 4095 return ret; 4096 } 4097 4098 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 4099 { 4100 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 4101 4102 return queue_var_show(max_hw_sectors_kb, (page)); 4103 } 4104 4105 4106 static struct queue_sysfs_entry queue_requests_entry = { 4107 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 4108 .show = queue_requests_show, 4109 .store = queue_requests_store, 4110 }; 4111 4112 static struct queue_sysfs_entry queue_ra_entry = { 4113 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 4114 .show = queue_ra_show, 4115 .store = queue_ra_store, 4116 }; 4117 4118 static 
struct queue_sysfs_entry queue_max_sectors_entry = { 4119 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 4120 .show = queue_max_sectors_show, 4121 .store = queue_max_sectors_store, 4122 }; 4123 4124 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 4125 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 4126 .show = queue_max_hw_sectors_show, 4127 }; 4128 4129 static struct queue_sysfs_entry queue_iosched_entry = { 4130 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 4131 .show = elv_iosched_show, 4132 .store = elv_iosched_store, 4133 }; 4134 4135 static struct attribute *default_attrs[] = { 4136 &queue_requests_entry.attr, 4137 &queue_ra_entry.attr, 4138 &queue_max_hw_sectors_entry.attr, 4139 &queue_max_sectors_entry.attr, 4140 &queue_iosched_entry.attr, 4141 NULL, 4142 }; 4143 4144 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 4145 4146 static ssize_t 4147 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4148 { 4149 struct queue_sysfs_entry *entry = to_queue(attr); 4150 struct request_queue *q = 4151 container_of(kobj, struct request_queue, kobj); 4152 ssize_t res; 4153 4154 if (!entry->show) 4155 return -EIO; 4156 mutex_lock(&q->sysfs_lock); 4157 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4158 mutex_unlock(&q->sysfs_lock); 4159 return -ENOENT; 4160 } 4161 res = entry->show(q, page); 4162 mutex_unlock(&q->sysfs_lock); 4163 return res; 4164 } 4165 4166 static ssize_t 4167 queue_attr_store(struct kobject *kobj, struct attribute *attr, 4168 const char *page, size_t length) 4169 { 4170 struct queue_sysfs_entry *entry = to_queue(attr); 4171 struct request_queue *q = container_of(kobj, struct request_queue, kobj); 4172 4173 ssize_t res; 4174 4175 if (!entry->store) 4176 return -EIO; 4177 mutex_lock(&q->sysfs_lock); 4178 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4179 mutex_unlock(&q->sysfs_lock); 4180 return -ENOENT; 4181 } 4182 res = entry->store(q, page, 
length); 4183 mutex_unlock(&q->sysfs_lock); 4184 return res; 4185 } 4186 4187 static struct sysfs_ops queue_sysfs_ops = { 4188 .show = queue_attr_show, 4189 .store = queue_attr_store, 4190 }; 4191 4192 static struct kobj_type queue_ktype = { 4193 .sysfs_ops = &queue_sysfs_ops, 4194 .default_attrs = default_attrs, 4195 .release = blk_release_queue, 4196 }; 4197 4198 int blk_register_queue(struct gendisk *disk) 4199 { 4200 int ret; 4201 4202 struct request_queue *q = disk->queue; 4203 4204 if (!q || !q->request_fn) 4205 return -ENXIO; 4206 4207 ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj), 4208 "%s", "queue"); 4209 if (ret < 0) 4210 return ret; 4211 4212 kobject_uevent(&q->kobj, KOBJ_ADD); 4213 4214 ret = elv_register_queue(q); 4215 if (ret) { 4216 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4217 kobject_del(&q->kobj); 4218 return ret; 4219 } 4220 4221 return 0; 4222 } 4223 4224 void blk_unregister_queue(struct gendisk *disk) 4225 { 4226 struct request_queue *q = disk->queue; 4227 4228 if (q && q->request_fn) { 4229 elv_unregister_queue(q); 4230 4231 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4232 kobject_del(&q->kobj); 4233 kobject_put(&disk->dev.kobj); 4234 } 4235 }

Cache object: 3e739f6926cd56daabb2c9ad2ef6cdab

FreeBSD/Linux Kernel Cross Reference sys/block/ll_rw_blk.c

FreeBSD/Linux Kernel Cross Reference
sys/block/ll_rw_blk.c