err.no Git - linux-2.6/commitdiff
[BLOCK] Reimplement elevator switch
author Tejun Heo <htejun@gmail.com>
Fri, 28 Oct 2005 06:29:39 +0000 (08:29 +0200)
committer Jens Axboe <axboe@nelson.home.kernel.dk>
Fri, 28 Oct 2005 06:48:12 +0000 (08:48 +0200)
 This patch reimplements the elevator switch.  It assumes the generic
dispatch queue patchset is applied.

 * Each request is tagged with the REQ_ELVPRIV flag if it has its
   elevator private data set.
 * Requests which don't have the REQ_ELVPRIV flag set never enter the
   iosched; they are always inserted directly at the back of the
   dispatch queue.  Of course, elevator_put_req_fn is called only for
   requests which have REQ_ELVPRIV set.
 * The request queue maintains the current number of requests which
   have elevator data set (elevator_set_req_fn called) in
   q->rq.elvpriv.
 * If a request queue has QUEUE_FLAG_BYPASS set, elevator private data
   is not allocated for new requests (see the allocation sketch after
   this list).
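
 A condensed sketch of the allocation side, following the
get_request()/blk_alloc_request() changes further down in this patch
(request-list accounting shown, error paths trimmed):

	/* in get_request(), under q->queue_lock */
	priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
	if (priv)
		rl->elvpriv++;		/* one more request carrying elevator data */
	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);

	/* in blk_alloc_request() */
	rq->flags = rw;
	if (priv) {
		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
			mempool_free(rq, q->rq.rq_pool);
			return NULL;
		}
		rq->flags |= REQ_ELVPRIV;	/* put_req_fn will be called on free */
	}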

 To switch to another iosched, we set QUEUE_FLAG_BYPASS and wait until
elvpriv goes to zero; then, we attach the new iosched and clear
QUEUE_FLAG_BYPASS.  The new implementation is much simpler and the main
code paths are less cluttered, IMHO.
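
 Condensed, the switch sequence in elevator_switch() below looks roughly
like this (error handling and sysfs (un)registration omitted):

	spin_lock_irq(q->queue_lock);
	set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);

	/* push whatever the old iosched still holds onto the dispatch queue */
	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
		;

	/* wait until no allocated request carries elevator private data */
	while (q->rq.elvpriv) {
		spin_unlock_irq(q->queue_lock);
		msleep(100);
		spin_lock_irq(q->queue_lock);
	}
	spin_unlock_irq(q->queue_lock);

	/* safe to attach the new elevator and tear down the old one */
	elevator_attach(q, new_e, e);		/* e == newly allocated elevator_t */
	elevator_exit(old_elevator);		/* old_elevator == previous q->elevator */
	clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);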

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
drivers/block/elevator.c
drivers/block/ll_rw_blk.c
include/linux/blkdev.h

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index af2388e73f614343a2ec6540085591f49084b0ee..272d939466211a73e0f97ef5f24b5a60e6d5586c 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
+#include <linux/delay.h>
 
 #include <asm/uaccess.h>
 
@@ -131,11 +132,7 @@ static int elevator_attach(request_queue_t *q, struct elevator_type *e,
        eq->ops = &e->ops;
        eq->elevator_type = e;
 
-       INIT_LIST_HEAD(&q->queue_head);
-       q->last_merge = NULL;
        q->elevator = eq;
-       q->end_sector = 0;
-       q->boundary_rq = NULL;
 
        if (eq->ops->elevator_init_fn)
                ret = eq->ops->elevator_init_fn(q, eq);
@@ -184,6 +181,12 @@ int elevator_init(request_queue_t *q, char *name)
        struct elevator_queue *eq;
        int ret = 0;
 
+       INIT_LIST_HEAD(&q->queue_head);
+       q->last_merge = NULL;
+       q->end_sector = 0;
+       q->boundary_rq = NULL;
+       q->max_back_kb = 0;
+
        elevator_setup_default();
 
        if (!name)
@@ -336,23 +339,14 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
                        q->end_sector = rq_end_sector(rq);
                        q->boundary_rq = rq;
                }
-       }
+       } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
+               where = ELEVATOR_INSERT_BACK;
 
        if (plug)
                blk_plug_device(q);
 
        rq->q = q;
 
-       if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-               /*
-                * if drain is set, store the request "locally". when the drain
-                * is finished, the requests will be handed ordered to the io
-                * scheduler
-                */
-               list_add_tail(&rq->queuelist, &q->drain_list);
-               return;
-       }
-
        switch (where) {
        case ELEVATOR_INSERT_FRONT:
                rq->flags |= REQ_SOFTBARRIER;
@@ -659,25 +653,36 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
  * we don't free the old io scheduler, before we have allocated what we
  * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason. we also do an intermediate
- * switch to noop to ensure safety with stack-allocated requests, since they
- * don't originate from the block layer allocator. noop is safe here, because
- * it never needs to touch the elevator itself for completion events. DRAIN
- * flags will make sure we don't touch it for additions either.
+ * one, if the new one fails init for some reason.
  */
 static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
-       elevator_t *e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
-       struct elevator_type *noop_elevator = NULL;
-       elevator_t *old_elevator;
+       elevator_t *old_elevator, *e;
 
+       /*
+        * Allocate new elevator
+        */
+       e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
        if (!e)
                goto error;
 
        /*
-        * first step, drain requests from the block freelist
+        * Turn on BYPASS and drain all requests w/ elevator private data
         */
-       blk_wait_queue_drained(q, 0);
+       spin_lock_irq(q->queue_lock);
+
+       set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+       while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+               ;
+
+       while (q->rq.elvpriv) {
+               spin_unlock_irq(q->queue_lock);
+               msleep(100);
+               spin_lock_irq(q->queue_lock);
+       }
+
+       spin_unlock_irq(q->queue_lock);
 
        /*
         * unregister old elevator data
@@ -685,18 +690,6 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
        elv_unregister_queue(q);
        old_elevator = q->elevator;
 
-       /*
-        * next step, switch to noop since it uses no private rq structures
-        * and doesn't allocate any memory for anything. then wait for any
-        * non-fs requests in-flight
-        */
-       noop_elevator = elevator_get("noop");
-       spin_lock_irq(q->queue_lock);
-       elevator_attach(q, noop_elevator, e);
-       spin_unlock_irq(q->queue_lock);
-
-       blk_wait_queue_drained(q, 1);
-
        /*
         * attach and start new elevator
         */
@@ -707,11 +700,10 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
                goto fail_register;
 
        /*
-        * finally exit old elevator and start queue again
+        * finally exit old elevator and turn off BYPASS.
         */
        elevator_exit(old_elevator);
-       blk_finish_queue_drain(q);
-       elevator_put(noop_elevator);
+       clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
        return;
 
 fail_register:
@@ -720,13 +712,13 @@ fail_register:
         * one again (along with re-adding the sysfs dir)
         */
        elevator_exit(e);
+       e = NULL;
 fail:
        q->elevator = old_elevator;
        elv_register_queue(q);
-       blk_finish_queue_drain(q);
+       clear_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+       kfree(e);
 error:
-       if (noop_elevator)
-               elevator_put(noop_elevator);
        elevator_put(new_e);
        printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
 }
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index d2a66fd309c3df6c323dbb00156450679e623227..f7c9931cb3803e64590d6998e9645ffff153b6e2 100644 (file)
@@ -263,8 +263,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
        blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
        blk_queue_activity_fn(q, NULL, NULL);
-
-       INIT_LIST_HEAD(&q->drain_list);
 }
 
 EXPORT_SYMBOL(blk_queue_make_request);
@@ -1050,6 +1048,7 @@ static char *rq_flags[] = {
        "REQ_STARTED",
        "REQ_DONTPREP",
        "REQ_QUEUED",
+       "REQ_ELVPRIV",
        "REQ_PC",
        "REQ_BLOCK_PC",
        "REQ_SENSE",
@@ -1640,9 +1639,9 @@ static int blk_init_free_list(request_queue_t *q)
 
        rl->count[READ] = rl->count[WRITE] = 0;
        rl->starved[READ] = rl->starved[WRITE] = 0;
+       rl->elvpriv = 0;
        init_waitqueue_head(&rl->wait[READ]);
        init_waitqueue_head(&rl->wait[WRITE]);
-       init_waitqueue_head(&rl->drain);
 
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
                                mempool_free_slab, request_cachep, q->node);
@@ -1785,12 +1784,14 @@ EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(request_queue_t *q, struct request *rq)
 {
-       elv_put_request(q, rq);
+       if (rq->flags & REQ_ELVPRIV)
+               elv_put_request(q, rq);
        mempool_free(rq, q->rq.rq_pool);
 }
 
 static inline struct request *
-blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
+                 int priv, int gfp_mask)
 {
        struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -1803,11 +1804,15 @@ blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
         */
        rq->flags = rw;
 
-       if (!elv_set_request(q, rq, bio, gfp_mask))
-               return rq;
+       if (priv) {
+               if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+                       mempool_free(rq, q->rq.rq_pool);
+                       return NULL;
+               }
+               rq->flags |= REQ_ELVPRIV;
+       }
 
-       mempool_free(rq, q->rq.rq_pool);
-       return NULL;
+       return rq;
 }
 
 /*
@@ -1863,22 +1868,18 @@ static void __freed_request(request_queue_t *q, int rw)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(request_queue_t *q, int rw)
+static void freed_request(request_queue_t *q, int rw, int priv)
 {
        struct request_list *rl = &q->rq;
 
        rl->count[rw]--;
+       if (priv)
+               rl->elvpriv--;
 
        __freed_request(q, rw);
 
        if (unlikely(rl->starved[rw ^ 1]))
                __freed_request(q, rw ^ 1);
-
-       if (!rl->count[READ] && !rl->count[WRITE]) {
-               smp_mb();
-               if (unlikely(waitqueue_active(&rl->drain)))
-                       wake_up(&rl->drain);
-       }
 }
 
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
@@ -1893,9 +1894,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
        struct request *rq = NULL;
        struct request_list *rl = &q->rq;
        struct io_context *ioc = current_io_context(GFP_ATOMIC);
-
-       if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
-               goto out;
+       int priv;
 
        if (rl->count[rw]+1 >= q->nr_requests) {
                /*
@@ -1940,9 +1939,14 @@ get_rq:
        rl->starved[rw] = 0;
        if (rl->count[rw] >= queue_congestion_on_threshold(q))
                set_queue_congested(q, rw);
+
+       priv = !test_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+       if (priv)
+               rl->elvpriv++;
+
        spin_unlock_irq(q->queue_lock);
 
-       rq = blk_alloc_request(q, rw, bio, gfp_mask);
+       rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
        if (!rq) {
                /*
                 * Allocation failed presumably due to memory. Undo anything
@@ -1952,7 +1956,7 @@ get_rq:
                 * wait queue, but this is pretty rare.
                 */
                spin_lock_irq(q->queue_lock);
-               freed_request(q, rw);
+               freed_request(q, rw, priv);
 
                /*
                 * in the very unlikely event that allocation failed and no
@@ -2470,11 +2474,12 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
         */
        if (rl) {
                int rw = rq_data_dir(req);
+               int priv = req->flags & REQ_ELVPRIV;
 
                BUG_ON(!list_empty(&req->queuelist));
 
                blk_free_request(q, req);
-               freed_request(q, rw);
+               freed_request(q, rw, priv);
        }
 }
 
@@ -2802,97 +2807,6 @@ static inline void blk_partition_remap(struct bio *bio)
        }
 }
 
-void blk_finish_queue_drain(request_queue_t *q)
-{
-       struct request_list *rl = &q->rq;
-       struct request *rq;
-       int requeued = 0;
-
-       spin_lock_irq(q->queue_lock);
-       clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
-       while (!list_empty(&q->drain_list)) {
-               rq = list_entry_rq(q->drain_list.next);
-
-               list_del_init(&rq->queuelist);
-               elv_requeue_request(q, rq);
-               requeued++;
-       }
-
-       if (requeued)
-               q->request_fn(q);
-
-       spin_unlock_irq(q->queue_lock);
-
-       wake_up(&rl->wait[0]);
-       wake_up(&rl->wait[1]);
-       wake_up(&rl->drain);
-}
-
-static int wait_drain(request_queue_t *q, struct request_list *rl, int dispatch)
-{
-       int wait = rl->count[READ] + rl->count[WRITE];
-
-       if (dispatch)
-               wait += !list_empty(&q->queue_head);
-
-       return wait;
-}
-
-/*
- * We rely on the fact that only requests allocated through blk_alloc_request()
- * have io scheduler private data structures associated with them. Any other
- * type of request (allocated on stack or through kmalloc()) should not go
- * to the io scheduler core, but be attached to the queue head instead.
- */
-void blk_wait_queue_drained(request_queue_t *q, int wait_dispatch)
-{
-       struct request_list *rl = &q->rq;
-       DEFINE_WAIT(wait);
-
-       spin_lock_irq(q->queue_lock);
-       set_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
-
-       while (wait_drain(q, rl, wait_dispatch)) {
-               prepare_to_wait(&rl->drain, &wait, TASK_UNINTERRUPTIBLE);
-
-               if (wait_drain(q, rl, wait_dispatch)) {
-                       __generic_unplug_device(q);
-                       spin_unlock_irq(q->queue_lock);
-                       io_schedule();
-                       spin_lock_irq(q->queue_lock);
-               }
-
-               finish_wait(&rl->drain, &wait);
-       }
-
-       spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * block waiting for the io scheduler being started again.
- */
-static inline void block_wait_queue_running(request_queue_t *q)
-{
-       DEFINE_WAIT(wait);
-
-       while (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))) {
-               struct request_list *rl = &q->rq;
-
-               prepare_to_wait_exclusive(&rl->drain, &wait,
-                               TASK_UNINTERRUPTIBLE);
-
-               /*
-                * re-check the condition. avoids using prepare_to_wait()
-                * in the fast path (queue is running)
-                */
-               if (test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags))
-                       io_schedule();
-
-               finish_wait(&rl->drain, &wait);
-       }
-}
-
 static void handle_bad_sector(struct bio *bio)
 {
        char b[BDEVNAME_SIZE];
@@ -2988,8 +2902,6 @@ end_io:
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;
 
-               block_wait_queue_running(q);
-
                /*
                 * If this device has partitions, remap block n
                 * of partition p to block n+start(p) of the disk.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 159dbcd2eb5915e461ccdef889f5b00ef791cfd9..6186d5e2110fed06a5315a3692b75fdf21504b02 100644 (file)
@@ -107,9 +107,9 @@ typedef void (rq_end_io_fn)(struct request *);
 struct request_list {
        int count[2];
        int starved[2];
+       int elvpriv;
        mempool_t *rq_pool;
        wait_queue_head_t wait[2];
-       wait_queue_head_t drain;
 };
 
 #define BLK_MAX_CDB    16
@@ -211,6 +211,7 @@ enum rq_flag_bits {
        __REQ_STARTED,          /* drive already may have started this one */
        __REQ_DONTPREP,         /* don't call prep for this one */
        __REQ_QUEUED,           /* uses queueing */
+       __REQ_ELVPRIV,          /* elevator private data attached */
        /*
         * for ATA/ATAPI devices
         */
@@ -244,6 +245,7 @@ enum rq_flag_bits {
 #define REQ_STARTED    (1 << __REQ_STARTED)
 #define REQ_DONTPREP   (1 << __REQ_DONTPREP)
 #define REQ_QUEUED     (1 << __REQ_QUEUED)
+#define REQ_ELVPRIV    (1 << __REQ_ELVPRIV)
 #define REQ_PC         (1 << __REQ_PC)
 #define REQ_BLOCK_PC   (1 << __REQ_BLOCK_PC)
 #define REQ_SENSE      (1 << __REQ_SENSE)
@@ -413,8 +415,6 @@ struct request_queue
        unsigned int            sg_reserved_size;
        int                     node;
 
-       struct list_head        drain_list;
-
        /*
         * reserved for flush operations
         */
@@ -442,7 +442,7 @@ enum {
 #define QUEUE_FLAG_DEAD                5       /* queue being torn down */
 #define QUEUE_FLAG_REENTER     6       /* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED     7       /* queue is plugged */
-#define QUEUE_FLAG_DRAIN       8       /* draining queue for sched switch */
+#define QUEUE_FLAG_BYPASS      8       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_FLUSH       9       /* doing barrier flush sequence */
 
 #define blk_queue_plugged(q)   test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -668,8 +668,6 @@ extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(request_queue_t *);
 extern void __generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
-extern void blk_wait_queue_drained(request_queue_t *, int);
-extern void blk_finish_queue_drain(request_queue_t *);
 
 int blk_get_queue(request_queue_t *);
 request_queue_t *blk_alloc_queue(int gfp_mask);