sys_read()                                          [fs/read_write.c, mm/filemap.c,
                                                     include/linux/fs.h, fs/ext2/file.c]
    fget_light()
    vfs_read()
        file->f_op->read()  ->  do_sync_read()
            filp->f_op->aio_read()  ->  generic_file_aio_read()
                generic_file_direct_IO()        (O_DIRECT path)
                do_generic_file_read()
                    do_generic_mapping_read()
                        |
                        v
                      Page Cache
                      Generic Block Layer
                      Elevator / I/O Scheduler
                      Request Queue
                      Device Driver                                          (slide 1)
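
For orientation, slide 1 is simply what happens underneath an ordinary read()
system call.  A minimal user-space program that drives this entire path (the
file name below is only an example) could look like this:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/mnt/ext2/file.dat", O_RDONLY);  /* example path */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* Each read() enters the kernel through sys_read(); on a page-cache
             * miss it travels all the way down to the device driver. */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    ;       /* consume buf[0..n) here */

            close(fd);
            return 0;
    }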
do_generic_mapping_read()                                              mm/filemap.c

    find page:    look the page up in the page cache
                      no cached page  ->  start page cache readahead, then
                                          read the page in
                      cached page     ->  check it

    readpage:     page not up to date  ->  call readpage()
                      readpage error   ->  out
                      otherwise the page becomes up to date

    page ok:      copy the data to the caller and continue with the next page

    out:          done                                                     (slide 2)
do_generic_mapping_read()                          [mm/readahead.c, mm/filemap.c,
                                                    fs/ext2/inode.c, fs/mpage.c]
    page_cache_readahead()
        blockable_page_cache_readahead()
            __do_page_cache_readahead()
                read_pages()
                    mapping->a_ops->readpages()           mapping->a_ops->readpage()
                    ext2_readpages()  [ext2_get_block()]  ext2_readpage()
                        mpage_readpages()                     mpage_readpage()
                                   do_mpage_readpage()
                                       mpage_bio_submit()
                                           submit_bio()

                                           Generic Block Layer
                                           Elevator / I/O Scheduler
                                           Request Queue
                                           Device Driver                   (slide 3)
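
Both branches converge on do_mpage_readpage() and mpage_bio_submit().  As a
rough sketch of how the single-page branch ties them together (based on
fs/mpage.c of this kernel generation; treat the exact signature as an
approximation, it changed between versions):

    int mpage_readpage(struct page *page, get_block_t get_block)
    {
            struct bio *bio = NULL;
            sector_t last_block_in_bio = 0;

            /* map the page's blocks and accumulate them into a bio */
            bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, get_block);
            if (bio)
                    mpage_bio_submit(READ, bio);   /* hands the bio to submit_bio() */
            return 0;
    }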
submit_bio()                                    [mm/mpage.c, include/linux/blkdev.h,
                                                 block/elevator.c, block/ll_rw_blk.c,
                                                 block/as-iosched.c]
    generic_make_request(bio)                                       BLK_TA_QUEUE
        q->make_request_fn(q, bio)
            __make_request(q, bio)
                elv_merge(q, req, bio)

                merge into an existing request:     allocate a new request:
                    q->back_merge_fn()                  init_request_from_bio()
                    ll_merge_requests_fn()              get_request_wait()   BLK_TA_SLEEPRQ
                    elv_merged_request()                    elv_may_queue()  BLK_TA_GETRQ
                        e->ops->elevator_merged_fn()        get_request()
                    BLK_TA_MERGE                                current_io_context()
                                                                blk_alloc_request()
                                                        add_request()
                                                            __elv_add_request()
                                                                elv_insert()     BLK_TA_INSERT
                                                                    e->ops->elevator_add_req_fn(q, rq)

                                       I/O Scheduler
                                       Request Queue
                                       Device Driver                       (slide 4)
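
As a hedged sketch (hypothetical name, details omitted) of what the top of this
chain does: generic_make_request() resolves the queue behind the bio's block
device and hands the bio to that queue's make_request_fn, which for an ordinary
disk queue is __make_request():

    /* Sketch of the core of generic_make_request() (block/ll_rw_blk.c). */
    static void sketch_generic_make_request(struct bio *bio)
    {
            request_queue_t *q;

            do {
                    q = bdev_get_queue(bio->bi_bdev);  /* queue of the target device */
                    /* blktrace records BLK_TA_QUEUE around here */
            } while (q->make_request_fn(q, bio));      /* stacking drivers may remap
                                                          the bio and ask for another pass */
    }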
ide_generic_init()                                [drivers/ide/ide-generic.c,
                                                   drivers/ide/ide-probe.c,
                                                   drivers/ide/ide-io.c,
                                                   block/ll_rw_blk.c]
    ideprobe_init()
        hwif_init()
            init_irq()
                request_irq(hwif->irq, &ide_intr, ...)    -- registers ide_intr()
                                                             as the IRQ handler
            ide_init_queue()
                blk_init_queue_node(do_ide_request, ...)
                    q->request_fn = rfn                   -- registers do_ide_request()
                                                             as the I/O request dispatcher

    do_ide_request() and ide_intr() both lead into ide_do_request()         (slide 5)
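
The request_fn registered above is only a thin wrapper.  A hedged sketch
(hypothetical name; the real code lives in drivers/ide/ide-io.c) of what
do_ide_request() does:

    /* Sketch: enter the IDE state machine for the drive behind this queue. */
    static void sketch_do_ide_request(request_queue_t *q)
    {
            ide_drive_t *drive = q->queuedata;

            ide_do_request(HWGROUP(drive), IDE_NO_IRQ);  /* same path the IRQ
                                                            handler ends up in */
    }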
Disk request queue without an I/O scheduler       [include/linux/blkdev.h,
                                                   block/elevator.c,
                                                   block/ll_rw_blk.c,
                                                   block/as-iosched.c]
enqueue:
    q->make_request_fn()
        add_request()                                               BLK_TA_PLUG
            __elv_add_request()
                elv_insert()                                        BLK_TA_INSERT
                    list_add_tail()        -- straight onto the request queue
                    q->request_fn()

dequeue:
    ide_do_request()
        elv_next_request()
            rq = __elv_next_request(q)
        start_request()                                             BLK_TA_ISSUE(D)
            -> Disk  (an interrupt signals completion)                     (slide 6)
Disk request queue with an I/O scheduler          [include/linux/blkdev.h,
                                                   block/elevator.c,
                                                   block/ll_rw_blk.c,
                                                   block/as-iosched.c]
enqueue:
    q->make_request_fn()
        add_request()                                               BLK_TA_PLUG
            __elv_add_request()
                elv_insert()                                        BLK_TA_INSERT
                    e->ops->elevator_add_req_fn(q, rq)  -- into the scheduler's
                                                           private queue
                    q->request_fn()

dequeue:
    ide_do_request()
        elv_next_request()
            rq = __elv_next_request(q)
                e->ops->elevator_dispatch_fn()  -- the scheduler picks the next request
        start_request()                                             BLK_TA_ISSUE(D)
            -> Disk  (an interrupt signals completion)                     (slide 7)
ide_do_request()                                  [drivers/ide/ide-io.c,
                                                   block/elevator.c,
                                                   block/as-iosched.c,
                                                   block/ll_rw_blk.c]
    rq = elv_next_request(drive->queue)
        rq = __elv_next_request(q)
            q->elevator->ops->elevator_dispatch_fn(q, 0)
    start_request(drive, rq)                                        BLK_TA_ISSUE
        -> Disk

static inline struct request *__elv_next_request(request_queue_t *q)
{
        struct request *rq;

        while (1) {
                while (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
                        if (blk_do_ordered(q, &rq))
                                return rq;
                }
                /* dispatch queue empty: ask the elevator to refill it */
                if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
                        return NULL;
        }
}                                                                          (slide 8)
static void *noop_init_queue(request_queue_t *q, elevator_t *e)      block/noop-iosched.c
{
        struct noop_data *nd;           /* the elevator's private data structure */

        nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
        if (!nd)
                return NULL;
        INIT_LIST_HEAD(&nd->queue);
        return nd;
}

static void noop_add_request(request_queue_t *q, struct request *rq)
{
        struct noop_data *nd = q->elevator->elevator_data;

        /* FIFO: new requests go straight to the tail of the private list */
        list_add_tail(&rq->queuelist, &nd->queue);
}

static int noop_dispatch(request_queue_t *q, int force)
{
        struct noop_data *nd = q->elevator->elevator_data;

        if (!list_empty(&nd->queue)) {
                struct request *rq;
                rq = list_entry(nd->queue.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                elv_dispatch_sort(q, rq);  /* move it to the dispatch queue */
                return 1;
        }
        return 0;
}                                                                          (slide 9)
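
These callbacks only become an I/O scheduler once they are wired into an
elevator_type and registered.  Roughly (a sketch of how block/noop-iosched.c of
this era ties them together; field names from memory, treat as an assumption):

    static struct elevator_type elevator_noop = {
            .ops = {
                    .elevator_dispatch_fn   = noop_dispatch,
                    .elevator_add_req_fn    = noop_add_request,
                    .elevator_init_fn       = noop_init_queue,
            },
            .elevator_name  = "noop",
            .elevator_owner = THIS_MODULE,
    };

    static int __init noop_init(void)
    {
            return elv_register(&elevator_noop);  /* makes "noop" selectable per queue */
    }
    module_init(noop_init);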
The runtime data of the deadline I/O scheduler                   block/deadline-iosched.c

struct deadline_data {

     /* requests (deadline_rq s) are present on both sort_list and fifo_list       */
     struct rb_root sort_list[2];
     struct list_head fifo_list[2];

     /* next in sort order. read, write or both are NULL */
     struct request *next_rq[2];
     unsigned int batching;        /* number of sequential requests made */
     sector_t last_sector;         /* head position */
     unsigned int starved;         /* times reads have starved writes */

     /* settings that change how the i/o scheduler behaves */
     int fifo_expire[2];
     int fifo_batch;
     int writes_starved;
     int front_merges;
};

 Example (slide 10): three reads (4, 5, 6) and three writes (7, 8, 9) are queued.
 Each request is linked into two structures for its direction: the FIFO list
 (fifo_list[READ] / fifo_list[WRITE], arrival order, used for deadline expiry)
 and the red-black tree (sort_list[READ] / sort_list[WRITE], sector order, used
 for one-way elevator scanning).  next_rq[READ] and next_rq[WRITE] cache the
 next request to serve in sector order.
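
For reference, the defaults these tunables receive at init time are, to the
best of my recollection (treat the exact numbers as assumptions):

    /* Default deadline tunables (block/deadline-iosched.c); read_expire and
     * write_expire seed fifo_expire[READ] and fifo_expire[WRITE]. */
    static const int read_expire = HZ / 2;    /* a read may wait ~500 ms        */
    static const int write_expire = 5 * HZ;   /* a write may wait ~5 s          */
    static const int writes_starved = 2;      /* read batches per write batch   */
    static const int fifo_batch = 16;         /* requests per sequential batch  */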
block/deadline-iosched.c

Add a request to both rb tree and fifo list

static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
         struct deadline_data *dd = q->elevator->elevator_data;
         const int data_dir = rq_data_dir(rq);

          deadline_add_rq_rb(dd, rq);

          /*
           * set expire time (only used for reads) and add to fifo list
           */
          rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
          list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}


 (Same picture as slide 10: after deadline_add_request() the new request sits on
 both the FIFO list and the sort tree for its direction.)                (slide 11)
                                                            block/deadline-iosched.c
(Example state as on slide 10.)

The dispatch side, deadline_dispatch_requests(), proceeds in three steps.

1. Check whether we are running a sequential batch and whether it is still
   entitled to continue:
             if (dd->next_rq[WRITE])
                      rq = dd->next_rq[WRITE];
             else
                      rq = dd->next_rq[READ];
             if (rq) {
                      /* we have a "next request" */

                       if (dd->last_sector != rq->sector)
                                /* end the batch on a non sequential request */
                                dd->batching += dd->fifo_batch;

                       if (dd->batching < dd->fifo_batch)
                                /* we are still entitled to batch */
                                goto dispatch_request;
             }                                                                             12
                                                            block/deadline-iosched.c
(Example state as on slide 10.)

2. If we are not running a batch, choose a new direction to serve requests.
   Reads are always favoured, unless writes have been starved for too long:
            if (reads) {
                     if (writes && (dd->starved++ >= dd->writes_starved))
                              goto dispatch_writes;

                      data_dir = READ;
                      goto dispatch_find_request;
            }

            if (writes) {
   dispatch_writes:
                     dd->starved = 0;
                     data_dir = WRITE;
                     goto dispatch_find_request;
            }
                                                                                     13
                                                            block/deadline-iosched.c
(Example state as on slide 10.)

3. Choose an appropriate request: if the first request on the FIFO list has
   expired, serve it; otherwise behave as a one-way elevator:

  dispatch_find_request:

             if (deadline_check_fifo(dd, data_dir)) {
                      dd->batching = 0;
                      rq = rq_entry_fifo(dd->fifo_list[data_dir].next);

             } else if (dd->next_rq[data_dir]) {

                        rq = dd->next_rq[data_dir];
             } else {
                        struct rb_node *node;
                        dd->batching = 0;
                        node = rb_first(&dd->sort_list[data_dir]);
                        if (node)
                                 rq = rb_entry_rq(node);
             }                                                                              14
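
All three steps funnel into the dispatch_request label.  Its tail is short
(paraphrased from the same function; the heavy lifting is deadline_move_request()
on the next slide):

    dispatch_request:
            /* rq has been chosen: account it against the current batch and
             * move it from the elevator's private queues to the dispatch queue */
            dd->batching++;
            deadline_move_request(dd, rq);
            return 1;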
                                                            block/deadline-iosched.c
Dispatch the request: remove it from the elevator's private queues and put it on
the dispatch queue.  Also update the information about the "last" and the "next"
request.

(Example: suppose request 4 was picked in the previous step.  It leaves
 fifo_list[READ] and sort_list[READ], and next_rq[READ] is advanced to the
 request that follows it in sector order.)
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
         const int data_dir = rq_data_dir(rq);
         struct rb_node *rbnext = rb_next(&rq->rb_node);

             dd->next_rq[READ] = NULL;
             dd->next_rq[WRITE] = NULL;

             if (rbnext)
                      dd->next_rq[data_dir] = rb_entry_rq(rbnext);

             dd->last_sector = rq->sector + rq->nr_sectors;

             deadline_move_to_dispatch(dd, rq);
}                                                                                           15
block/as-iosched.c

static void as_add_request(request_queue_t *q, struct request *rq)
{
         struct as_data *ad = q->elevator->elevator_data;
         int data_dir;

        RQ_SET_STATE(rq, AS_RQ_NEW);

        data_dir = rq_is_sync(rq);

        rq->elevator_private = as_get_io_context(q->node);

        if (RQ_IOC(rq)) {
                 as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
                 atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
        }

        as_add_rq_rb(ad, rq);

        /*
         * set expire time (only used for reads) and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);

        as_update_rq(ad, rq); /* keep state machine up to date */
        RQ_SET_STATE(rq, AS_RQ_QUEUED);
}
                                                                              16
as_add_request()                         [include/linux/list.h, include/linux/elevator.h,
                                          block/ll_rw_blk.c, block/as-iosched.c]
    as_get_io_context(q->node)
        get_io_context()
            current_io_context()
        alloc_as_io_context()
    as_update_iohist()
        as_update_thinktime()
        as_update_seekdist()
    as_add_rq_rb()
    rq_set_fifo_time()
    list_add_tail()
    as_update_rq(ad, rq)
        as_choose_req()
        as_can_break_anticipation()
            as_update_iohist()
        as_antic_stop()
            del_timer()
            kblockd_schedule_work()

Per-process bookkeeping: each task_struct points to an io_context, which points
to an as_io_context; the requests issued by that task are linked to it.
                                                                         (slide 17)
(Slide 18 repeats the call graph of slide 17.)
static void
as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
{
    ...
    if (data_dir == REQ_SYNC) {
        unsigned long in_flight = atomic_read(&aic->nr_queued)
                                  + atomic_read(&aic->nr_dispatched);
        spin_lock(&aic->lock);
        if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
             test_bit(AS_TASK_IOSTARTED, &aic->state)) {
             /* Calculate read -> read thinktime */
             if (test_bit(AS_TASK_IORUNNING, &aic->state) && in_flight == 0) {
                 thinktime = jiffies - aic->last_end_request;
                 thinktime = min(thinktime, MAX_THINKTIME-1);
             }
             as_update_thinktime(ad, aic, thinktime);

            /* Calculate read -> read seek distance */
            if (aic->last_request_pos < rq->sector)
                 seek_dist = rq->sector - aic->last_request_pos;
            else
                 seek_dist = aic->last_request_pos - rq->sector;
            as_update_seekdist(ad, aic, seek_dist);
        }
        aic->last_request_pos = rq->sector + rq->nr_sectors;
        set_bit(AS_TASK_IOSTARTED, &aic->state);
        spin_unlock(&aic->lock);
    }                                                                       19
}
(Slide 20 repeats the call graph of slide 17.)
enum anticipation_status {
     ANTIC_OFF = 0,      /* Not anticipating (normal operation)                  */
     ANTIC_WAIT_REQ,     /* The last read has not yet completed                  */
     ANTIC_WAIT_NEXT,    /* Currently anticipating a request vs last read
                            (which has completed)                                */
     ANTIC_FINISHED,     /* Anticipating but have found a candidate or timed out */
};

State machine (slide 21): the anticipatory scheduler moves between ANTIC_OFF,
ANTIC_WAIT_REQ, ANTIC_WAIT_NEXT and ANTIC_FINISHED.

    as_dispatch_request() -> as_move_to_dispatch() -> as_antic_waitreq()   enters ANTIC_WAIT_REQ
    as_completed_request() -> as_antic_waitnext()                          enters ANTIC_WAIT_NEXT
    as_add_request() -> as_add_rq_rb() / as_update_rq() -> as_antic_stop(),
    or as_antic_timeout() firing,                                          end the wait in ANTIC_FINISHED
    (both by scheduling kblockd_schedule_work(), which restarts the queue's request_fn)
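
To make the two wait states concrete, a hedged sketch of how they are typically
entered (hypothetical helper names; the as_data field names antic_start and
antic_expire are from memory and should be treated as assumptions):

    /* Not the verbatim kernel functions. */
    static void sketch_antic_waitreq(struct as_data *ad)
    {
            /* the last read is still in flight: just note that we intend to wait */
            if (ad->antic_status == ANTIC_OFF)
                    ad->antic_status = ANTIC_WAIT_REQ;
    }

    static void sketch_antic_waitnext(struct as_data *ad)
    {
            /* the last read has completed: arm the anticipation timer so that
             * as_antic_timeout() can end the wait if nothing arrives in time */
            mod_timer(&ad->antic_timer, ad->antic_start + ad->antic_expire);
            ad->antic_status = ANTIC_WAIT_NEXT;
    }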
/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
         int status = ad->antic_status;

        if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
                 if (status == ANTIC_WAIT_NEXT)
                          del_timer(&ad->antic_timer);
                 ad->antic_status = ANTIC_FINISHED;
                 /* see as_work_handler */
                 kblockd_schedule_work(&ad->antic_work);
        }
}




                                                                            22
/*
 * as_update_rq must be called whenever a request (rq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_rq(struct as_data *ad, struct request *rq)
{
         const int data_dir = rq_is_sync(rq);

        /* keep the next_rq cache up to date */
        ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);

        /*
         * have we been anticipating this request?
         * or does it come from the same process as the one we are anticipating
         * for?
         */
        if (ad->antic_status == ANTIC_WAIT_REQ
                          || ad->antic_status == ANTIC_WAIT_NEXT) {
                 if (as_can_break_anticipation(ad, rq))
                          as_antic_stop(ad);
        }
}




                                                                             23
/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(void *data)
{
         struct request_queue *q = data;
         unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_start_queueing(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}




                                                                                 24
/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
         struct request_queue *q = (struct request_queue *)data;
         struct as_data *ad = q->elevator->elevator_data;
         unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        if (ad->antic_status == ANTIC_WAIT_REQ
                          || ad->antic_status == ANTIC_WAIT_NEXT) {
                 struct as_io_context *aic = ad->io_context->aic;

                 ad->antic_status = ANTIC_FINISHED;
                 kblockd_schedule_work(&ad->antic_work);

                 if (aic->ttime_samples == 0) {
                          /* process anticipated on has exited or timed out*/
                          ad->exit_prob = (7*ad->exit_prob + 256)/8;
                 }
                 if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
                          /* process not "saved" by a cooperating request */
                          ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
                 }
        }
        spin_unlock_irqrestore(q->queue_lock, flags);
}
                                                                            25
static void as_put_io_context(struct request *rq)
{
             struct as_io_context *aic;

            if (unlikely(!RQ_IOC(rq)))
                          return;

            aic = RQ_IOC(rq)->aic;

            if (rq_is_sync(rq) && aic) {
                         spin_lock(&aic->lock);
                         set_bit(AS_TASK_IORUNNING, &aic->state);
                         aic->last_end_request = jiffies;
                         spin_unlock(&aic->lock);
            }

            put_io_context(RQ_IOC(rq));
}




                                                                    26
Overview of the remaining anticipatory-scheduler entry points and their helpers
(slide 27):

    as_update_rq()
        as_choose_req()
        as_can_break_anticipation()
        as_antic_stop()
        (helpers: as_update_iohist() -> as_update_thinktime(), as_update_seekdist())

    as_can_anticipate()
        as_can_break_anticipation()  ->  as_close_req(), as_update_iohist()
        as_antic_expired()

    as_completed_request()
        kblockd_schedule_work()
        update_write_batch()
        as_antic_waitnext()
        as_put_io_context()

    as_dispatch_request()
        as_fifo_expired(), as_batch_expired()
        as_find_next_rq()
        as_antic_waitreq()
        as_move_to_dispatch()
            as_antic_stop()
            copy_io_context() / put_io_context()
            as_remove_queued_request()
            elv_dispatch_sort()
        as_get_io_context()  ->  get_io_context(), alloc_as_io_context()
genhd_device_init()
    blk_dev_init()
        create_workqueue("kblockd")   -- creates the kblockd workqueue used by
                                         kblockd_schedule_work()
                                                                          (slide 28)
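
kblockd_schedule_work(), used throughout the anticipatory scheduler above, is a
thin wrapper around that workqueue.  A hedged sketch (block/ll_rw_blk.c):

    static struct workqueue_struct *kblockd_workqueue;  /* created in blk_dev_init() */

    int kblockd_schedule_work(struct work_struct *work)
    {
            return queue_work(kblockd_workqueue, work);
    }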
void blk_start_queueing(request_queue_t *q)
{
        if (!blk_queue_plugged(q))
                q->request_fn(q);
        else
                __generic_unplug_device(q);
}

void __generic_unplug_device(request_queue_t *q)
{
        if (unlikely(blk_queue_stopped(q)))
                return;

        if (!blk_remove_plug(q))
                return;

        q->request_fn(q);
}

int blk_remove_plug(request_queue_t *q)
{
        WARN_ON(!irqs_disabled());

        if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
                return 0;

        del_timer(&q->unplug_timer);
        return 1;
}

Flow: blk_start_queueing() -> blk_queue_plugged()?
          not plugged -> q->request_fn()
          plugged     -> __generic_unplug_device() -> blk_remove_plug() -> del_timer()
                                                   -> q->request_fn()       (slide 29)
block/ll_rw_blk.c
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
                      q->nr_requests = BLKDEV_MAX_RQ;
           blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
           blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
           q->make_request_fn = mfn;
           q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
           q->backing_dev_info.state = 0;
           q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
           blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
           blk_queue_hardsect_size(q, 512);
           blk_queue_dma_alignment(q, 511);
           blk_queue_congestion_threshold(q);
           q->nr_batching = BLK_BATCH_REQ;

         q->unplug_thresh = 4;              /* hmm */
         q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
         if (q->unplug_delay == 0)
                     q->unplug_delay = 1;

         INIT_WORK(&q->unplug_work, blk_unplug_work, q);

         q->unplug_timer.function = blk_unplug_timeout;
         q->unplug_timer.data = (unsigned long)q;

         blk_queue_activity_fn(q, NULL, NULL);
}                                                                                        30
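
The unplug hooks installed above are equally small.  A hedged sketch of the two
helpers referenced by INIT_WORK() and q->unplug_timer (block/ll_rw_blk.c of the
same era; treat the details as an approximation):

    /* the work handler simply calls the queue's unplug_fn (generic_unplug_device) */
    static void blk_unplug_work(void *data)
    {
            request_queue_t *q = data;

            q->unplug_fn(q);
    }

    /* the timer only defers the real unplug to kblockd */
    static void blk_unplug_timeout(unsigned long data)
    {
            request_queue_t *q = (request_queue_t *)data;

            kblockd_schedule_work(&q->unplug_work);
    }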
include/linux/blkdev.h
request_queue_t *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
         request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

        if (!q)
                  return NULL;

        q->node = node_id;
        blk_init_free_list(q);
        q->request_fn             =   rfn;
        q->back_merge_fn          =   ll_back_merge_fn;
        q->front_merge_fn         =   ll_front_merge_fn;
        q->merge_requests_fn      =   ll_merge_requests_fn;
        q->prep_rq_fn             =   NULL;
        q->unplug_fn              =   generic_unplug_device;
        q->queue_flags            =   (1 << QUEUE_FLAG_CLUSTER);
        q->queue_lock             =   lock;

        blk_queue_segment_boundary(q, 0xffffffff);

        blk_queue_make_request(q, __make_request);

        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);

        /* all done */
        elevator_init(q, NULL);                                                 31
        /* (error handling and the final "return q;" are elided on this slide) */
}
sys_read()
ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
   struct file *file;
   ssize_t ret = -EBADF;
   int fput_needed;

    file = fget_light(fd, &fput_needed);
    if (file) {
         loff_t pos = file_pos_read(file);
         ret = vfs_read(file, buf, count, &pos);
         file_pos_write(file, pos);
         fput_light(file, fput_needed);
    }

    return ret;
}




                                                                32
vfs_read()
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
   ssize_t ret;

   if (!(file->f_mode & FMODE_READ))
         return -EBADF;
   if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
         return -EINVAL;
   if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
         return -EFAULT;

    ret = rw_verify_area(READ, file, pos, count);
    if (ret >= 0) {
         count = ret;
         if (file->f_op->read)
                  ret = file->f_op->read(file, buf, count, pos);
         else
                  ret = do_sync_read(file, buf, count, pos);
         if (ret > 0) {
                  fsnotify_access(file->f_dentry);
                  current->rchar += ret;
         }
         current->syscr++;
    }
    return ret;
}                                                                          33
do_sync_read()
ssize_t do_sync_read(struct file *filp, char __user *buf,
                 size_t len, loff_t *ppos)
{
   struct iovec iov = { .iov_base = buf, .iov_len = len };
   struct kiocb kiocb;
   ssize_t ret;

    init_sync_kiocb(&kiocb, filp);
    kiocb.ki_pos = *ppos;
    kiocb.ki_left = len;

    for (;;) {
         ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
         if (ret != -EIOCBRETRY)
                  break;
         wait_on_retry_sync_kiocb(&kiocb);
    }

    if (-EIOCBQUEUED == ret)
         ret = wait_on_sync_kiocb(&kiocb);
    *ppos = kiocb.ki_pos;
    return ret;
}                                                                     34
generic_file_aio_read()                                 1/2


ssize_t generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
         unsigned long nr_segs, loff_t pos)
{
   struct file *filp = iocb->ki_filp;
   ssize_t retval;
   unsigned long seg;
   size_t count;
   loff_t *ppos = &iocb->ki_pos;

   count = 0;
   for (seg = 0; seg < nr_segs; seg++) {
        const struct iovec *iv = &iov[seg];
        ...
   }

   /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
   if (filp->f_flags & O_DIRECT) {
        ...
   }



                                                                         35
generic_file_aio_read()                                2/2

   retval = 0;
   if (count) {
        for (seg = 0; seg < nr_segs; seg++) {
                 read_descriptor_t desc;

                desc.written = 0;
                desc.arg.buf = iov[seg].iov_base;
                desc.count = iov[seg].iov_len;
                if (desc.count == 0)
                         continue;
                desc.error = 0;
                do_generic_file_read(filp,ppos,&desc,file_read_actor);
                retval += desc.written;
                if (desc.error) {
                         retval = retval ?: desc.error;
                         break;
                }
        }
   }
out:
   return retval;
}
                                                                         36
do_generic_file_read()
static inline void do_generic_file_read(struct file * filp,
                               loff_t *ppos,
                               read_descriptor_t * desc,
                               read_actor_t actor)
{
   do_generic_mapping_read(filp->f_mapping,
                       &filp->f_ra,
                       filp,
                       ppos,
                       desc,
                       actor);
}




                                                              37

More Related Content

Viewers also liked

Characteristics of narration
Characteristics of  narrationCharacteristics of  narration
Characteristics of narrationphoebinku
 
Orange Sparkle Ball: Who We Are and What We Do
Orange Sparkle Ball: Who We Are and What We DoOrange Sparkle Ball: Who We Are and What We Do
Orange Sparkle Ball: Who We Are and What We DoOrange Sparkle Ball, Inc.
 
The Praying Indians of Megunko
The Praying Indians of MegunkoThe Praying Indians of Megunko
The Praying Indians of Megunkopebrodeur
 
Lookbook "The ballet of the Tsars"
Lookbook "The ballet of the Tsars"Lookbook "The ballet of the Tsars"
Lookbook "The ballet of the Tsars"Patricia Rosales
 
The history of video games goes as far back as the early 1940s
The history of video games goes as far back as the early 1940sThe history of video games goes as far back as the early 1940s
The history of video games goes as far back as the early 1940sJian Li
 
Formula of failure and success
Formula of failure and successFormula of failure and success
Formula of failure and successUjjwal Panda
 

Viewers also liked (13)

Abb v2
Abb v2Abb v2
Abb v2
 
Backtrack 3 USB
Backtrack 3 USBBacktrack 3 USB
Backtrack 3 USB
 
Unit 1 the universe
Unit 1 the universeUnit 1 the universe
Unit 1 the universe
 
Characteristics of narration
Characteristics of  narrationCharacteristics of  narration
Characteristics of narration
 
Orange Sparkle Ball: Who We Are and What We Do
Orange Sparkle Ball: Who We Are and What We DoOrange Sparkle Ball: Who We Are and What We Do
Orange Sparkle Ball: Who We Are and What We Do
 
The Praying Indians of Megunko
The Praying Indians of MegunkoThe Praying Indians of Megunko
The Praying Indians of Megunko
 
Jft 13-desktop-optical-power-meter-jfopt
Jft 13-desktop-optical-power-meter-jfoptJft 13-desktop-optical-power-meter-jfopt
Jft 13-desktop-optical-power-meter-jfopt
 
No tlp Polisi
No tlp PolisiNo tlp Polisi
No tlp Polisi
 
Hugps138
Hugps138Hugps138
Hugps138
 
CAMPUSMATE
CAMPUSMATECAMPUSMATE
CAMPUSMATE
 
Lookbook "The ballet of the Tsars"
Lookbook "The ballet of the Tsars"Lookbook "The ballet of the Tsars"
Lookbook "The ballet of the Tsars"
 
The history of video games goes as far back as the early 1940s
The history of video games goes as far back as the early 1940sThe history of video games goes as far back as the early 1940s
The history of video games goes as far back as the early 1940s
 
Formula of failure and success
Formula of failure and successFormula of failure and success
Formula of failure and success
 

Similar to Linux I/O path_20070116

2005_Structures and functions of Makefile
2005_Structures and functions of Makefile2005_Structures and functions of Makefile
2005_Structures and functions of MakefileNakCheon Jung
 
Advfs system calls & kernel interfaces
Advfs system calls & kernel interfacesAdvfs system calls & kernel interfaces
Advfs system calls & kernel interfacesJustin Goldberg
 
Head First Zend Framework - Part 1 Project & Application
Head First Zend Framework - Part 1 Project & ApplicationHead First Zend Framework - Part 1 Project & Application
Head First Zend Framework - Part 1 Project & ApplicationJace Ju
 
Linux Porting
Linux PortingLinux Porting
Linux PortingChamp Yen
 
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all startedKernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all startedAnne Nicolas
 
Disksim with SSD_extension
Disksim with SSD_extensionDisksim with SSD_extension
Disksim with SSD_extensioncucufrog
 
Symfony internals [english]
Symfony internals [english]Symfony internals [english]
Symfony internals [english]Raul Fraile
 
ch6-pv2-device-drivers
ch6-pv2-device-driversch6-pv2-device-drivers
ch6-pv2-device-driversyushiang fu
 
Linux kernel-rootkit-dev - Wonokaerun
Linux kernel-rootkit-dev - WonokaerunLinux kernel-rootkit-dev - Wonokaerun
Linux kernel-rootkit-dev - Wonokaerunidsecconf
 
Die .htaccess richtig nutzen
Die .htaccess richtig nutzenDie .htaccess richtig nutzen
Die .htaccess richtig nutzenWalter Ebert
 
Oaf development-guide
Oaf development-guideOaf development-guide
Oaf development-guide俊 朱
 
Re-Design with Elixir/OTP
Re-Design with Elixir/OTPRe-Design with Elixir/OTP
Re-Design with Elixir/OTPMustafa TURAN
 
Zend Framework 2 - presentation
Zend Framework 2 - presentationZend Framework 2 - presentation
Zend Framework 2 - presentationyamcsha
 
Web Application Firewall: Suckseed or Succeed
Web Application Firewall: Suckseed or SucceedWeb Application Firewall: Suckseed or Succeed
Web Application Firewall: Suckseed or SucceedPrathan Phongthiproek
 
Physical Memory Management.pdf
Physical Memory Management.pdfPhysical Memory Management.pdf
Physical Memory Management.pdfAdrian Huang
 
Drive by-download attack evolution zero nights v3
Drive by-download attack evolution zero nights v3Drive by-download attack evolution zero nights v3
Drive by-download attack evolution zero nights v3Sergey Soldatov
 
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...DefconRussia
 

Similar to Linux I/O path_20070116 (20)

2005_Structures and functions of Makefile
2005_Structures and functions of Makefile2005_Structures and functions of Makefile
2005_Structures and functions of Makefile
 
Advfs system calls & kernel interfaces
Advfs system calls & kernel interfacesAdvfs system calls & kernel interfaces
Advfs system calls & kernel interfaces
 
Head First Zend Framework - Part 1 Project & Application
Head First Zend Framework - Part 1 Project & ApplicationHead First Zend Framework - Part 1 Project & Application
Head First Zend Framework - Part 1 Project & Application
 
Linux Porting
Linux PortingLinux Porting
Linux Porting
 
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all startedKernel Recipes 2019 - ftrace: Where modifying a running kernel all started
Kernel Recipes 2019 - ftrace: Where modifying a running kernel all started
 
Disksim with SSD_extension
Disksim with SSD_extensionDisksim with SSD_extension
Disksim with SSD_extension
 
Symfony internals [english]
Symfony internals [english]Symfony internals [english]
Symfony internals [english]
 
ch6-pv2-device-drivers
ch6-pv2-device-driversch6-pv2-device-drivers
ch6-pv2-device-drivers
 
Understanding the Dalvik Virtual Machine
Understanding the Dalvik Virtual MachineUnderstanding the Dalvik Virtual Machine
Understanding the Dalvik Virtual Machine
 
Linux kernel-rootkit-dev - Wonokaerun
Linux kernel-rootkit-dev - WonokaerunLinux kernel-rootkit-dev - Wonokaerun
Linux kernel-rootkit-dev - Wonokaerun
 
Die .htaccess richtig nutzen
Die .htaccess richtig nutzenDie .htaccess richtig nutzen
Die .htaccess richtig nutzen
 
Oaf development-guide
Oaf development-guideOaf development-guide
Oaf development-guide
 
Re-Design with Elixir/OTP
Re-Design with Elixir/OTPRe-Design with Elixir/OTP
Re-Design with Elixir/OTP
 
BioMake BOSC 2004
BioMake BOSC 2004BioMake BOSC 2004
BioMake BOSC 2004
 
Php version 7
Php version 7Php version 7
Php version 7
 
Zend Framework 2 - presentation
Zend Framework 2 - presentationZend Framework 2 - presentation
Zend Framework 2 - presentation
 
Web Application Firewall: Suckseed or Succeed
Web Application Firewall: Suckseed or SucceedWeb Application Firewall: Suckseed or Succeed
Web Application Firewall: Suckseed or Succeed
 
Physical Memory Management.pdf
Physical Memory Management.pdfPhysical Memory Management.pdf
Physical Memory Management.pdf
 
Drive by-download attack evolution zero nights v3
Drive by-download attack evolution zero nights v3Drive by-download attack evolution zero nights v3
Drive by-download attack evolution zero nights v3
 
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...
Vladimir Kropotov - Drive-By-Download attack evolution before and after vulne...
 

Linux I/O path_20070116

  • 1. sys_read() fs/read_write.c mm/filemap.c fget_light() include/linux/fs.h fs/ext2/file.c vfs_read() file->f_op->read() do_sync_read() filp->f_op->aio_read() generic_file_aio_read() generic_file_direct_IO() do_generic_file_read() do_generic_mapping_read() Page Cache Generic Block Layer Elevator I/O Scheduler Request Queue 1 Device Driver
  • 2. mm/filemap.c do_generic_mapping_read() no find page cached page page cache readahead page readpage not readpage error up to date continue page ok out 2
  • 3. do_generic_mapping_read() mm/readahead.c mm/filemap.c page_cache_readahead() fs/ext2/inode.c fs/mpage.c blockable_page_cache_readahead() __do_page_cache_readahead() read_pages() mapping->a_ops->readpages() mapping->a_ops->readpage() ext2_readpages() ext2_get_block() ext2_readpage() mpage_readpages() mpage_readpage() do_mpage_readpage() mpage_bio_submit() submit_bio() Generic Block Layer Elevator I/O Scheduler Request Queue 3 Device Driver
  • 4. submit_bio() mm/mpage.c include/linux/blkdev.h generic_make_request(bio) BLK_TA_QUEUE block/elevator.c block/ll_rw_blk.c q->make_request_fn(q,bio) block/as-iosched.c __make_request(q,bio) elv_merge(q,req,bio) BLK_TA_SLEEPRQ q->back_merge_fn() init_request_from_bio() BLK_TA_GETRQ BLK_TA_MERGE ll_merge_requests_fn() get_request_wait() elv_may_queue() elv_merged_request() add_request() get_request() e->ops->elevator_merged_fn() __elv_add_request() current_io_context() elv_insert() blk_alloc_request() BLK_TA_INSERT e->ops->elevator_add_req_fn(q, rq) I/O Scheduler Request Queue 4 Device Driver
  • 5. drivers/ide/ide-generic.c ide_generic_init() drivers/ide/ide-probes.c drivers/ide-io.c ideprobe_init() block/ll_rw_blk.c hwif_init() init_irq() request_irq(hwif->irq,&ide_intr, ...) register IRQ handler ide_init_queue() blk_init_queue_node(do_ide_request, ...) ide_intr() q->request_fn = rfn register I/O request dispatcher do_ide_request() ide_do_request() 5
  • 6. include/linux/blkdev.h q->make_request_fn() block/elevator.c add_request() block/ll_rw_blk.c block/as-iosched.c __elv_add_request() BLK_TA_PLUG elv_insert() Disk request queue without an I/O scheduler list_add_tail() q->request_fn() BLK_TA_INSERT ide_do_request() elv_next_request() rq = __elv_next_request(q) start_request() BLK_TA_ISSUE(D) Disk interrupt for completion 6
  • 7. include/linux/blkdev.h q->make_request_fn() block/elevator.c block/ll_rw_blk.c add_request() block/as-iosched.c __elv_add_request() BLK_TA_PLUG elv_insert() Disk request queue with an I/O scheduler e->ops->elevator_add_req_fn(q, rq) q->request_fn() BLK_TA_INSERT ide_do_request() elv_next_request() rq = __elv_next_request(q) e->ops->elevator_dispatch_fn() start_request() BLK_TA_ISSUE(D) Disk interrupt for completion 7
  • 8. drivers/ide/ide-io.c ide_do_request() block/elevator.c block/as-iosched.c rq = elv_next_request(drive->queue) block/ll_rw_blk.c rq = __elv_next_request(q) q->elevator->ops->elevator_dispatch_fn(q, 0)) ISSUE start_request(drive, rq) static inline struct request *__elv_next_request(request_queue_t *q) { struct request *rq; while (1) { Disk while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); if (blk_do_ordered(q, &rq)) return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } 8 }
  • 9. static void *noop_init_queue(request_queue_t *q, elevator_t *e) block/noop-iosched.c { struct noop_data *nd; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); if (!nd) Eevator private return NULL; data structure INIT_LIST_HEAD(&nd->queue); return nd; } static void noop_add_request(request_queue_t *q, struct request *rq) { struct noop_data *nd = q->elevator->elevator_data; list_add_tail(&rq->queuelist, &nd->queue); } static int noop_dispatch(request_queue_t *q, int force) { struct noop_data *nd = q->elevator->elevator_data; Disk if (!list_empty(&nd->queue)) { struct request *rq; rq = list_entry(nd->queue.next, struct request, queuelist); list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; } return 0; 9 }
  • 10. The runtime data of Deadline i/o scheduler block/deadline-iosched.c struct deadline_data { /* requests (deadline_rq s) are present on both sort_list and fifo_list */ struct rb_root sort_list[2]; struct list_head fifo_list[2]; /* next in sort order. read, write or both are NULL */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* settings that change how the i/o scheduler behaves */ int fifo_expire[2]; int fifo_batch; int writes_starved; int front_merges; }; struct list_head fifo_list[READ] 6 4 5 struct rb_root sort_list[READ] 6 5 4 next_rq[READ] next_rq[WRITE] struct rb_root sort_list[WRITE] 9 8 7 struct list_head fifo_list[WRITE] 10 7 8 9
  • 11. block/deadline-iosched.c Add a request to both rb tree and fifo list static void deadline_add_request(struct request_queue *q, struct request *rq) { struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); deadline_add_rq_rb(dd, rq); /* * set expire time (only used for reads) and add to fifo list */ rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); } struct list_head fifo_list[READ] 6 4 5 struct rb_root sort_list[READ] 6 5 4 next_rq[READ] next_rq[WRITE] struct rb_root sort_list[WRITE] 9 8 7 struct list_head fifo_list[WRITE] 11 7 8 9
  • 12. struct list_head fifo_list[READ] 6 4 5 block/deadline-iosched.c struct rb_root sort_list[READ] 6 5 4 next_rq[READ] next_rq[WRITE] struct rb_root sort_list[WRITE] 9 8 7 struct list_head fifo_list[WRITE] 7 8 9 1.Check if we are running a sequential batch, and it is still entitled. if (dd->next_rq[WRITE]) rq = dd->next_rq[WRITE]; else rq = dd->next_rq[READ]; if (rq) { /* we have a "next request" */ if (dd->last_sector != rq->sector) /* end the batch on a non sequential request */ dd->batching += dd->fifo_batch; if (dd->batching < dd->fifo_batch) /* we are still entitled to batch */ goto dispatch_request; } 12
  • 13. struct list_head fifo_list[READ] 6 4 5 block/deadline-iosched.c struct rb_root sort_list[READ] 6 5 4 struct rb_root sort_list[WRITE] 9 8 7 struct list_head fifo_list[WRITE] 7 8 9 2. If we are not running a batch. Choose a new direction to serve requests. A read request is always favored, unless write has been starved. if (reads) { if (writes && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; goto dispatch_find_request; } if (writes) { dispatch_writes: dd->starved = 0; data_dir = WRITE; goto dispatch_find_request; } 13
  • 14. struct list_head fifo_list[READ] 6 4 5 block/deadline-iosched.c struct rb_root sort_list[READ] 6 5 4 next_rq[READ] struct rb_root sort_list[READ] 6 5 4 next_rq[READ] 3.Choose an appropriate request.. If the first request of the fifo list has expired, serve it. Otherwise, behave as an “One-way Elevator” dispatch_find_request: if (deadline_check_fifo(dd, data_dir)) { dd->batching = 0; rq = rq_entry_fifo(dd->fifo_list[data_dir].next); } else if (dd->next_rq[data_dir]) { rq = dd->next_rq[data_dir]; } else { struct rb_node *node; dd->batching = 0; node = rb_first(&dd->sort_list[data_dir]); if (node) rq = rb_entry_rq(node); } 14
  • 15. block/deadline-iosched.c Dispatch the request, remove it from the elevator’s private queue and put it in the dispatch queue. Also update the information about the “last” and the “next” request. struct list_head fifo_list[READ] 6 4 5 4 struct rb_root sort_list[READ] 6 5 Suppose the request 4 was picked in the previous step. next_rq[READ] static void deadline_move_request(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); struct rb_node *rbnext = rb_next(&rq->rb_node); Disk dd->next_rq[READ] = NULL; dd->next_rq[WRITE] = NULL; if (rbnext) dd->next_rq[data_dir] = rb_entry_rq(rbnext); dd->last_sector = rq->sector + rq->nr_sectors; deadline_move_to_dispatch(dd, rq); } 15
  • 16. block/as-iosched.c static void as_add_request(request_queue_t *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; int data_dir; RQ_SET_STATE(rq, AS_RQ_NEW); data_dir = rq_is_sync(rq); rq->elevator_private = as_get_io_context(q->node); if (RQ_IOC(rq)) { as_update_iohist(ad, RQ_IOC(rq)->aic, rq); atomic_inc(&RQ_IOC(rq)->aic->nr_queued); } as_add_rq_rb(ad, rq); /* * set expire time (only used for reads) and add to fifo list */ rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); as_update_rq(ad, rq); /* keep state machine up to date */ RQ_SET_STATE(rq, AS_RQ_QUEUED); } 16
  • 17. as_add_request() include/linux/list.h include/linux/elevator.h as_get_io_context(q->node) block/ll_rw_blk.c get_io_context() block/as-iosched.c current_io_context() alloc_as_io_context() task_struct io_context as_io_context as_update_iohist() as_update_thinktime() as_update_seekdist() as_add_rq_rb() request request request rq_set_fifo_time() list_add_tail() as_update_rq(ad, rq) task_struct io_context as_io_context as_choose_req() as_can_break_anticipation() as_update_iohist() as_antic_stop() request request del_timer() 17 kblockd_schedule_work()
  • 18. as_add_request() include/linux/list.h include/linux/elevator.h as_get_io_context(q->node) block/ll_rw_blk.c get_io_context() block/as-iosched.c current_io_context() alloc_as_io_context() task_struct io_context as_io_context as_update_iohist() as_update_thinktime() as_update_seekdist() as_add_rq_rb() request request request rq_set_fifo_time() list_add_tail() as_update_rq(ad, rq) task_struct io_context as_io_context as_choose_req() as_can_break_anticipation() as_update_iohist() as_antic_stop() request request del_timer() 18 kblockd_schedule_work()
  • 19. static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq) { ... if (data_dir == REQ_SYNC) { unsigned long in_flight = atomic_read(&aic->nr_queued) + atomic_read(&aic->nr_dispatched); spin_lock(&aic->lock); if (test_bit(AS_TASK_IORUNNING, &aic->state) || test_bit(AS_TASK_IOSTARTED, &aic->state)) { /* Calculate read -> read thinktime */ if (test_bit(AS_TASK_IORUNNING, &aic->state) && in_flight == 0) { thinktime = jiffies - aic->last_end_request; thinktime = min(thinktime, MAX_THINKTIME-1); } as_update_thinktime(ad, aic, thinktime); /* Calculate read -> read seek distance */ if (aic->last_request_pos < rq->sector) seek_dist = rq->sector - aic->last_request_pos; else seek_dist = aic->last_request_pos - rq->sector; as_update_seekdist(ad, aic, seek_dist); } aic->last_request_pos = rq->sector + rq->nr_sectors; set_bit(AS_TASK_IOSTARTED, &aic->state); spin_unlock(&aic->lock); } 19 }
  • 20. as_add_request() include/linux/list.h include/linux/elevator.h as_get_io_context(q->node) block/ll_rw_blk.c get_io_context() block/as-iosched.c current_io_context() alloc_as_io_context() task_struct io_context as_io_context as_update_iohist() as_update_thinktime() as_update_seekdist() as_add_rq_rb() request request request rq_set_fifo_time() list_add_tail() as_update_rq(ad, rq) task_struct io_context as_io_context as_choose_req() as_can_break_anticipation() as_update_iohist() as_antic_stop() request request del_timer() 20 kblockd_schedule_work()
• 21. enum anticipation_status {
              ANTIC_OFF = 0,       /* Not anticipating (normal operation) */
              ANTIC_WAIT_REQ,      /* The last read has not yet completed */
              ANTIC_WAIT_NEXT,     /* Currently anticipating a request vs
                                      last read (which has completed) */
              ANTIC_FINISHED,      /* Anticipating but have found a candidate
                                      or timed out */
      };

      [State-machine diagram: transitions between ANTIC_OFF, ANTIC_WAIT_REQ, ANTIC_WAIT_NEXT
      and ANTIC_FINISHED, driven by as_add_request(), as_dispatch_request(),
      as_completed_request(), as_add_rq_rb(), as_update_rq(), as_move_to_dispatch(),
      as_antic_waitreq(), as_antic_waitnext(), as_antic_stop(), as_antic_timeout() and
      kblockd_schedule_work().]
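Read the diagram as a four-state machine. The sketch below is a simplified user-space model of the transitions, inferred from the enum comments and the function names on the slide; the real code has more conditions than this:

/* Simplified model of the anticipatory scheduler's anticipation states.
 * Transitions are inferred from the slide's diagram, not lifted from the
 * kernel. */
#include <stdio.h>

enum antic { ANTIC_OFF, ANTIC_WAIT_REQ, ANTIC_WAIT_NEXT, ANTIC_FINISHED };

static const char *name[] = { "OFF", "WAIT_REQ", "WAIT_NEXT", "FINISHED" };

/* dispatching a read worth anticipating on: wait for it to complete */
static enum antic on_dispatch(enum antic s)
{
        return (s == ANTIC_OFF || s == ANTIC_FINISHED) ? ANTIC_WAIT_REQ : s;
}

/* the anticipated read completed: now wait (with a timer) for the next one */
static enum antic on_completed(enum antic s)
{
        return s == ANTIC_WAIT_REQ ? ANTIC_WAIT_NEXT : s;
}

/* a suitable new request arrived, or the timer fired: stop anticipating */
static enum antic on_break_or_timeout(enum antic s)
{
        return (s == ANTIC_WAIT_REQ || s == ANTIC_WAIT_NEXT) ? ANTIC_FINISHED : s;
}

int main(void)
{
        enum antic s = ANTIC_OFF;
        s = on_dispatch(s);          printf("%s\n", name[s]);
        s = on_completed(s);         printf("%s\n", name[s]);
        s = on_break_or_timeout(s);  printf("%s\n", name[s]);
        return 0;
}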
• 22. /*
       * This is called directly by the functions in this file to stop anticipation.
       * We kill the timer and schedule a call to the request_fn asap.
       */
      static void as_antic_stop(struct as_data *ad)
      {
              int status = ad->antic_status;

              if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
                      if (status == ANTIC_WAIT_NEXT)
                              del_timer(&ad->antic_timer);
                      ad->antic_status = ANTIC_FINISHED;
                      /* see as_work_handler */
                      kblockd_schedule_work(&ad->antic_work);
              }
      }
• 23. /*
       * as_update_rq must be called whenever a request (rq) is added to
       * the sort_list. This function keeps caches up to date, and checks if the
       * request might be one we are "anticipating"
       */
      static void as_update_rq(struct as_data *ad, struct request *rq)
      {
              const int data_dir = rq_is_sync(rq);

              /* keep the next_rq cache up to date */
              ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);

              /*
               * have we been anticipating this request? or does it come from the
               * same process as the one we are anticipating for?
               */
              if (ad->antic_status == ANTIC_WAIT_REQ ||
                  ad->antic_status == ANTIC_WAIT_NEXT) {
                      if (as_can_break_anticipation(ad, rq))
                              as_antic_stop(ad);
              }
      }
• 24. /*
       * This is executed in a "deferred" process context, by kblockd. It calls the
       * driver's request_fn so the driver can submit that request.
       *
       * IMPORTANT! This guy will reenter the elevator, so set up all queue global
       * state before calling, and don't rely on any state over calls.
       *
       * FIXME! dispatch queue is not a queue at all!
       */
      static void as_work_handler(void *data)
      {
              struct request_queue *q = data;
              unsigned long flags;

              spin_lock_irqsave(q->queue_lock, flags);
              blk_start_queueing(q);
              spin_unlock_irqrestore(q->queue_lock, flags);
      }
• 25. /*
       * as_antic_timeout is the timer function set by as_antic_waitnext.
       */
      static void as_antic_timeout(unsigned long data)
      {
              struct request_queue *q = (struct request_queue *)data;
              struct as_data *ad = q->elevator->elevator_data;
              unsigned long flags;

              spin_lock_irqsave(q->queue_lock, flags);
              if (ad->antic_status == ANTIC_WAIT_REQ ||
                  ad->antic_status == ANTIC_WAIT_NEXT) {
                      struct as_io_context *aic = ad->io_context->aic;

                      ad->antic_status = ANTIC_FINISHED;
                      kblockd_schedule_work(&ad->antic_work);

                      if (aic->ttime_samples == 0) {
                              /* process anticipated on has exited or timed out */
                              ad->exit_prob = (7*ad->exit_prob + 256)/8;
                      }
                      if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
                              /* process not "saved" by a cooperating request */
                              ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
                      }
              }
              spin_unlock_irqrestore(q->queue_lock, flags);
      }
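The two updates of the form (7*x + 256)/8 are fixed-point exponential averages: with 256 standing for a probability of 1, each timeout nudges the estimate one eighth of the way toward certainty. A tiny illustration:

/* Fixed-point moving average as used for exit_prob / exit_no_coop:
 * new = (7*old + 256) / 8 moves the value one eighth of the way toward 256,
 * where 256 represents a probability of 1. */
#include <stdio.h>

int main(void)
{
        int prob = 0;                       /* start: never seen an exit */
        for (int i = 1; i <= 5; i++) {
                prob = (7 * prob + 256) / 8;
                printf("after %d timeouts: prob = %d/256\n", i, prob);
        }
        return 0;
}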
• 26. static void as_put_io_context(struct request *rq)
      {
              struct as_io_context *aic;

              if (unlikely(!RQ_IOC(rq)))
                      return;

              aic = RQ_IOC(rq)->aic;

              if (rq_is_sync(rq) && aic) {
                      spin_lock(&aic->lock);
                      set_bit(AS_TASK_IORUNNING, &aic->state);
                      aic->last_end_request = jiffies;
                      spin_unlock(&aic->lock);
              }

              put_io_context(RQ_IOC(rq));
      }
• 27. [Overview diagram of the anticipatory scheduler's function relationships:
      as_update_rq(), as_choose_req(), as_can_anticipate(), as_can_break_anticipation(),
      as_update_iohist(), as_update_thinktime(), as_update_seekdist(), as_antic_stop(),
      as_antic_expired(), as_close_req(), as_completed_request(), as_dispatch_request(),
      as_move_to_dispatch(), update_write_batch(), as_antic_waitreq(), as_antic_waitnext(),
      as_fifo_expired(), as_batch_expired(), as_find_next_rq(), as_remove_queued_request(),
      elv_dispatch_sort(), as_get_io_context(), get_io_context(), copy_io_context(),
      alloc_as_io_context(), as_put_io_context(), put_io_context(),
      kblockd_schedule_work().]
• 28. [Diagram: the kblockd workqueue is set up at block-layer initialization —
      genhd_device_init() calls blk_dev_init(), which calls create_workqueue("kblockd").]
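What kblockd provides is simply a worker that runs deferred callbacks in process context. A user-space sketch of that idea, with pthreads standing in for the workqueue machinery (none of these names are kernel APIs):

/* User-space sketch of what a single-threaded work queue like kblockd
 * provides: callers hand a callback to the queue and it runs later in a
 * worker thread, outside the caller's context. */
#include <pthread.h>
#include <stdio.h>

struct work { void (*fn)(void *); void *data; };

static struct work pending;
static int has_work;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

/* queue_work(): record one deferred callback and wake the worker */
static void queue_work(void (*fn)(void *), void *data)
{
        pthread_mutex_lock(&lock);
        pending.fn = fn;
        pending.data = data;
        has_work = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

/* the "kblockd" worker: runs whatever was queued */
static void *worker(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&lock);
        while (!has_work)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        pending.fn(pending.data);
        return NULL;
}

static void antic_work(void *data)
{
        printf("deferred work runs: %s\n", (const char *)data);
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        queue_work(antic_work, "kick the request_fn");
        pthread_join(t, NULL);
        return 0;
}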
• 29. [Flow: blk_start_queueing() -> blk_queue_plugged(); not plugged -> q->request_fn();
      plugged -> __generic_unplug_device() -> blk_remove_plug() -> del_timer() ->
      q->request_fn().]

      void blk_start_queueing(request_queue_t *q)
      {
              if (!blk_queue_plugged(q))
                      q->request_fn(q);
              else
                      __generic_unplug_device(q);
      }

      void __generic_unplug_device(request_queue_t *q)
      {
              if (unlikely(blk_queue_stopped(q)))
                      return;

              if (!blk_remove_plug(q))
                      return;

              q->request_fn(q);
      }

      int blk_remove_plug(request_queue_t *q)
      {
              WARN_ON(!irqs_disabled());

              if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
                      return 0;

              del_timer(&q->unplug_timer);
              return 1;
      }
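Plugging only holds dispatch back briefly so adjacent requests get a chance to merge before the driver sees them. A minimal user-space model of the flag-and-flush logic (illustrative names only):

/* User-space model of queue plugging: while "plugged", submitted requests
 * only accumulate; an explicit unplug (or, in the kernel, the unplug timer)
 * clears the flag and pushes everything to the "driver". */
#include <stdio.h>

struct queue {
        int plugged;
        int nr_pending;
};

static void run_driver(struct queue *q)
{
        printf("driver sees %d request(s)\n", q->nr_pending);
        q->nr_pending = 0;
}

static void submit(struct queue *q)
{
        q->nr_pending++;
        if (!q->plugged)            /* not plugged: dispatch immediately */
                run_driver(q);
}

static void unplug(struct queue *q)
{
        if (!q->plugged)
                return;
        q->plugged = 0;             /* like test_and_clear_bit(QUEUE_FLAG_PLUGGED) */
        run_driver(q);              /* like q->request_fn(q) */
}

int main(void)
{
        struct queue q = { .plugged = 1, .nr_pending = 0 };
        submit(&q);                 /* held back */
        submit(&q);                 /* held back, may merge with the first */
        unplug(&q);                 /* driver now sees both at once */
        return 0;
}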
• 30. block/ll_rw_blk.c

      void blk_queue_make_request(request_queue_t *q, make_request_fn *mfn)
      {
              q->nr_requests = BLKDEV_MAX_RQ;
              blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
              blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
              q->make_request_fn = mfn;
              q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
              q->backing_dev_info.state = 0;
              q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
              blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
              blk_queue_hardsect_size(q, 512);
              blk_queue_dma_alignment(q, 511);
              blk_queue_congestion_threshold(q);
              q->nr_batching = BLK_BATCH_REQ;

              q->unplug_thresh = 4;                   /* hmm */
              q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
              if (q->unplug_delay == 0)
                      q->unplug_delay = 1;

              INIT_WORK(&q->unplug_work, blk_unplug_work, q);

              q->unplug_timer.function = blk_unplug_timeout;
              q->unplug_timer.data = (unsigned long)q;

              blk_queue_activity_fn(q, NULL, NULL);
      }
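Two of these defaults are worth unpacking (the figures below assume the usual VM_MAX_READAHEAD of 128 KB and 4 KB pages, which the slide does not show): ra_pages = (128 * 1024) / 4096 = 32 pages, i.e. a 128 KB default readahead window; and (3 * HZ) / 1000 is 3 ms expressed in timer ticks, forced up to at least one tick when a low HZ would make it round to zero.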
• 31. include/linux/blkdev.h

      request_queue_t *
      blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
      {
              request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

              if (!q)
                      return NULL;

              q->node = node_id;
              blk_init_free_list(q);

              q->request_fn           = rfn;
              q->back_merge_fn        = ll_back_merge_fn;
              q->front_merge_fn       = ll_front_merge_fn;
              q->merge_requests_fn    = ll_merge_requests_fn;
              q->prep_rq_fn           = NULL;
              q->unplug_fn            = generic_unplug_device;
              q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
              q->queue_lock           = lock;

              blk_queue_segment_boundary(q, 0xffffffff);

              blk_queue_make_request(q, __make_request);
              blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);

              /* all done */
              elevator_init(q, NULL);
      }
• 32. sys_read()

      ssize_t sys_read(unsigned int fd, char __user *buf, size_t count)
      {
              struct file *file;
              ssize_t ret = -EBADF;
              int fput_needed;

              file = fget_light(fd, &fput_needed);
              if (file) {
                      loff_t pos = file_pos_read(file);
                      ret = vfs_read(file, buf, count, &pos);
                      file_pos_write(file, pos);
                      fput_light(file, fput_needed);
              }

              return ret;
      }
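Everything traced in these slides is set off by an ordinary read(2) from user space; for orientation, a minimal caller (plain POSIX, nothing kernel-specific):

/* Minimal user-space trigger for the path traced in these slides:
 * read(2) enters the kernel at sys_read(). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = open("/etc/hostname", O_RDONLY);   /* any regular file will do */
        if (fd < 0)
                return 1;

        ssize_t n = read(fd, buf, sizeof(buf));     /* -> sys_read() -> vfs_read() ... */
        if (n > 0)
                fwrite(buf, 1, (size_t)n, stdout);

        close(fd);
        return 0;
}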
• 33. vfs_read()

      ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
      {
              ssize_t ret;

              if (!(file->f_mode & FMODE_READ))
                      return -EBADF;
              if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
                      return -EINVAL;
              if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
                      return -EFAULT;

              ret = rw_verify_area(READ, file, pos, count);
              if (ret >= 0) {
                      count = ret;
                      if (file->f_op->read)
                              ret = file->f_op->read(file, buf, count, pos);
                      else
                              ret = do_sync_read(file, buf, count, pos);
                      if (ret > 0) {
                              fsnotify_access(file->f_dentry);
                              current->rchar += ret;
                      }
                      current->syscr++;
              }

              return ret;
      }
• 34. do_sync_read()

      ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
      {
              struct iovec iov = { .iov_base = buf, .iov_len = len };
              struct kiocb kiocb;
              ssize_t ret;

              init_sync_kiocb(&kiocb, filp);
              kiocb.ki_pos = *ppos;
              kiocb.ki_left = len;

              for (;;) {
                      ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
                      if (ret != -EIOCBRETRY)
                              break;
                      wait_on_retry_sync_kiocb(&kiocb);
              }

              if (-EIOCBQUEUED == ret)
                      ret = wait_on_sync_kiocb(&kiocb);
              *ppos = kiocb.ki_pos;
              return ret;
      }
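do_sync_read() is the usual sync-over-async pattern: submit through the asynchronous entry point and, if the operation was queued, sleep until it completes. A user-space analogue of the pattern, with a condition variable standing in for the kiocb wait (illustrative names only):

/* User-space analogue of the sync-over-async pattern in do_sync_read():
 * an asynchronous operation completes via a callback; the synchronous
 * wrapper submits it and sleeps until the completion fires. */
#include <pthread.h>
#include <stdio.h>

struct sync_op {
        pthread_mutex_t lock;
        pthread_cond_t  done_cond;
        int             done;
        long            result;
};

/* completion callback: record the result and wake the sleeping waiter */
static void op_complete(struct sync_op *op, long result)
{
        pthread_mutex_lock(&op->lock);
        op->result = result;
        op->done = 1;
        pthread_cond_signal(&op->done_cond);
        pthread_mutex_unlock(&op->lock);
}

/* the "async" worker: pretends to read 4096 bytes, then completes */
static void *async_read(void *arg)
{
        op_complete(arg, 4096);
        return NULL;
}

/* synchronous wrapper: submit, then wait for completion */
static long sync_read(void)
{
        struct sync_op op = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .done_cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, async_read, &op);   /* like ->aio_read() queuing */

        pthread_mutex_lock(&op.lock);
        while (!op.done)                             /* like wait_on_sync_kiocb() */
                pthread_cond_wait(&op.done_cond, &op.lock);
        pthread_mutex_unlock(&op.lock);

        pthread_join(t, NULL);
        return op.result;
}

int main(void)
{
        printf("read returned %ld\n", sync_read());
        return 0;
}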
• 35. generic_file_aio_read() 1/2

      ssize_t
      generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                      unsigned long nr_segs, loff_t pos)
      {
              struct file *filp = iocb->ki_filp;
              ssize_t retval;
              unsigned long seg;
              size_t count;
              loff_t *ppos = &iocb->ki_pos;

              count = 0;
              for (seg = 0; seg < nr_segs; seg++) {
                      const struct iovec *iv = &iov[seg];
                      ...
              }

              /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
              if (filp->f_flags & O_DIRECT) {
                      ...
              }
• 36. generic_file_aio_read() 2/2

              retval = 0;
              if (count) {
                      for (seg = 0; seg < nr_segs; seg++) {
                              read_descriptor_t desc;

                              desc.written = 0;
                              desc.arg.buf = iov[seg].iov_base;
                              desc.count = iov[seg].iov_len;
                              if (desc.count == 0)
                                      continue;
                              desc.error = 0;
                              do_generic_file_read(filp, ppos, &desc, file_read_actor);
                              retval += desc.written;
                              if (desc.error) {
                                      retval = retval ?: desc.error;
                                      break;
                              }
                      }
              }
      out:
              return retval;
      }
• 37. do_generic_file_read()

      static inline void do_generic_file_read(struct file *filp, loff_t *ppos,
                                              read_descriptor_t *desc,
                                              read_actor_t actor)
      {
              do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp,
                                      ppos, desc, actor);
      }