[PATCH] Initial md/raid5 support for 2.5 (with bio)

author Neil Brown <neilb@cse.unsw.edu.au>

Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)

committer Christoph Hellwig <hch@sb.bsdonline.org>

Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)
author Neil Brown <neilb@cse.unsw.edu.au>
Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)
committer Christoph Hellwig <hch@sb.bsdonline.org>
Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 2b1a05ae583777521c8602c3edd574df01ea30b7..8133b7c0952fb47baec12469712189dc3fd13207 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,19 +24,19 @@
  #include <asm/bitops.h>
  #include <asm/atomic.h>
  
-static mdk_personality_t raid5_personality;
-
  /*
   * Stripe cache
   */
  
  #define NR_STRIPES             256
+#define STRIPE_SIZE            PAGE_SIZE
+#define STRIPE_SECTORS         (STRIPE_SIZE>>9)
  #define        IO_THRESHOLD            1
  #define HASH_PAGES             1
  #define HASH_PAGES_ORDER       0
  #define NR_HASH                        (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
  #define HASH_MASK              (NR_HASH - 1)
-#define stripe_hash(conf, sect)        ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
+#define stripe_hash(conf, sect)        ((conf)->stripe_hashtbl[((sect) / STRIPE_SECTORS) & HASH_MASK])
  
  /*
   * The following can be used to debug the driver
@@ -142,47 +142,36 @@ out:
  
  static void shrink_buffers(struct stripe_head *sh, int num)
  {
-       struct buffer_head *bh;
+       struct page *p;
         int i;
  
         for (i=0; i<num ; i++) {
-               bh = sh->bh_cache[i];
-               if (!bh)
-                       return;
-               sh->bh_cache[i] = NULL;
-               free_page((unsigned long) bh->b_data);
-               kfree(bh);
+               p = sh->dev[i].page;
+               if (!p)
+                       continue;
+               sh->dev[i].page = NULL;
+               page_cache_release(p);
         }
  }
  
-static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
+static int grow_buffers(struct stripe_head *sh, int num)
  {
-       struct buffer_head *bh;
         int i;
  
         for (i=0; i<num; i++) {
                 struct page *page;
-               bh = kmalloc(sizeof(struct buffer_head), priority);
-               if (!bh)
-                       return 1;
-               memset(bh, 0, sizeof (struct buffer_head));
-               if ((page = alloc_page(priority)))
-                       bh->b_data = page_address(page);
-               else {
-                       kfree(bh);
+
+               if (!(page = alloc_page(GFP_KERNEL))) {
                         return 1;
                 }
-               atomic_set(&bh->b_count, 0);
-               bh->b_page = page;
-               sh->bh_cache[i] = bh;
-
+               sh->dev[i].page = page;
         }
         return 0;
  }
  
-static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
+static void raid5_build_block (struct stripe_head *sh, int i);
  
-static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int disks = conf->raid_disks, i;
@@ -198,40 +187,26 @@ static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
         remove_hash(sh);
         
         sh->sector = sector;
-       sh->size = conf->buffer_size;
+       sh->pd_idx = pd_idx;
         sh->state = 0;
  
         for (i=disks; i--; ) {
-               if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
-                   buffer_locked(sh->bh_cache[i])) {
+               struct r5dev *dev = &sh->dev[i];
+
+               if (dev->toread || dev->towrite || dev->written ||
+                   test_bit(R5_LOCKED, &dev->flags)) {
                         printk("sector=%lx i=%d %p %p %p %d\n",
-                              sh->sector, i, sh->bh_read[i],
-                              sh->bh_write[i], sh->bh_written[i],
-                              buffer_locked(sh->bh_cache[i]));
+                              sh->sector, i, dev->toread,
+                              dev->towrite, dev->written,
+                              test_bit(R5_LOCKED, &dev->flags));
                         BUG();
                 }
-               clear_buffer_uptodate(sh->bh_cache[i]);
+               dev->flags = 0;
                 raid5_build_block(sh, i);
         }
         insert_hash(conf, sh);
  }
  
-/* the buffer size has changed, so unhash all stripes
- * as active stripes complete, they will go onto inactive list
- */
-static void shrink_stripe_cache(raid5_conf_t *conf)
-{
-       int i;
-       CHECK_DEVLOCK();
-       if (atomic_read(&conf->active_stripes))
-               BUG();
-       for (i=0; i < NR_HASH; i++) {
-               struct stripe_head *sh;
-               while ((sh = conf->stripe_hashtbl[i])) 
-                       remove_hash(sh);
-       }
-}
-
  static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
  {
         struct stripe_head *sh;
@@ -245,7 +220,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long secto
         return NULL;
  }
  
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock) 
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, 
+                                            int pd_idx, int noblock) 
  {
         struct stripe_head *sh;
  
@@ -254,44 +230,6 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long s
         spin_lock_irq(&conf->device_lock);
  
         do {
-               if (conf->buffer_size == 0 ||
-                   (size && size != conf->buffer_size)) {
-                       /* either the size is being changed (buffer_size==0) or
-                        * we need to change it.
-                        * If size==0, we can proceed as soon as buffer_size gets set.
-                        * If size>0, we can proceed when active_stripes reaches 0, or
-                        * when someone else sets the buffer_size to size.
-                        * If someone sets the buffer size to something else, we will need to
-                        * assert that we want to change it again
-                        */
-                       int oldsize = conf->buffer_size;
-                       PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
-                       if (size==0)
-                               wait_event_lock_irq(conf->wait_for_stripe,
-                                                   conf->buffer_size,
-                                                   conf->device_lock);
-                       else {
-                               while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
-                                       conf->buffer_size = 0;
-                                       wait_event_lock_irq(conf->wait_for_stripe,
-                                                           atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
-                                                           conf->device_lock);
-                                       PRINTK("waited and now  %ld/%d buffer_size is %d - %d active\n", sector, size,
-                                              conf->buffer_size, atomic_read(&conf->active_stripes));
-                               }
-
-                               if (conf->buffer_size != size) {
-                                       printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
-                                       shrink_stripe_cache(conf);
-                                       if (size==0) BUG();
-                                       conf->buffer_size = size;
-                                       PRINTK("size now %d\n", conf->buffer_size);
-                               }
-                       }
-               }
-               if (size == 0)
-                       sector -= sector & ((conf->buffer_size>>9)-1);
-
                 sh = __find_stripe(conf, sector);
                 if (!sh) {
                         if (!conf->inactive_blocked)
@@ -307,7 +245,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long s
                                                     conf->device_lock);
                                 conf->inactive_blocked = 0;
                         } else
-                               init_stripe(sh, sector);
+                               init_stripe(sh, sector, pd_idx);
                 } else {
                         if (atomic_read(&sh->count)) {
                                 if (!list_empty(&sh->lru))
@@ -329,21 +267,31 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long s
         return sh;
  }
  
-static int grow_stripes(raid5_conf_t *conf, int num, int priority)
+static int grow_stripes(raid5_conf_t *conf, int num)
  {
         struct stripe_head *sh;
+       kmem_cache_t *sc;
+       int devs = conf->raid_disks;
+
+       sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor);
  
+       sc = kmem_cache_create(conf->cache_name, 
+                              sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
+                              0, 0, NULL, NULL);
+       if (!sc)
+               return 1;
+       conf->slab_cache = sc;
         while (num--) {
-               sh = kmalloc(sizeof(struct stripe_head), priority);
+               sh = kmem_cache_alloc(sc, GFP_KERNEL);
                 if (!sh)
                         return 1;
-               memset(sh, 0, sizeof(*sh));
+               memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
                 sh->raid_conf = conf;
                 sh->lock = SPIN_LOCK_UNLOCKED;
  
-               if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
+               if (grow_buffers(sh, conf->raid_disks)) {
                         shrink_buffers(sh, conf->raid_disks);
-                       kfree(sh);
+                       kmem_cache_free(sc, sh);
                         return 1;
                 }
                 /* we just created an active stripe so... */
@@ -355,11 +303,11 @@ static int grow_stripes(raid5_conf_t *conf, int num, int priority)
         return 0;
  }
  
-static void shrink_stripes(raid5_conf_t *conf, int num)
+static void shrink_stripes(raid5_conf_t *conf)
  {
         struct stripe_head *sh;
  
-       while (num--) {
+       while (1) {
                 spin_lock_irq(&conf->device_lock);
                 sh = get_free_stripe(conf);
                 spin_unlock_irq(&conf->device_lock);
@@ -368,21 +316,22 @@ static void shrink_stripes(raid5_conf_t *conf, int num)
                 if (atomic_read(&sh->count))
                         BUG();
                 shrink_buffers(sh, conf->raid_disks);
-               kfree(sh);
+               kmem_cache_free(conf->slab_cache, sh);
                 atomic_dec(&conf->active_stripes);
         }
+       kmem_cache_destroy(conf->slab_cache);
+       conf->slab_cache = NULL;
  }
  
-
-static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
+static void raid5_end_read_request (struct bio * bi)
  {
-       struct stripe_head *sh = bh->b_private;
+       struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
         int disks = conf->raid_disks, i;
-       unsigned long flags;
+       int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  
         for (i=0 ; i<disks; i++)
-               if (bh == sh->bh_cache[i])
+               if (bi == &sh->dev[i].req)
                         break;
  
         PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
@@ -392,7 +341,9 @@ static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
         }
  
         if (uptodate) {
-               struct buffer_head *buffer;
+#if 0
+               struct bio *bio;
+               unsigned long flags;
                 spin_lock_irqsave(&conf->device_lock, flags);
                 /* we can return a buffer if we bypassed the cache or
                  * if the top buffer is not in highmem.  If there are
@@ -409,38 +360,43 @@ static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
                 } else
                         buffer = NULL;
                 spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (sh->bh_page[i]==NULL)
+               if (sh->bh_page[i]==bh->b_page)
                         set_buffer_uptodate(bh);
                 if (buffer) {
                         if (buffer->b_page != bh->b_page)
                                 memcpy(buffer->b_data, bh->b_data, bh->b_size);
                         buffer->b_end_io(buffer, 1);
                 }
+#else
+               set_bit(R5_UPTODATE, &sh->dev[i].flags);
+#endif         
         } else {
-               md_error(conf->mddev, bh->b_bdev);
-               clear_buffer_uptodate(bh);
+               md_error(conf->mddev, bi->bi_bdev);
+               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
         }
+#if 0
         /* must restore b_page before unlocking buffer... */
-       if (sh->bh_page[i]) {
+       if (sh->bh_page[i] != bh->b_page) {
                 bh->b_page = sh->bh_page[i];
                 bh->b_data = page_address(bh->b_page);
-               sh->bh_page[i] = NULL;
                 clear_buffer_uptodate(bh);
         }
-       clear_buffer_locked(bh);
+#endif
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
  }
  
-static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
+static void raid5_end_write_request (struct bio *bi)
  {
-       struct stripe_head *sh = bh->b_private;
+       struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
         int disks = conf->raid_disks, i;
         unsigned long flags;
+       int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  
         for (i=0 ; i<disks; i++)
-               if (bh == sh->bh_cache[i])
+               if (bi == &sh->dev[i].req)
                         break;
  
         PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
@@ -451,29 +407,36 @@ static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
  
         spin_lock_irqsave(&conf->device_lock, flags);
         if (!uptodate)
-               md_error(conf->mddev, bh->b_bdev);
-       clear_buffer_locked(bh);
+               md_error(conf->mddev, bi->bi_bdev);
+       
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         __release_stripe(conf, sh);
         spin_unlock_irqrestore(&conf->device_lock, flags);
  }
-       
  
  
-static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
+static unsigned long compute_blocknr(struct stripe_head *sh, int i);
+       
+static void raid5_build_block (struct stripe_head *sh, int i)
  {
         raid5_conf_t *conf = sh->raid_conf;
-       struct buffer_head *bh = sh->bh_cache[i];
-       unsigned long block = sh->sector / (sh->size >> 9);
-
-       init_buffer(bh, raid5_end_read_request, sh);
-       bh->b_dev       = conf->disks[i].dev;
-       /* FIXME - later we will need bdev here */
-       bh->b_blocknr   = block;
-
-       bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
-       bh->b_size      = sh->size;
-       return bh;
+       struct r5dev *dev = &sh->dev[i];
+
+       bio_init(&dev->req);
+       dev->req.bi_io_vec = &dev->vec;
+       dev->req.bi_vcnt++;
+       dev->vec.bv_page = dev->page;
+       dev->vec.bv_len = STRIPE_SIZE;
+       dev->vec.bv_offset = 0;
+
+       dev->req.bi_bdev = conf->disks[i].bdev;
+       dev->req.bi_sector = sh->sector;
+       dev->req.bi_private = sh;
+
+       dev->flags = 0;
+       if (i != sh->pd_idx)
+               dev->sector = compute_blocknr(sh, i);
  }
  
  static int error (mddev_t *mddev, kdev_t dev)
@@ -544,14 +507,14 @@ static int error (mddev_t *mddev, kdev_t dev)
   * Input: a 'big' sector number,
   * Output: index of the data and parity disk, and the sector # in them.
   */
-static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
+static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
                         unsigned int data_disks, unsigned int * dd_idx,
                         unsigned int * pd_idx, raid5_conf_t *conf)
  {
-       unsigned long stripe;
+       sector_t stripe;
         unsigned long chunk_number;
         unsigned int chunk_offset;
-       unsigned long new_sector;
+       sector_t new_sector;
         int sectors_per_chunk = conf->chunk_size >> 9;
  
         /* First compute the information on this sector */
@@ -607,17 +570,17 @@ static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int r
         return new_sector;
  }
  
-#if 0
-static unsigned long compute_blocknr(struct stripe_head *sh, int i)
+
+static sector_t compute_blocknr(struct stripe_head *sh, int i)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
-       unsigned long new_sector = sh->sector, check;
+       sector_t new_sector = sh->sector, check;
         int sectors_per_chunk = conf->chunk_size >> 9;
-       unsigned long stripe = new_sector / sectors_per_chunk;
+       sector_t stripe = new_sector / sectors_per_chunk;
         int chunk_offset = new_sector % sectors_per_chunk;
         int chunk_number, dummy1, dummy2, dd_idx = i;
-       unsigned long r_sector, blocknr;
+       sector_t r_sector;
  
         switch (conf->algorithm) {
                 case ALGORITHM_LEFT_ASYMMETRIC:
@@ -637,22 +600,72 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i)
  
         chunk_number = stripe * data_disks + i;
         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
-       blocknr = r_sector / (sh->size >> 9);
  
         check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
         if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
                 printk("compute_blocknr: map not correct\n");
                 return 0;
         }
-       return blocknr;
+       return r_sector;
  }
-#endif
  
-#define check_xor()    do {                                    \
-                          if (count == MAX_XOR_BLOCKS) {       \
-                               xor_block(count, bh_ptr);       \
-                               count = 1;                      \
-                          }                                    \
+
+
+/*
+ * Copy data between a page in the stripe cache, and one or more bion
+ * The page could align with the middle of the bio, or there could be 
+ * several bion, each with several bio_vecs, which cover part of the page
+ * Multiple bion are linked together on bi_next.  There may be extras
+ * at the end of this list.  We ignore them.
+ */
+static void copy_data(int frombio, struct bio *bio,
+                    struct page *page,
+                    sector_t sector)
+{
+       char *pa = page_address(page);
+       struct bio_vec *bvl;
+       int i;
+
+       for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
+               bio = bio->bi_next) {
+               int page_offset;
+               if (bio->bi_sector >= sector)
+                       page_offset = (signed)(bio->bi_sector - sector) * 512;
+               else 
+                       page_offset = (signed)(sector - bio->bi_sector) * -512;
+               bio_for_each_segment(bvl, bio, i) {
+                       char *ba = __bio_kmap(bio, i);
+                       int len = bio_iovec_idx(bio,i)->bv_len;
+                       int clen;
+                       int b_offset = 0;                       
+
+                       if (page_offset < 0) {
+                               b_offset = -page_offset;
+                               page_offset += b_offset;
+                               len -= b_offset;
+                       }
+
+                       if (len > 0 && page_offset + len > STRIPE_SIZE)
+                               clen = STRIPE_SIZE - page_offset;       
+                       else clen = len;
+                       
+                       if (len > 0) {
+                               if (frombio)
+                                       memcpy(pa+page_offset, ba+b_offset, clen);
+                               else
+                                       memcpy(ba+b_offset, pa+page_offset, clen);
+                       }
+                       __bio_kunmap(bio, i);
+                       page_offset +=  len;
+               }
+       }
+}
+
+#define check_xor()    do {                                            \
+                          if (count == MAX_XOR_BLOCKS) {               \
+                               xor_block(count, STRIPE_SIZE, ptr);     \
+                               count = 1;                              \
+                          }                                            \
                         } while(0)
  
  
@@ -660,88 +673,84 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int i, count, disks = conf->raid_disks;
-       struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
+       void *ptr[MAX_XOR_BLOCKS], *p;
  
         PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
  
-
-       memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
-       bh_ptr[0] = sh->bh_cache[dd_idx];
+       ptr[0] = page_address(sh->dev[dd_idx].page);
+       memset(ptr[0], 0, STRIPE_SIZE);
         count = 1;
         for (i = disks ; i--; ) {
                 if (i == dd_idx)
                         continue;
-               bh = sh->bh_cache[i];
-               if (buffer_uptodate(bh))
-                       bh_ptr[count++] = bh;
+               p = page_address(sh->dev[i].page);
+               if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
+                       ptr[count++] = p;
                 else
                         printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
  
                 check_xor();
         }
         if (count != 1)
-               xor_block(count, bh_ptr);
-       set_buffer_uptodate(sh->bh_cache[dd_idx]);
+               xor_block(count, STRIPE_SIZE, ptr);
+       set_bit(R5_UPTODATE, &sh->dev[i].flags);
  }
  
  static void compute_parity(struct stripe_head *sh, int method)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
-       struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
-       struct buffer_head *chosen[MD_SB_DISKS];
+       void *ptr[MAX_XOR_BLOCKS];
+       struct bio *chosen[MD_SB_DISKS];
  
         PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
         memset(chosen, 0, sizeof(chosen));
  
         count = 1;
-       bh_ptr[0] = sh->bh_cache[pd_idx];
+       ptr[0] = page_address(sh->dev[pd_idx].page);
         switch(method) {
         case READ_MODIFY_WRITE:
-               if (!buffer_uptodate(sh->bh_cache[pd_idx]))
+               if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
                         BUG();
                 for (i=disks ; i-- ;) {
                         if (i==pd_idx)
                                 continue;
-                       if (sh->bh_write[i] &&
-                           buffer_uptodate(sh->bh_cache[i])) {
-                               bh_ptr[count++] = sh->bh_cache[i];
-                               chosen[i] = sh->bh_write[i];
-                               sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
-                               chosen[i]->b_reqnext = sh->bh_written[i];
-                               sh->bh_written[i] = chosen[i];
+                       if (sh->dev[i].towrite &&
+                           test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+                               ptr[count++] = page_address(sh->dev[i].page);
+                               chosen[i] = sh->dev[i].towrite;
+                               sh->dev[i].towrite = NULL;
+                               if (sh->dev[i].written) BUG();
+                               sh->dev[i].written = chosen[i];
                                 check_xor();
                         }
                 }
                 break;
         case RECONSTRUCT_WRITE:
-               memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
+               memset(ptr[0], 0, STRIPE_SIZE);
                 for (i= disks; i-- ;)
-                       if (i!=pd_idx && sh->bh_write[i]) {
-                               chosen[i] = sh->bh_write[i];
-                               sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
-                               chosen[i]->b_reqnext = sh->bh_written[i];
-                               sh->bh_written[i] = chosen[i];
+                       if (i!=pd_idx && sh->dev[i].towrite) {
+                               chosen[i] = sh->dev[i].towrite;
+                               sh->dev[i].towrite = NULL;
+                               if (sh->dev[i].written) BUG();
+                               sh->dev[i].written = chosen[i];
                         }
                 break;
         case CHECK_PARITY:
                 break;
         }
         if (count>1) {
-               xor_block(count, bh_ptr);
+               xor_block(count, STRIPE_SIZE, ptr);
                 count = 1;
         }
         
         for (i = disks; i--;)
                 if (chosen[i]) {
-                       struct buffer_head *bh = sh->bh_cache[i];
-                       char *bdata;
-                       bdata = bh_kmap(chosen[i]);
-                       memcpy(bh->b_data,
-                              bdata,sh->size);
-                       bh_kunmap(chosen[i]);
-                       set_buffer_locked(bh);
-                       set_buffer_uptodate(bh);
+                       sector_t sector = sh->dev[i].sector;
+                       copy_data(1, chosen[i], sh->dev[i].page, sector);
+
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
                 }
  
         switch(method) {
@@ -749,55 +758,74 @@ static void compute_parity(struct stripe_head *sh, int method)
         case CHECK_PARITY:
                 for (i=disks; i--;)
                         if (i != pd_idx) {
-                               bh_ptr[count++] = sh->bh_cache[i];
+                               ptr[count++] = page_address(sh->dev[i].page);
                                 check_xor();
                         }
                 break;
         case READ_MODIFY_WRITE:
                 for (i = disks; i--;)
                         if (chosen[i]) {
-                               bh_ptr[count++] = sh->bh_cache[i];
+                               ptr[count++] = page_address(sh->dev[i].page);
                                 check_xor();
                         }
         }
         if (count != 1)
-               xor_block(count, bh_ptr);
+               xor_block(count, STRIPE_SIZE, ptr);
         
         if (method != CHECK_PARITY) {
-               set_buffer_uptodate(sh->bh_cache[pd_idx]);
-               set_buffer_locked(sh->bh_cache[pd_idx]);
+               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+               set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
         } else
-               clear_buffer_uptodate(sh->bh_cache[pd_idx]);
+               clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
  }
  
-static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+/*
+ * Each stripe/dev can have one or more bion attached.
+ * toread/towrite point to the first in a chain. 
+ * The bi_next chain must be in order.
+ */
+static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
  {
-       struct buffer_head **bhp;
+       struct bio **bip;
         raid5_conf_t *conf = sh->raid_conf;
  
-       PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
+       PRINTK("adding bh b#%lu to stripe s#%lu\n", bi->bi_sector, sh->sector);
  
  
         spin_lock(&sh->lock);
         spin_lock_irq(&conf->device_lock);
-       bh->b_reqnext = NULL;
-       if (rw == READ)
-               bhp = &sh->bh_read[dd_idx];
+       if (forwrite)
+               bip = &sh->dev[dd_idx].towrite;
         else
-               bhp = &sh->bh_write[dd_idx];
-       while (*bhp) {
-               printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
-               bhp = & (*bhp)->b_reqnext;
-       }
-       *bhp = bh;
+               bip = &sh->dev[dd_idx].toread;
+       while (*bip && (*bip)->bi_sector < bi->bi_sector)
+               bip = & (*bip)->bi_next;
+/* FIXME do I need to worry about overlapping bion */
+       if (*bip && bi->bi_next && (*bip) != bi->bi_next)
+               BUG();
+       if (*bip)
+               bi->bi_next = *bip;
+       *bip = bi;
+       bi->bi_phys_segments ++;
         spin_unlock_irq(&conf->device_lock);
         spin_unlock(&sh->lock);
  
-       PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
-}
-
-
+       if (forwrite) {
+               /* check if page is coverred */
+               sector_t sector = sh->dev[dd_idx].sector;
+               for (bi=sh->dev[dd_idx].towrite;
+                    sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
+                            bi && bi->bi_sector <= sector;
+                    bi = bi->bi_next) {
+                       if (bi->bi_sector + (bi->bi_size>>9) >= sector)
+                               sector = bi->bi_sector + (bi->bi_size>>9);
+               }
+               if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+                       set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+       }
  
+       PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
+}
  
  
  /*
@@ -822,13 +850,14 @@ static void handle_stripe(struct stripe_head *sh)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int disks = conf->raid_disks;
-       struct buffer_head *return_ok= NULL, *return_fail = NULL;
+       struct bio *return_bi= NULL;
+       struct bio *bi;
         int action[MD_SB_DISKS];
         int i;
         int syncing;
         int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
         int failed_num=0;
-       struct buffer_head *bh;
+       struct r5dev *dev;
  
         PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
         memset(action, 0, sizeof(action));
@@ -841,36 +870,38 @@ static void handle_stripe(struct stripe_head *sh)
         /* Now to look around and see what can be done */
  
         for (i=disks; i--; ) {
-               bh = sh->bh_cache[i];
-               PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
+               dev = &sh->dev[i];
+               PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, 
+                      dev->flags, dev->toread, dev->towrite, dev->written);
                 /* maybe we can reply to a read */
-               if (buffer_uptodate(bh) && sh->bh_read[i]) {
-                       struct buffer_head *rbh, *rbh2;
+               if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
+                       struct bio *rbi, *rbi2;
                         PRINTK("Return read for disc %d\n", i);
                         spin_lock_irq(&conf->device_lock);
-                       rbh = sh->bh_read[i];
-                       sh->bh_read[i] = NULL;
+                       rbi = dev->toread;
+                       dev->toread = NULL;
                         spin_unlock_irq(&conf->device_lock);
-                       while (rbh) {
-                               char *bdata;
-                               bdata = bh_kmap(rbh);
-                               memcpy(bdata, bh->b_data, bh->b_size);
-                               bh_kunmap(rbh);
-                               rbh2 = rbh->b_reqnext;
-                               rbh->b_reqnext = return_ok;
-                               return_ok = rbh;
-                               rbh = rbh2;
+                       while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                               copy_data(0, rbi, dev->page, dev->sector);
+                               rbi2 = rbi->bi_next;
+                               spin_lock_irq(&conf->device_lock);
+                               if (--rbi->bi_phys_segments == 0) {
+                                       rbi->bi_next = return_bi;
+                                       return_bi = rbi;
+                               }
+                               spin_unlock_irq(&conf->device_lock);
+                               rbi = rbi2;
                         }
                 }
  
                 /* now count some things */
-               if (buffer_locked(bh)) locked++;
-               if (buffer_uptodate(bh)) uptodate++;
+               if (test_bit(R5_LOCKED, &dev->flags)) locked++;
+               if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
  
                 
-               if (sh->bh_read[i]) to_read++;
-               if (sh->bh_write[i]) to_write++;
-               if (sh->bh_written[i]) written++;
+               if (dev->toread) to_read++;
+               if (dev->towrite) to_write++;
+               if (dev->written) written++;
                 if (!conf->disks[i].operational) {
                         failed++;
                         failed_num = i;
@@ -882,29 +913,42 @@ static void handle_stripe(struct stripe_head *sh)
          * need to be failed
          */
         if (failed > 1 && to_read+to_write) {
+               spin_lock_irq(&conf->device_lock);
                 for (i=disks; i--; ) {
                         /* fail all writes first */
-                       if (sh->bh_write[i]) to_write--;
-                       while ((bh = sh->bh_write[i])) {
-                               sh->bh_write[i] = bh->b_reqnext;
-                               bh->b_reqnext = return_fail;
-                               return_fail = bh;
+                       bi = sh->dev[i].towrite;
+                       sh->dev[i].towrite = NULL;
+                       if (bi) to_write--;
+
+                       while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+                               struct bio *nextbi = bi->bi_next;
+                               clear_bit(BIO_UPTODATE, &bi->bi_flags);
+                               if (--bi->bi_phys_segments == 0) {
+                                       bi->bi_next = return_bi;
+                                       return_bi = bi;
+                               }
+                               bi = nextbi;
                         }
                         /* fail any reads if this device is non-operational */
                         if (!conf->disks[i].operational) {
-                               spin_lock_irq(&conf->device_lock);
-                               if (sh->bh_read[i]) to_read--;
-                               while ((bh = sh->bh_read[i])) {
-                                       sh->bh_read[i] = bh->b_reqnext;
-                                       bh->b_reqnext = return_fail;
-                                       return_fail = bh;
+                               bi = sh->dev[i].toread;
+                               sh->dev[i].toread = NULL;
+                               if (bi) to_read--;
+                               while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+                                       struct bio *nextbi = bi->bi_next;
+                                       clear_bit(BIO_UPTODATE, &bi->bi_flags);
+                                       if (--bi->bi_phys_segments == 0) {
+                                               bi->bi_next = return_bi;
+                                               return_bi = bi;
+                                       }
+                                       bi = nextbi;
                                 }
-                               spin_unlock_irq(&conf->device_lock);
                         }
                 }
+               spin_unlock_irq(&conf->device_lock);
         }
         if (failed > 1 && syncing) {
-               md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
+               md_done_sync(conf->mddev, STRIPE_SECTORS,0);
                 clear_bit(STRIPE_SYNCING, &sh->state);
                 syncing = 0;
         }
@@ -912,40 +956,43 @@ static void handle_stripe(struct stripe_head *sh)
         /* might be able to return some write requests if the parity block
          * is safe, or on a failed drive
          */
-       bh = sh->bh_cache[sh->pd_idx];
+       dev = &sh->dev[sh->pd_idx];
         if ( written &&
-            ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
+            ( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) &&
+               test_bit(R5_UPTODATE, &dev->flags))
                || (failed == 1 && failed_num == sh->pd_idx))
             ) {
-           /* any written block on a uptodate or failed drive can be returned */
+           /* any written block on an uptodate or failed drive can be returned */
             for (i=disks; i--; )
-               if (sh->bh_written[i]) {
-                   bh = sh->bh_cache[i];
+               if (sh->dev[i].written) {
+                   dev = &sh->dev[i];
                     if (!conf->disks[sh->pd_idx].operational ||
-                       (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
+                       (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) {
                         /* maybe we can return some write requests */
-                       struct buffer_head *wbh, *wbh2;
-                       PRINTK("Return write for disc %d\n", i);
-                       wbh = sh->bh_written[i];
-                       sh->bh_written[i] = NULL;
-                       while (wbh) {
-                           wbh2 = wbh->b_reqnext;
-                           wbh->b_reqnext = return_ok;
-                           return_ok = wbh;
-                           wbh = wbh2;
-                       }
+                           struct bio *wbi, *wbi2;
+                           PRINTK("Return write for disc %d\n", i);
+                           wbi = dev->written;
+                           dev->written = NULL;
+                           while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                                   wbi2 = wbi->bi_next;
+                                   if (--wbi->bi_phys_segments == 0) {
+                                           wbi->bi_next = return_bi;
+                                           return_bi = wbi;
+                                   }
+                                   wbi = wbi2;
+                           }
                     }
                 }
         }
-               
+
         /* Now we might consider reading some blocks, either to check/generate
          * parity, or to satisfy requests
          */
         if (to_read || (syncing && (uptodate+failed < disks))) {
                 for (i=disks; i--;) {
-                       bh = sh->bh_cache[i];
-                       if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
-                           (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
+                       dev = &sh->dev[i];
+                       if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+                           (dev->toread || syncing || (failed && sh->dev[failed_num].toread))) {
                                 /* we would like to get this block, possibly
                                  * by computing it, but we might not be able to
                                  */
@@ -954,21 +1001,21 @@ static void handle_stripe(struct stripe_head *sh)
                                         compute_block(sh, i);
                                         uptodate++;
                                 } else if (conf->disks[i].operational) {
-                                       set_buffer_locked(bh);
+                                       set_bit(R5_LOCKED, &dev->flags);
                                         action[i] = READ+1;
+#if 0
                                         /* if I am just reading this block and we don't have
                                            a failed drive, or any pending writes then sidestep the cache */
-                                       if (sh->bh_page[i]) BUG();
                                         if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
                                             ! syncing && !failed && !to_write) {
-                                               sh->bh_page[i] = sh->bh_cache[i]->b_page;
                                                 sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
                                                 sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
                                         }
+#endif
                                         locked++;
                                         PRINTK("Reading block %d (sync=%d)\n", i, syncing);
                                         if (syncing)
-                                               md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+                                               md_sync_acct(conf->disks[i].dev, STRIPE_SECTORS);
                                 }
                         }
                 }
@@ -980,10 +1027,14 @@ static void handle_stripe(struct stripe_head *sh)
                 int rmw=0, rcw=0;
                 for (i=disks ; i--;) {
                         /* would I have to read this buffer for read_modify_write */
-                       bh = sh->bh_cache[i];
-                       if ((sh->bh_write[i] || i == sh->pd_idx) &&
-                           (!buffer_locked(bh) || sh->bh_page[i]) &&
-                           !buffer_uptodate(bh)) {
+                       dev = &sh->dev[i];
+                       if ((dev->towrite || i == sh->pd_idx) &&
+                           (!test_bit(R5_LOCKED, &dev->flags) 
+#if 0
+|| sh->bh_page[i]!=bh->b_page
+#endif
+                                   ) &&
+                           !test_bit(R5_UPTODATE, &dev->flags)) {
                                 if (conf->disks[i].operational 
  /*                                 && !(conf->resync_parity && i == sh->pd_idx) */
                                         )
@@ -991,9 +1042,13 @@ static void handle_stripe(struct stripe_head *sh)
                                 else rmw += 2*disks;  /* cannot read it */
                         }
                         /* Would I have to read this buffer for reconstruct_write */
-                       if (!sh->bh_write[i] && i != sh->pd_idx &&
-                           (!buffer_locked(bh) || sh->bh_page[i]) &&
-                           !buffer_uptodate(bh)) {
+                       if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+                           (!test_bit(R5_LOCKED, &dev->flags) 
+#if 0
+|| sh->bh_page[i] != bh->b_page
+#endif
+                                   ) &&
+                           !test_bit(R5_UPTODATE, &dev->flags)) {
                                 if (conf->disks[i].operational) rcw++;
                                 else rcw += 2*disks;
                         }
@@ -1003,14 +1058,14 @@ static void handle_stripe(struct stripe_head *sh)
                 if (rmw < rcw && rmw > 0)
                         /* prefer read-modify-write, but need to get some data */
                         for (i=disks; i--;) {
-                               bh = sh->bh_cache[i];
-                               if ((sh->bh_write[i] || i == sh->pd_idx) &&
-                                   !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                               dev = &sh->dev[i];
+                               if ((dev->towrite || i == sh->pd_idx) &&
+                                   !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
                                     conf->disks[i].operational) {
                                         if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                         {
                                                 PRINTK("Read_old block %d for r-m-w\n", i);
-                                               set_buffer_locked(bh);
+                                               set_bit(R5_LOCKED, &dev->flags);
                                                 action[i] = READ+1;
                                                 locked++;
                                         } else {
@@ -1022,14 +1077,14 @@ static void handle_stripe(struct stripe_head *sh)
                 if (rcw <= rmw && rcw > 0)
                         /* want reconstruct write, but need to get some data */
                         for (i=disks; i--;) {
-                               bh = sh->bh_cache[i];
-                               if (!sh->bh_write[i]  && i != sh->pd_idx &&
-                                   !buffer_locked(bh) && !buffer_uptodate(bh) &&
+                               dev = &sh->dev[i];
+                               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+                                   !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
                                     conf->disks[i].operational) {
                                         if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                         {
                                                 PRINTK("Read_old block %d for Reconstruct\n", i);
-                                               set_buffer_locked(bh);
+                                               set_bit(R5_LOCKED, &dev->flags);
                                                 action[i] = READ+1;
                                                 locked++;
                                         } else {
@@ -1044,7 +1099,7 @@ static void handle_stripe(struct stripe_head *sh)
                         compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
                         /* now every locked buffer is ready to be written */
                         for (i=disks; i--;)
-                               if (buffer_locked(sh->bh_cache[i])) {
+                               if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
                                         PRINTK("Writing block %d\n", i);
                                         locked++;
                                         action[i] = WRITE+1;
@@ -1068,13 +1123,14 @@ static void handle_stripe(struct stripe_head *sh)
             !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
                 set_bit(STRIPE_HANDLE, &sh->state);
                 if (failed == 0) {
+                       char *pagea;
                         if (uptodate != disks)
                                 BUG();
                         compute_parity(sh, CHECK_PARITY);
                         uptodate--;
-                       bh = sh->bh_cache[sh->pd_idx];
-                       if ((*(u32*)bh->b_data) == 0 &&
-                           !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
+                       pagea = page_address(sh->dev[sh->pd_idx].page);
+                       if ((*(u32*)pagea) == 0 &&
+                           !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
                                 /* parity is correct (on disc, not in buffer any more) */
                                 set_bit(STRIPE_INSYNC, &sh->state);
                         }
@@ -1084,7 +1140,7 @@ static void handle_stripe(struct stripe_head *sh)
                         if (failed==0)
                                 failed_num = sh->pd_idx;
                         /* should be able to compute the missing block and write it to spare */
-                       if (!buffer_uptodate(sh->bh_cache[failed_num])) {
+                       if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
                                 if (uptodate+1 != disks)
                                         BUG();
                                 compute_block(sh, failed_num);
@@ -1092,60 +1148,62 @@ static void handle_stripe(struct stripe_head *sh)
                         }
                         if (uptodate != disks)
                                 BUG();
-                       bh = sh->bh_cache[failed_num];
-                       set_buffer_locked(bh);
+                       dev = &sh->dev[failed_num];
+                       set_bit(R5_LOCKED, &dev->flags);
                         action[failed_num] = WRITE+1;
                         locked++;
                         set_bit(STRIPE_INSYNC, &sh->state);
                         if (conf->disks[failed_num].operational)
-                               md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
+                               md_sync_acct(conf->disks[failed_num].dev, STRIPE_SECTORS);
                         else if ((spare=conf->spare))
-                               md_sync_acct(spare->dev, bh->b_size>>9);
+                               md_sync_acct(spare->dev, STRIPE_SECTORS);
  
                 }
         }
         if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
-               md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
+               md_done_sync(conf->mddev, STRIPE_SECTORS,1);
                 clear_bit(STRIPE_SYNCING, &sh->state);
         }
         
-       
         spin_unlock(&sh->lock);
  
-       while ((bh=return_ok)) {
-               return_ok = bh->b_reqnext;
-               bh->b_reqnext = NULL;
-               bh->b_end_io(bh, 1);
-       }
-       while ((bh=return_fail)) {
-               return_fail = bh->b_reqnext;
-               bh->b_reqnext = NULL;
-               bh->b_end_io(bh, 0);
+       while ((bi=return_bi)) {
+               return_bi = bi->bi_next;
+               bi->bi_next = NULL;
+               bi->bi_end_io(bi);
         }
         for (i=disks; i-- ;) 
                 if (action[i]) {
-                       struct buffer_head *bh = sh->bh_cache[i];
+                       struct bio *bi = &sh->dev[i].req;
                         struct disk_info *spare = conf->spare;
                         int skip = 0;
                         if (action[i] == READ+1)
-                               bh->b_end_io = raid5_end_read_request;
+                               bi->bi_end_io = raid5_end_read_request;
                         else
-                               bh->b_end_io = raid5_end_write_request;
+                               bi->bi_end_io = raid5_end_write_request;
                         if (conf->disks[i].operational)
-                               bh->b_dev = conf->disks[i].dev;
+                               bi->bi_bdev = conf->disks[i].bdev;
                         else if (spare && action[i] == WRITE+1)
-                               bh->b_dev = spare->dev;
+                               bi->bi_bdev = spare->bdev;
                         else skip=1;
-                       /* FIXME - later we will need bdev here */
                         if (!skip) {
                                 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
                                 atomic_inc(&sh->count);
-                               bh->b_rdev = bh->b_dev;
-                               bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
-                               generic_make_request(action[i]-1, bh);
+                               bi->bi_sector = sh->sector;
+                               if (action[i] == READ+1) 
+                                       bi->bi_rw = 0;
+                               else
+                                       bi->bi_rw = 1;
+                               bi->bi_flags = 0;
+                               bi->bi_vcnt = 1;        
+                               bi->bi_idx = 0;
+                               bi->bi_io_vec = &sh->dev[i].vec;
+                               bi->bi_size = STRIPE_SIZE;
+                               bi->bi_next = NULL;
+                               generic_make_request(bi);
                         } else {
                                 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
-                               clear_buffer_locked(bh);
+                               clear_bit(R5_LOCKED, &dev->flags);
                                 set_bit(STRIPE_HANDLE, &sh->state);
                         }
                 }
@@ -1192,13 +1250,14 @@ static inline void raid5_plug_device(raid5_conf_t *conf)
         spin_unlock_irq(&conf->device_lock);
  }
  
-static int make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
+static int make_request (mddev_t *mddev, int rw, struct bio * bi)
  {
         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
         const unsigned int raid_disks = conf->raid_disks;
         const unsigned int data_disks = raid_disks - 1;
         unsigned int dd_idx, pd_idx;
-       unsigned long new_sector;
+       sector_t new_sector;
+       sector_t logical_sector, last_sector;
         int read_ahead = 0;
  
         struct stripe_head *sh;
@@ -1208,25 +1267,39 @@ static int make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
                 read_ahead=1;
         }
  
-       new_sector = raid5_compute_sector(bh->b_rsector,
-                       raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+       logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
+       last_sector = bi->bi_sector + (bi->bi_size>>9);
  
-       PRINTK("raid5: make_request, sector %lu\n", new_sector);
-       sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
-       if (sh) {
-               sh->pd_idx = pd_idx;
+       bi->bi_next = NULL;
+       set_bit(BIO_UPTODATE, &bi->bi_flags); /* will be cleared if error detected */
+       bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
+       for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+               
+               new_sector = raid5_compute_sector(logical_sector,
+                                                 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
  
-               add_stripe_bh(sh, bh, dd_idx, rw);
+               PRINTK("raid5: make_request, sector %ul logical %ul\n", 
+                      new_sector, logical_sector);
  
-               raid5_plug_device(conf);
-               handle_stripe(sh);
-               release_stripe(sh);
-       } else
-               bh->b_end_io(bh, buffer_uptodate(bh));
+               sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead);
+               if (sh) {
+
+                       add_stripe_bio(sh, bi, dd_idx, rw);
+
+                       raid5_plug_device(conf);
+                       handle_stripe(sh);
+                       release_stripe(sh);
+               }
+       }
+       spin_lock_irq(&conf->device_lock);
+       if (--bi->bi_phys_segments == 0) 
+               bi->bi_end_io(bi);
+       spin_unlock_irq(&conf->device_lock);
         return 0;
  }
  
-static int sync_request (mddev_t *mddev, unsigned long sector_nr)
+/* FIXME go_faster isn't used */
+static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
  {
         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
         struct stripe_head *sh;
@@ -1237,25 +1310,19 @@ static int sync_request (mddev_t *mddev, unsigned long sector_nr)
         unsigned long first_sector;
         int raid_disks = conf->raid_disks;
         int data_disks = raid_disks-1;
-       int redone = 0;
-       int bufsize;
  
-       sh = get_active_stripe(conf, sector_nr, 0, 0);
-       bufsize = sh->size;
-       redone = sector_nr - sh->sector;
         first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
                 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-       sh->pd_idx = pd_idx;
+       sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
         spin_lock(&sh->lock);   
         set_bit(STRIPE_SYNCING, &sh->state);
         clear_bit(STRIPE_INSYNC, &sh->state);
-       sh->sync_redone = redone;
         spin_unlock(&sh->lock);
  
         handle_stripe(sh);
         release_stripe(sh);
  
-       return (bufsize>>9)-redone;
+       return STRIPE_SECTORS;
  }
  
  /*
@@ -1376,7 +1443,6 @@ static int run (mddev_t *mddev)
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
-       conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
  
         conf->plugged = 0;
         conf->plug_tq.sync = 0;
@@ -1404,6 +1470,7 @@ static int run (mddev_t *mddev)
                         disk->number = desc->number;
                         disk->raid_disk = raid_disk;
                         disk->dev = rdev->dev;
+                       disk->bdev = rdev->bdev;
  
                         disk->operational = 0;
                         disk->write_only = 0;
@@ -1430,6 +1497,7 @@ static int run (mddev_t *mddev)
                         disk->number = desc->number;
                         disk->raid_disk = raid_disk;
                         disk->dev = rdev->dev;
+                       disk->bdev = rdev->bdev;
                         disk->operational = 1;
                         disk->used_slot = 1;
  
@@ -1442,6 +1510,7 @@ static int run (mddev_t *mddev)
                         disk->number = desc->number;
                         disk->raid_disk = raid_disk;
                         disk->dev = rdev->dev;
+                       disk->bdev = rdev->bdev;
  
                         disk->operational = 0;
                         disk->write_only = 0;
@@ -1461,6 +1530,7 @@ static int run (mddev_t *mddev)
                         disk->number = desc->number;
                         disk->raid_disk = raid_disk;
                         disk->dev = NODEV;
+                       disk->bdev = NULL;
  
                         disk->operational = 0;
                         disk->write_only = 0;
@@ -1518,9 +1588,9 @@ static int run (mddev_t *mddev)
  
         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                  conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
-       if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
+       if (grow_stripes(conf, conf->max_nr_stripes)) {
                 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
-               shrink_stripes(conf, conf->max_nr_stripes);
+               shrink_stripes(conf);
                 goto abort;
         } else
                 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
@@ -1623,7 +1693,7 @@ static int stop (mddev_t *mddev)
         if (conf->resync_thread)
                 md_unregister_thread(conf->resync_thread);
         md_unregister_thread(conf->thread);
-       shrink_stripes(conf, conf->max_nr_stripes);
+       shrink_stripes(conf);
         free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
         kfree(conf);
         mddev->private = NULL;
@@ -1636,12 +1706,11 @@ static void print_sh (struct stripe_head *sh)
  {
         int i;
  
-       printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
+       printk("sh %lu, pd_idx %d, state %ld.\n", sh->sector, sh->pd_idx, sh->state);
         printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
         printk("sh %lu, ", sh->sector);
-       for (i = 0; i < MD_SB_DISKS; i++) {
-               if (sh->bh_cache[i])
-                       printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
+       for (i = 0; i < sh->raid_conf->raid_disks; i++) {
+               printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags);
         }
         printk("\n");
  }
@@ -1948,6 +2017,7 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
                         goto abort;
                 }
                 rdisk->dev = NODEV;
+               rdisk->bdev = NULL;
                 rdisk->used_slot = 0;
  
                 break;
@@ -1965,6 +2035,8 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
                 adisk->number = added_desc->number;
                 adisk->raid_disk = added_desc->raid_disk;
                 adisk->dev = mk_kdev(added_desc->major,added_desc->minor);
+               /* it will be held open by rdev */
+               adisk->bdev = bdget(kdev_t_to_nr(adisk->dev));
  
                 adisk->operational = 0;
                 adisk->write_only = 0;
diff --git a/drivers/md/xor.c b/drivers/md/xor.c

index 9b570ecf98f58865c687798b6a7b81827339409f..8b07177a882eb54c2f6e00987bf4e42edefc79db 100644 (file)
--- a/drivers/md/xor.c
+++ b/drivers/md/xor.c
@@ -26,31 +26,30 @@
  static struct xor_block_template *active_template;
  
  void
-xor_block(unsigned int count, struct buffer_head **bh_ptr)
+xor_block(unsigned int count, unsigned int bytes, void **ptr)
  {
         unsigned long *p0, *p1, *p2, *p3, *p4;
-       unsigned long bytes = bh_ptr[0]->b_size;
  
-       p0 = (unsigned long *) bh_ptr[0]->b_data;
-       p1 = (unsigned long *) bh_ptr[1]->b_data;
+       p0 = (unsigned long *) ptr[0];
+       p1 = (unsigned long *) ptr[1];
         if (count == 2) {
                 active_template->do_2(bytes, p0, p1);
                 return;
         }
  
-       p2 = (unsigned long *) bh_ptr[2]->b_data;
+       p2 = (unsigned long *) ptr[2];
         if (count == 3) {
                 active_template->do_3(bytes, p0, p1, p2);
                 return;
         }
  
-       p3 = (unsigned long *) bh_ptr[3]->b_data;
+       p3 = (unsigned long *) ptr[3];
         if (count == 4) {
                 active_template->do_4(bytes, p0, p1, p2, p3);
                 return;
         }
  
-       p4 = (unsigned long *) bh_ptr[4]->b_data;
+       p4 = (unsigned long *) ptr[4];
         active_template->do_5(bytes, p0, p1, p2, p3, p4);
  }
  
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h

index d3485daeb0f177890de561d9c0db4a1b4602c931..5c25120581a79efd7fd428fd8a3fc5e8f54fe3c8 100644 (file)
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -7,21 +7,21 @@
  /*
   *
   * Each stripe contains one buffer per disc.  Each buffer can be in
- * one of a number of states determined by bh_state.  Changes between
+ * one of a number of states stored in "flags".  Changes between
   * these states happen *almost* exclusively under a per-stripe
- * spinlock.  Some very specific changes can happen in b_end_io, and
+ * spinlock.  Some very specific changes can happen in bi_end_io, and
   * these are not protected by the spin lock.
   *
- * The bh_state bits that are used to represent these states are:
- *   BH_Uptodate, BH_Lock
+ * The flag bits that are used to represent these states are:
+ *   R5_UPTODATE and R5_LOCKED
   *
- * State Empty == !Uptodate, !Lock
+ * State Empty == !UPTODATE, !LOCK
   *        We have no data, and there is no active request
- * State Want == !Uptodate, Lock
+ * State Want == !UPTODATE, LOCK
   *        A read request is being submitted for this block
- * State Dirty == Uptodate, Lock
+ * State Dirty == UPTODATE, LOCK
   *        Some new data is in this buffer, and it is being written out
- * State Clean == Uptodate, !Lock
+ * State Clean == UPTODATE, !LOCK
   *        We have valid data which is the same as on disc
   *
   * The possible state transitions are:
@@ -124,24 +124,29 @@
   * plus raid5d if it is handling it, plus one for each active request
   * on a cached buffer.
   */
+
  struct stripe_head {
         struct stripe_head      *hash_next, **hash_pprev; /* hash pointers */
         struct list_head        lru;                    /* inactive_list or handle_list */
         struct raid5_private_data       *raid_conf;
-       struct buffer_head      *bh_cache[MD_SB_DISKS]; /* buffered copy */
-       struct buffer_head      *bh_read[MD_SB_DISKS];  /* read request buffers of the MD device */
-       struct buffer_head      *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
-       struct buffer_head      *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
-       struct page             *bh_page[MD_SB_DISKS];  /* saved bh_cache[n]->b_page when reading around the cache */
-       unsigned long           sector;                 /* sector of this row */
-       int                     size;                   /* buffers size */
+       sector_t                sector;                 /* sector of this row */
         int                     pd_idx;                 /* parity disk index */
         unsigned long           state;                  /* state flags */
         atomic_t                count;                  /* nr of active thread/requests */
         spinlock_t              lock;
-       int                     sync_redone;
+       struct r5dev {
+               struct bio      req;
+               struct bio_vec  vec;
+               struct page     *page;
+               struct bio      *toread, *towrite, *written;
+               sector_t        sector;                 /* sector of this page */
+               unsigned long   flags;
+       } dev[1]; /* allocated with extra space depending of RAID geometry */
  };
-
+/* Flags */
+#define        R5_UPTODATE     0       /* page contains current data */
+#define        R5_LOCKED       1       /* IO has been submitted on "req" */
+#define        R5_OVERWRITE    2       /* towrite covers whole page */
  
  /*
   * Write method
@@ -187,6 +192,7 @@ struct stripe_head {
  
  struct disk_info {
         kdev_t  dev;
+       struct block_device *bdev;
         int     operational;
         int     number;
         int     raid_disk;
@@ -201,7 +207,6 @@ struct raid5_private_data {
         mdk_thread_t            *thread, *resync_thread;
         struct disk_info        disks[MD_SB_DISKS];
         struct disk_info        *spare;
-       int                     buffer_size;
         int                     chunk_size, level, algorithm;
         int                     raid_disks, working_disks, failed_disks;
         int                     resync_parity;
@@ -210,6 +215,9 @@ struct raid5_private_data {
         struct list_head        handle_list; /* stripes needing handling */
         struct list_head        delayed_list; /* stripes that have plugged requests */
         atomic_t                preread_active_stripes; /* stripes with scheduled io */
+
+       char                    cache_name[20];
+       kmem_cache_t            *slab_cache; /* for allocating stripes */
         /*
          * Free stripes pool
          */
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h

index 0e6950a7b9f9ea0aa911438880508b493fb00c56..f0d67cbdea40bcc7428036235ee8ce6e8ab79059 100644 (file)
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -5,7 +5,7 @@
  
  #define MAX_XOR_BLOCKS 5
  
-extern void xor_block(unsigned int count, struct buffer_head **bh_ptr);
+extern void xor_block(unsigned int count, unsigned int bytes, void **ptr);
  
  struct xor_block_template {
          struct xor_block_template *next;
author	Neil Brown <neilb@cse.unsw.edu.au>
	Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)
committer	Christoph Hellwig <hch@sb.bsdonline.org>
	Sat, 18 May 2002 05:03:33 +0000 (22:03 -0700)
drivers/md/raid5.c		patch \| blob \| history
drivers/md/xor.c		patch \| blob \| history
include/linux/raid/raid5.h		patch \| blob \| history
include/linux/raid/xor.h		patch \| blob \| history