git.hungrycats.org Git - linux/commitdiff
[PATCH] fix check_disk_change() deadlocks
author Alexander Viro <viro@math.psu.edu>
Sat, 10 Aug 2002 09:22:02 +0000 (02:22 -0700)
committer Linus Torvalds <torvalds@penguin.transmeta.com>
Sat, 10 Aug 2002 09:22:02 +0000 (02:22 -0700)
Small, but tricky: fix for check_disk_change() deadlocks.
What we do is
a) opening the block device is shifted from check_partition() to
   grok_partitions(); check_partition() now takes an already-opened
   struct block_device.
b) all callers of check_disk_change() fall in two groups -
   ones that are called only from some ->open() and ones
   that are _never_ called from ->open().  There is no
   middle ground.  We split the thing in two functions -
   check_disk_change() for the first class and full_check_disk_change()
   for the second.  The former (ones inside ->open()) doesn't
   touch partition tables but marks the bdev as "had been
   invalidated".  In the end of do_open() we check if
   bdev is marked and call wipe_partitions()/check_partition()
   if it is - at that point bdev is fully set up and ready.
c) ->bd_part_sem kludge is gone - we use ->bd_sem instead.
   That is, do_open() on a partition grabs ->bd_sem on entire
   disk and picks partition data while under it; do_open() on
   entire disk rereads the partition table if needed right before
   dropping ->bd_sem; BLKRRPART does a trylock on ->bd_sem and
   then checks ->bd_part_count - same logic as before, except
   that we use ->bd_sem instead of ->bd_part_sem.

That kills recursive open(), gives us the same exclusion rules as
we had and makes sure that actual IO (including rereading partition
tables) is done only when we are ready to do it.

It actually sounds a lot nastier than it is.  do_open() is one sick
puppy right now, but we have everything in one place and _out_ of drivers
(and 20-odd equally sick puppies are gone from them, along with about
the same number of races).

Now we are almost ready to clean it up for good - all that remains to
do before that is to get the rest of drivers (cciss, DAC960, i2o and
a couple of ancients - xd and acsi) using per-disk gendisks.  Then
most of that crap will disappear.

BTW, the only generic ioctl remaining in the drivers is HDIO_GETGEO -
a lot of foo_ioctl() functions start with if (cmd != HDIO_GETGEO) return -EINVAL; ;-)

fs/block_dev.c
fs/devfs/base.c
fs/partitions/check.c
fs/super.c
include/linux/blkdev.h
include/linux/fs.h

index 255861a3ce507696a0de935f63f8a022ab1dd125..e43631fac6df8b9afabfd2aa00e75e802f614ae5 100644 (file)
@@ -310,7 +310,7 @@ struct block_device *bdget(dev_t dev)
                        new_bdev->bd_contains = NULL;
                        new_bdev->bd_inode = inode;
                        new_bdev->bd_part_count = 0;
-                       sema_init(&new_bdev->bd_part_sem, 1);
+                       new_bdev->bd_invalidated = 0;
                        inode->i_mode = S_IFBLK;
                        inode->i_rdev = kdev;
                        inode->i_bdev = new_bdev;
@@ -518,24 +518,34 @@ int check_disk_change(struct block_device *bdev)
 
        disk = get_gendisk(dev);
        part = disk->part + minor(dev) - disk->first_minor;
-       if (disk && disk->minor_shift) {
-               if (!down_trylock(&bdev->bd_part_sem)) {
-                       if (!bdev->bd_part_count) {
-                               if (wipe_partitions(dev) == 0) {
-                                       if (bdops->revalidate)
-                                               bdops->revalidate(dev);
-                                       grok_partitions(dev, part[0].nr_sects);
-                               }
-                       }
-                       up(&bdev->bd_part_sem);
-               }
-       } else {
-               if (bdops->revalidate)
-                       bdops->revalidate(dev);
-       }
+       if (bdops->revalidate)
+               bdops->revalidate(dev);
+       if (disk && disk->minor_shift)
+               bdev->bd_invalidated = 1;
        return 1;
 }
 
+int full_check_disk_change(struct block_device *bdev)
+{
+       int res;
+       down(&bdev->bd_sem);
+       res = check_disk_change(bdev);
+       if (bdev->bd_invalidated && !bdev->bd_part_count) {
+               struct gendisk *g = get_gendisk(to_kdev_t(bdev->bd_dev));
+               struct hd_struct *part;
+               part = g->part + MINOR(bdev->bd_dev) - g->first_minor;
+               bdev->bd_invalidated = 0;
+               wipe_partitions(to_kdev_t(bdev->bd_dev));
+               if (part[0].nr_sects)
+                       check_partition(g, bdev);
+       }
+       up(&bdev->bd_sem);
+       return res;
+}
+
+/*
+ * Will die as soon as two remaining callers get converted.
+ */
 int __check_disk_change(dev_t dev)
 {
        struct block_device *bdev = bdget(dev);
@@ -544,11 +554,24 @@ int __check_disk_change(dev_t dev)
                return 0;
        if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
                return 0;
-       res = check_disk_change(bdev);
+       res = full_check_disk_change(bdev);
        blkdev_put(bdev, BDEV_RAW);
        return res;
 }
 
+static void bd_set_size(struct block_device *bdev, loff_t size)
+{
+       unsigned bsize = bdev_hardsect_size(bdev);
+       bdev->bd_inode->i_size = size;
+       while (bsize < PAGE_CACHE_SIZE) {
+               if (size & bsize)
+                       break;
+               bsize <<= 1;
+       }
+       bdev->bd_block_size = bsize;
+       bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
 static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
 {
        int ret = -ENXIO;
@@ -595,53 +618,64 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file *
                }
        }
        if (bdev->bd_contains == bdev) {
+               struct gendisk *g = get_gendisk(dev);
                if (bdev->bd_op->open) {
                        ret = bdev->bd_op->open(inode, file);
                        if (ret)
                                goto out2;
                }
-       } else {
-               down(&bdev->bd_contains->bd_part_sem);
-               bdev->bd_contains->bd_part_count++;
-               up(&bdev->bd_contains->bd_part_sem);
-       }
-       if (!bdev->bd_openers) {
-               struct blk_dev_struct *p = blk_dev + major(dev);
-               struct gendisk *g = get_gendisk(dev);
-               unsigned bsize = bdev_hardsect_size(bdev);
-
-               bdev->bd_offset = 0;
-               if (g) {
-                       struct hd_struct *p;
-                       p = g->part + minor(dev) - g->first_minor;
-                       bdev->bd_inode->i_size = (loff_t) p->nr_sects << 9;
-                       bdev->bd_offset = p->start_sect;
-               } else if (blk_size[major(dev)])
-                       bdev->bd_inode->i_size =
-                               (loff_t) blk_size[major(dev)][minor(dev)] << 10;
-               else
-                       bdev->bd_inode->i_size = 0;
-               while (bsize < PAGE_CACHE_SIZE) {
-                       if (bdev->bd_inode->i_size & bsize)
-                               break;
-                       bsize <<= 1;
-               }
-               bdev->bd_block_size = bsize;
-               bdev->bd_inode->i_blkbits = blksize_bits(bsize);
-               if (p->queue)
-                       bdev->bd_queue =  p->queue(dev);
-               else
-                       bdev->bd_queue = &p->request_queue;
-               if (bdev->bd_inode->i_data.backing_dev_info ==
-                                       &default_backing_dev_info) {
+               if (!bdev->bd_openers) {
+                       struct blk_dev_struct *p = blk_dev + major(dev);
                        struct backing_dev_info *bdi;
-
+                       sector_t sect = 0;
+
+                       bdev->bd_offset = 0;
+                       if (g) {
+                               struct hd_struct *p;
+                               p = g->part + minor(dev) - g->first_minor;
+                               sect = p->nr_sects;
+                       } else if (blk_size[major(dev)])
+                               sect = blk_size[major(dev)][minor(dev)] << 1;
+                       if (p->queue)
+                               bdev->bd_queue =  p->queue(dev);
+                       else
+                               bdev->bd_queue = &p->request_queue;
+                       bd_set_size(bdev, (loff_t)sect << 9);
                        bdi = blk_get_backing_dev_info(bdev);
                        if (bdi == NULL)
                                bdi = &default_backing_dev_info;
                        inode->i_data.backing_dev_info = bdi;
                        bdev->bd_inode->i_data.backing_dev_info = bdi;
                }
+               if (bdev->bd_invalidated && !bdev->bd_part_count) {
+                       struct hd_struct *part;
+                       part = g->part + minor(dev) - g->first_minor;
+                       bdev->bd_invalidated = 0;
+                       wipe_partitions(dev);
+                       if (part[0].nr_sects)
+                               check_partition(g, bdev);
+               }
+       } else {
+               down(&bdev->bd_contains->bd_sem);
+               bdev->bd_contains->bd_part_count++;
+               if (!bdev->bd_openers) {
+                       struct gendisk *g = get_gendisk(dev);
+                       struct hd_struct *p;
+                       p = g->part + minor(dev) - g->first_minor;
+                       inode->i_data.backing_dev_info =
+                          bdev->bd_inode->i_data.backing_dev_info =
+                          bdev->bd_contains->bd_inode->i_data.backing_dev_info;
+                       if (!p->nr_sects) {
+                               bdev->bd_contains->bd_part_count--;
+                               up(&bdev->bd_contains->bd_sem);
+                               ret = -ENXIO;
+                               goto out2;
+                       }
+                       bdev->bd_queue = bdev->bd_contains->bd_queue;
+                       bdev->bd_offset = p->start_sect;
+                       bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+               }
+               up(&bdev->bd_contains->bd_sem);
        }
        bdev->bd_openers++;
        up(&bdev->bd_sem);
@@ -725,9 +759,9 @@ int blkdev_put(struct block_device *bdev, int kind)
                if (bdev->bd_op->release)
                        ret = bdev->bd_op->release(bd_inode, NULL);
        } else {
-               down(&bdev->bd_contains->bd_part_sem);
+               down(&bdev->bd_contains->bd_sem);
                bdev->bd_contains->bd_part_count--;
-               up(&bdev->bd_contains->bd_part_sem);
+               up(&bdev->bd_contains->bd_sem);
        }
        if (!bdev->bd_openers) {
                if (bdev->bd_op->owner)
@@ -758,24 +792,25 @@ static int blkdev_reread_part(struct block_device *bdev)
        struct hd_struct *part;
        int res;
 
-       if (!disk)
+       if (!disk || !disk->minor_shift)
                return -EINVAL;
        part = disk->part + minor(dev) - disk->first_minor;
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
-       if (down_trylock(&bdev->bd_part_sem));
+       if (down_trylock(&bdev->bd_sem));
                return -EBUSY;
        if (bdev->bd_part_count) {
-               up(&bdev->bd_part_sem);
+               up(&bdev->bd_sem);
                return -EBUSY;
        }
        res = wipe_partitions(dev);
        if (!res) {
                if (bdev->bd_op->revalidate)
                        bdev->bd_op->revalidate(dev);
-               grok_partitions(dev, part[0].nr_sects);
+               if (part[0].nr_sects)
+                       check_partition(disk, bdev);
        }
-       up(&bdev->bd_part_sem);
+       up(&bdev->bd_sem);
        return res;
 }
 
index 172ef0e9113bf40dc96a62e7b7582b52b3c103f4..98492594db7a7de55cfe810dd9dd5d00f5b823ef 100644 (file)
@@ -2390,7 +2390,7 @@ static int check_disc_changed (struct devfs_entry *de)
     /*  Ugly hack to disable messages about unable to read partition table  */
     tmp = warn_no_part;
     warn_no_part = 0;
-    retval = check_disk_change(bdev);
+    retval = full_check_disk_change(bdev);
     warn_no_part = tmp;
 out:
     devfs_put_ops (de);
index 19ad765de4b3a8c3dabb957d5cdded0d09f4e438..862d80ba5513df172305fdf5c4083513961f2044 100644 (file)
@@ -289,10 +289,13 @@ void driverfs_remove_partitions(struct gendisk *hd, int minor)
        return;
 }
 
-static void check_partition(struct gendisk *hd, kdev_t dev)
+/*
+ *     DON'T EXPORT
+ */
+void check_partition(struct gendisk *hd, struct block_device *bdev)
 {
        devfs_handle_t de = NULL;
-       struct block_device *bdev;
+       kdev_t dev = to_kdev_t(bdev->bd_dev);
        char buf[64];
        struct parsed_partitions *state;
        int i;
@@ -314,9 +317,6 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
                if (n - COMPAQ_SMART2_MAJOR <= 7 || n - COMPAQ_CISS_MAJOR <= 7)
                        sprintf(state->name, "p");
        }
-       bdev = bdget(kdev_t_to_nr(dev));
-       if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW))
-               goto out;
        state->limit = 1<<hd->minor_shift;
        for (i = 0; check_part[i]; i++) {
                int res, j;
@@ -328,7 +328,7 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
                if (res < 0) {
                        if (warn_no_part)
                                printk(" unable to read partition table\n");
-                       goto setup_devfs;
+                       goto out;
                } 
                p = hd->part + minor(dev) - hd->first_minor;
                for (j = 1; j < state->limit; j++) {
@@ -340,12 +340,10 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
                        md_autodetect_dev(mk_kdev(major(dev),minor(dev)+j));
 #endif
                }
-               goto setup_devfs;
+               goto out;
        }
 
        printk(" unknown partition table\n");
-setup_devfs:
-       blkdev_put(bdev, BDEV_RAW);
 out:
        driverfs_create_partitions(hd, minor(dev));
        devfs_register_partitions (hd, minor(dev), 0);
@@ -463,34 +461,29 @@ void register_disk(struct gendisk *gdev, kdev_t dev, unsigned minors,
 
 void grok_partitions(kdev_t dev, long size)
 {
-       int minors, first_minor, end_minor;
+       struct block_device *bdev;
        struct gendisk *g = get_gendisk(dev);
        struct hd_struct *p;
 
        if (!g)
                return;
 
-       minors = 1 << g->minor_shift;
-       first_minor = minor(dev);
-       if (first_minor & (minors-1)) {
-               printk("grok_partitions: bad device 0x%02x:%02x\n",
-                      major(dev), first_minor);
-               first_minor &= ~(minors-1);
-       }
-       end_minor = first_minor + minors;
-       p = g->part + first_minor - g->first_minor;
+       p = g->part + minor(dev) - g->first_minor;
        p[0].nr_sects = size;
 
        /* No minors to use for partitions */
-       if (minors == 1)
+       if (!g->minor_shift)
                return;
 
        /* No such device (e.g., media were just removed) */
        if (!size)
                return;
 
-       check_partition(g, mk_kdev(g->major, first_minor));
+       bdev = bdget(kdev_t_to_nr(dev));
+       if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
+               return;
+       check_partition(g, bdev);
+       blkdev_put(bdev, BDEV_RAW);
 }
 
 unsigned char *read_dev_sector(struct block_device *bdev, unsigned long n, Sector *p)
index e5232d531047717f2de2c48ba45d889e1aa423db..acae6165e572d2bd8f73cfd179f0ef88bb526dcf 100644 (file)
@@ -502,7 +502,6 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
        devfs_put_ops (de);   /*  Decrement module use count now we're safe  */
        if (error)
                goto out;
-       check_disk_change(bdev);
        error = -EACCES;
        if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
                goto out1;
index f83c52f82ab04555d04a258802e4ea63ac74518e..d0a89877ad940da4fac8e27c25639999dcbffc07 100644 (file)
@@ -279,6 +279,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
 extern void grok_partitions(kdev_t dev, long size);
 extern int wipe_partitions(kdev_t dev);
 extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
+extern void check_partition(struct gendisk *disk, struct block_device *bdev);
 extern void generic_make_request(struct bio *bio);
 extern inline request_queue_t *bdev_get_queue(struct block_device *bdev);
 extern void blk_put_request(struct request *);
index 9ad9cdf3e6681778fce4c28e5e2adf2ece4d5616..192107379fd9a591e3acc565b2f6e7f29b33faf8 100644 (file)
@@ -356,8 +356,8 @@ struct block_device {
        struct block_device *   bd_contains;
        unsigned                bd_block_size;
        unsigned long           bd_offset;
-       struct semaphore        bd_part_sem;
        unsigned                bd_part_count;
+       int                     bd_invalidated;
 };
 
 struct inode {
@@ -1132,6 +1132,7 @@ extern int fs_may_remount_ro(struct super_block *);
 #define bio_data_dir(bio)      ((bio)->bi_rw & 1)
 
 extern int check_disk_change(struct block_device *);
+extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);