]> git.hungrycats.org Git - linux/commitdiff
[PATCH] readv/writev speedup
author Andrew Morton <akpm@digeo.com>
Fri, 13 Sep 2002 12:57:02 +0000 (05:57 -0700)
committer Linus Torvalds <torvalds@home.transmeta.com>
Fri, 13 Sep 2002 12:57:02 +0000 (05:57 -0700)
This is Janet Morgan's patch which converts the readv/writev code
to submit all segments for IO before waiting on them, rather than
submitting each segment separately.

This is a critical performance fix for O_DIRECT reads and writes.
Prior to this change, O_DIRECT vectored IO was forced to wait for
completion against each segment of the iovec rather than submitting all
segments and waiting on the lot.  ie: for ten segments, this code will
be ten times faster.

There will also be moderate improvements for buffered IO - smaller code
paths, plus writev() only takes i_sem once.

The patch ended up quite large unfortunately - turned out that the only
sane way to implement this without duplicating significant amounts of
code (the generic_file_write() bounds checking, all the O_DIRECT
handling, etc) was to redo generic_file_read() and generic_file_write()
to take an iovec/nr_segs pair rather than `buf, count'.

New exported functions generic_file_readv() and generic_file_writev()
have been added:

ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
                          unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
                          unsigned long nr_segs, loff_t * ppos);

If a driver does not use these in their file_operations then they will
continue to use the old readv/writev code, which sits in a loop calling
fops->read() or fops->write().

ext2, ext3, JFS and the blockdev driver are currently using this
capability.

Some coding cleanups were made in fs/read_write.c.  Mainly:

- pass "READ" or "WRITE" around to indicate the direction of the
  operation, rather than the (confusing, inverted)
  VERIFY_READ/VERIFY_WRITE.

- Use the identifier `nr_segs' everywhere to indicate the iovec
  length rather than `count', which is often used to indicate the
  number of bytes in the syscall.  It was confusing the heck out of me.

- Some cleanups to the raw driver.

- Some additional generality in fs/direct_io.c: the core `struct dio'
  used to be a "populate-and-go" thing.  Janet has broken that up so
  you can initialise a struct dio once, then loop around feeding it
  more file segments, then wait on completion against everything.

- In a couple of places we needed to handle the situation where we
  knew, a-priori, that the user was going to get a short read or write.
  File size limit exceeded, read past i_size, etc.  We handled that by
  shortening the iovec in-place with iov_shorten().  Which is not
  particularly pretty, but neither were the alternatives.

14 files changed:
drivers/char/raw.c
fs/block_dev.c
fs/direct-io.c
fs/ext2/file.c
fs/ext2/inode.c
fs/ext3/file.c
fs/ext3/inode.c
fs/jfs/file.c
fs/jfs/inode.c
fs/read_write.c
include/linux/fs.h
include/linux/uio.h
kernel/ksyms.c
mm/filemap.c

index 1da088fc6626f3a9c399b43a77622fc050b0a20f..a2f05f72791d308d6c31debb0eadd6afc71948e0 100644 (file)
@@ -201,25 +201,29 @@ out:
 }
 
 static ssize_t
-rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
+rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
 {
        const int minor = minor(filp->f_dentry->d_inode->i_rdev);
        struct block_device *bdev = raw_devices[minor].binding;
        struct inode *inode = bdev->bd_inode;
+       size_t count = iov_length(iov, nr_segs); 
        ssize_t ret = 0;
 
-       if (size == 0)
-               goto out;
-       ret = -EINVAL;
-       if (size < 0)
-               goto out;
-       ret = -ENXIO;
-       if (*offp >= inode->i_size)
-               goto out;
+       if (count == 0)
+               goto out;       
+
+       if ((ssize_t)count < 0)
+               return -EINVAL; 
+
+       if (*offp >= inode->i_size) 
+               return -ENXIO;
+
+       if (count + *offp > inode->i_size) {
+               count = inode->i_size - *offp;
+               nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
+       }
+       ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
 
-       if (size + *offp > inode->i_size)
-               size = inode->i_size - *offp;
-       ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
        if (ret > 0)
                *offp += ret;
 out:
@@ -227,15 +231,31 @@ out:
 }
 
 static ssize_t
-raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
+raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
 {
-       return rw_raw_dev(READ, filp, buf, size, offp);
+       struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+       return rw_raw_dev(READ, filp, &local_iov, 1, offp);
 }
 
 static ssize_t
 raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
 {
-       return rw_raw_dev(WRITE, filp, (char *)buf, size, offp);
+       struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+       return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
+}
+
+static ssize_t 
+raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp) 
+{
+       return rw_raw_dev(READ, filp, iov, nr_segs, offp);
+}
+
+static ssize_t 
+raw_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp) 
+{
+       return rw_raw_dev(WRITE, filp, iov, nr_segs, offp);
 }
 
 static struct file_operations raw_fops = {
@@ -244,6 +264,8 @@ static struct file_operations raw_fops = {
        .open   =       raw_open,
        .release=       raw_release,
        .ioctl  =       raw_ioctl,
+       .readv  =       raw_readv,
+       .writev =       raw_writev,
        .owner  =       THIS_MODULE,
 };
 
index 7d8a089a9d0a04f54f9117b22c15184d8ae9b8e9..f5a3d314bcd411be4d8d67b1ca70052f34fcd710 100644 (file)
@@ -116,11 +116,11 @@ blkdev_get_blocks(struct inode *inode, sector_t iblock,
 }
 
 static int
-blkdev_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count)
+blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
+                       loff_t offset, unsigned long nr_segs)
 {
-       return generic_direct_IO(rw, inode, buf, offset,
-                               count, blkdev_get_blocks);
+       return generic_direct_IO(rw, inode, iov, offset,
+                               nr_segs, blkdev_get_blocks);
 }
 
 static int blkdev_writepage(struct page * page)
@@ -787,6 +787,14 @@ static int blkdev_reread_part(struct block_device *bdev)
        return res;
 }
 
+static ssize_t blkdev_file_write(struct file *file, const char *buf,
+                                  size_t count, loff_t *ppos)
+{
+       struct iovec local_iov = { .iov_base = (void *)buf, .iov_len = count };
+
+       return generic_file_write_nolock(file, &local_iov, 1, ppos);
+}
+
 static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
                        unsigned long arg)
 {
@@ -832,26 +840,28 @@ static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 }
 
 struct address_space_operations def_blk_aops = {
-       readpage: blkdev_readpage,
-       writepage: blkdev_writepage,
-       sync_page: block_sync_page,
-       prepare_write: blkdev_prepare_write,
-       commit_write: blkdev_commit_write,
-       writepages: generic_writepages,
-       vm_writeback: generic_vm_writeback,
-       direct_IO: blkdev_direct_IO,
+       .readpage       = blkdev_readpage,
+       .writepage      = blkdev_writepage,
+       .sync_page      = block_sync_page,
+       .prepare_write  = blkdev_prepare_write,
+       .commit_write   = blkdev_commit_write,
+       .writepages     = generic_writepages,
+       .vm_writeback   = generic_vm_writeback,
+       .direct_IO      = blkdev_direct_IO,
 };
 
 struct file_operations def_blk_fops = {
-       open:           blkdev_open,
-       release:        blkdev_close,
-       llseek:         block_llseek,
-       read:           generic_file_read,
-       write:          generic_file_write_nolock,
-       mmap:           generic_file_mmap,
-       fsync:          block_fsync,
-       ioctl:          blkdev_ioctl,
-       sendfile:       generic_file_sendfile,
+       .open           = blkdev_open,
+       .release        = blkdev_close,
+       .llseek         = block_llseek,
+       .read           = generic_file_read,
+       .write          = blkdev_file_write,
+       .mmap           = generic_file_mmap,
+       .fsync          = block_fsync,
+       .ioctl          = blkdev_ioctl,
+       .readv          = generic_file_readv,
+       .writev         = generic_file_writev,
+       .sendfile       = generic_file_sendfile,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
index 015881a7914784d3cfdbb8049dde69d5f5f1e00d..fa4e46719c1461fb6f019fcd9258779e4ffa675a 100644 (file)
@@ -75,7 +75,7 @@ struct dio {
  */
 static inline unsigned dio_pages_present(struct dio *dio)
 {
-       return dio->head - dio->tail;
+       return dio->tail - dio->head;
 }
 
 /*
@@ -265,6 +265,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 static int dio_await_completion(struct dio *dio)
 {
        int ret = 0;
+
+       if (dio->bio)
+               dio_bio_submit(dio);
+
        while (atomic_read(&dio->bio_count)) {
                struct bio *bio = dio_await_one(dio);
                int ret2;
@@ -523,29 +527,16 @@ out:
        return ret;
 }
 
-/*
- * The main direct-IO function.  This is a library function for use by
- * filesystem drivers.
- */
 int
-generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
-                       size_t count, get_blocks_t get_blocks)
+direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, 
+       loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
 {
        const unsigned blkbits = inode->i_blkbits;
-       const unsigned blocksize_mask = (1 << blkbits) - 1;
-       const unsigned long user_addr = (unsigned long)buf;
-       int ret;
-       int ret2;
+       unsigned long user_addr; 
+       int seg, ret2, ret = 0;
        struct dio dio;
-       size_t bytes;
+       size_t bytes, tot_bytes = 0;
 
-       /* Check the memory alignment.  Blocks cannot straddle pages */
-       if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /* BIO submission state */
        dio.bio = NULL;
        dio.bvec = NULL;
        dio.inode = inode;
@@ -553,31 +544,13 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
        dio.blkbits = blkbits;
        dio.block_in_file = offset >> blkbits;
        dio.blocks_available = 0;
-       dio.final_block_in_request = (offset + count) >> blkbits;
 
-       /* Index into the first page of the first block */
-       dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
        dio.boundary = 0;
        dio.reap_counter = 0;
        dio.get_blocks = get_blocks;
        dio.last_block_in_bio = -1;
        dio.next_block_in_bio = -1;
 
-       /* Page fetching state */
-       dio.curr_page = 0;
-       bytes = count;
-       dio.total_pages = 0;
-       if (user_addr & (PAGE_SIZE - 1)) {
-               dio.total_pages++;
-               bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-       }
-
-       dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-       dio.curr_user_address = user_addr;
-
-       /* Page queue */
-       dio.head = 0;
-       dio.tail = 0;
        dio.page_errors = 0;
 
        /* BIO completion state */
@@ -586,38 +559,75 @@ generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
        dio.bio_list = NULL;
        dio.waiter = NULL;
 
-       ret = do_direct_IO(&dio);
+       for (seg = 0; seg < nr_segs; seg++) {
+               user_addr = (unsigned long)iov[seg].iov_base;
+               bytes = iov[seg].iov_len;
+
+               /* Index into the first page of the first block */
+               dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+               dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+               /* Page fetching state */
+               dio.head = 0;
+               dio.tail = 0;
+               dio.curr_page = 0;
+
+               dio.total_pages = 0;
+               if (user_addr & (PAGE_SIZE-1)) {
+                       dio.total_pages++;
+                       bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+               }
+               dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+               dio.curr_user_address = user_addr;
+       
+               ret = do_direct_IO(&dio);
+
+               if (ret) {
+                       dio_cleanup(&dio);
+                       break;
+               }
+
+               tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
+                                       dio.block_in_file) << blkbits);
+
+       } /* end iovec loop */
 
-       if (dio.bio)
-               dio_bio_submit(&dio);
-       if (ret)
-               dio_cleanup(&dio);
        ret2 = dio_await_completion(&dio);
        if (ret == 0)
                ret = ret2;
        if (ret == 0)
                ret = dio.page_errors;
        if (ret == 0)
-               ret = count - ((dio.final_block_in_request -
-                               dio.block_in_file) << blkbits);
-out:
+               ret = tot_bytes; 
+
        return ret;
 }
 
-ssize_t
-generic_file_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count)
+/*
+ * This is a library function for use by filesystem drivers.
+ */
+int
+generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
+       loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
 {
+       int seg;
+       size_t size;
+       unsigned long addr;
        struct address_space *mapping = inode->i_mapping;
-       unsigned blocksize_mask;
-       ssize_t retval;
+       unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
+       ssize_t retval = -EINVAL;
 
-       blocksize_mask = (1 << inode->i_blkbits) - 1;
-       if ((offset & blocksize_mask) || (count & blocksize_mask)) {
-               retval = -EINVAL;
+       if (offset & blocksize_mask) {
                goto out;
        }
 
+       /* Check the memory alignment.  Blocks cannot straddle pages */
+       for (seg = 0; seg < nr_segs; seg++) {
+               addr = (unsigned long)iov[seg].iov_base;
+               size = iov[seg].iov_len;
+               if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+                       goto out;       
+       }
+
        if (mapping->nrpages) {
                retval = filemap_fdatawrite(mapping);
                if (retval == 0)
@@ -625,9 +635,21 @@ generic_file_direct_IO(int rw, struct inode *inode, char *buf,
                if (retval)
                        goto out;
        }
-       retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
+
+       retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
+out:
+       return retval;
+}
+
+ssize_t
+generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
+       loff_t offset, unsigned long nr_segs)
+{
+       struct address_space *mapping = inode->i_mapping;
+       ssize_t retval;
+
+       retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
        if (inode->i_mapping->nrpages)
                invalidate_inode_pages2(inode->i_mapping);
-out:
        return retval;
 }
index e401b86da37499fc2bb13700b8a3f38d7f32902e..aff333ae8e5a716dea774303135177a8c42bac2b 100644 (file)
@@ -46,6 +46,8 @@ struct file_operations ext2_file_operations = {
        .open           = generic_file_open,
        .release        = ext2_release_file,
        .fsync          = ext2_sync_file,
+       .readv          = generic_file_readv,
+       .writev         = generic_file_writev,
        .sendfile       = generic_file_sendfile,
 };
 
index d0c363f8062efc6e107f7c2ffe91b2e2b39e146b..78a1b6ace4944b7753d815fd3872af649d40c6b2 100644 (file)
@@ -619,11 +619,11 @@ ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
 }
 
 static int
-ext2_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count)
+ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
+                       loff_t offset, unsigned long nr_segs)
 {
-       return generic_direct_IO(rw, inode, buf,
-                               offset, count, ext2_get_blocks);
+       return generic_direct_IO(rw, inode, iov,
+                               offset, nr_segs, ext2_get_blocks);
 }
 
 static int
index 412cbb6334cd208b545dcfb60abc7ba383e5a14a..6ea4b8a091b877962f88230528499d2bb17ad792 100644 (file)
@@ -76,19 +76,21 @@ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 }
 
 struct file_operations ext3_file_operations = {
-       .llseek         = generic_file_llseek,  /* BKL held */
-       .read           = generic_file_read,    /* BKL not held.  Don't need */
-       .write          = ext3_file_write,      /* BKL not held.  Don't need */
-       .ioctl          = ext3_ioctl,           /* BKL held */
+       .llseek         = generic_file_llseek,
+       .read           = generic_file_read,
+       .write          = ext3_file_write,
+       .readv          = generic_file_readv,
+       .writev         = generic_file_writev,
+       .ioctl          = ext3_ioctl,
        .mmap           = generic_file_mmap,
-       .open           = ext3_open_file,               /* BKL not held.  Don't need */
-       .release        = ext3_release_file,    /* BKL not held.  Don't need */
-       .fsync          = ext3_sync_file,               /* BKL held */
-       .sendfile       = generic_file_sendfile,        /* BKL not held.  Don't need */
+       .open           = ext3_open_file,
+       .release        = ext3_release_file,
+       .fsync          = ext3_sync_file,
+       .sendfile       = generic_file_sendfile,
 };
 
 struct inode_operations ext3_file_inode_operations = {
-       .truncate       = ext3_truncate,                /* BKL held */
-       .setattr        = ext3_setattr,         /* BKL held */
+       .truncate       = ext3_truncate,
+       .setattr        = ext3_setattr,
 };
 
index 38e3decdae39a6b4b722e5d8e838c32f68fd93ab..681d0dc715b3fdbc062ddabaa245fc2e44077a4b 100644 (file)
@@ -1399,13 +1399,15 @@ static int ext3_releasepage(struct page *page, int wait)
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
  * crashes then stale disk data _may_ be exposed inside the file.
  */
-static int ext3_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count)
+static int ext3_direct_IO(int rw, struct inode *inode,
+                       const struct iovec *iov, loff_t offset,
+                       unsigned long nr_segs)
 {
        struct ext3_inode_info *ei = EXT3_I(inode);
        handle_t *handle = NULL;
        int ret;
        int orphan = 0;
+       size_t count = iov_length(iov, nr_segs);
 
        if (rw == WRITE) {
                loff_t final_size = offset + count;
@@ -1428,8 +1430,8 @@ static int ext3_direct_IO(int rw, struct inode *inode, char *buf,
                }
        }
 
-       ret = generic_direct_IO(rw, inode, buf, offset,
-                               count, ext3_direct_io_get_blocks);
+       ret = generic_direct_IO(rw, inode, iov, offset,
+                               nr_segs, ext3_direct_io_get_blocks);
 
 out_stop:
        if (handle) {
index c65adbffe13069e23cabf6151ed048e59bcedb88..6a69ca7d8acd9ce2c4d62b2b6e12c0c97ea385e0 100644 (file)
@@ -108,6 +108,8 @@ struct file_operations jfs_file_operations = {
        .write          = generic_file_write,
        .read           = generic_file_read,
        .mmap           = generic_file_mmap,
+       .readv          = generic_file_readv,
+       .writev         = generic_file_writev,
        .sendfile       = generic_file_sendfile,
        .fsync          = jfs_fsync,
 };
index 6af76fc84b140123fab1fe637bf2cd470c089d9a..65d1dff1f80dc4c200265e8cfd84a64be64a47a0 100644 (file)
@@ -309,11 +309,11 @@ static int jfs_bmap(struct address_space *mapping, long block)
        return generic_block_bmap(mapping, block, jfs_get_block);
 }
 
-static int jfs_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count)
+static int jfs_direct_IO(int rw, struct inode *inode, const struct iovec *iov, 
+                       loff_t offset, unsigned long nr_segs)
 {
-       return generic_direct_IO(rw, inode, buf,
-                               offset, count, jfs_get_blocks);
+       return generic_direct_IO(rw, inode, iov,
+                               offset, nr_segs, jfs_get_blocks);
 }
 
 struct address_space_operations jfs_aops = {
index 6a244f6127209d43035397d0dd465995776da8f7..306ead083cf0702282bfd13091f2161017da2337 100644 (file)
@@ -286,9 +286,29 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char *buf,
        return ret;
 }
 
+/*
+ * Reduce an iovec's length in-place.  Return the resulting number of segments
+ */
+unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
+{
+       unsigned long seg = 0;
+       size_t len = 0;
+
+       while (seg < nr_segs) {
+               seg++;
+               if (len + iov->iov_len >= to) {
+                       iov->iov_len = to - len;
+                       break;
+               }
+               len += iov->iov_len;
+               iov++;
+       }
+       return seg;
+}
+
 static ssize_t do_readv_writev(int type, struct file *file,
                               const struct iovec * vector,
-                              unsigned long count)
+                              unsigned long nr_segs)
 {
        typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *);
        typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
@@ -296,73 +316,86 @@ static ssize_t do_readv_writev(int type, struct file *file,
        size_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov=iovstack;
-       ssize_t ret, i;
+       ssize_t ret = -EINVAL;
+       int seg;
        io_fn_t fn;
        iov_fn_t fnv;
        struct inode *inode;
 
+       /*
+        * SuS says "The readv() function *may* fail if the iovcnt argument
+        * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+        * traditionally returned -EINVAL for zero segments, so...
+        */
+       if (nr_segs == 0)
+               goto out;
+
        /*
         * First get the "struct iovec" from user memory and
         * verify all the pointers
         */
-       ret = 0;
-       if (!count)
-               goto out_nofree;
-       ret = -EINVAL;
-       if (count > UIO_MAXIOV)
-               goto out_nofree;
+       if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
+               goto out;
        if (!file->f_op)
-               goto out_nofree;
-       if (count > UIO_FASTIOV) {
+               goto out;
+       if (nr_segs > UIO_FASTIOV) {
                ret = -ENOMEM;
-               iov = kmalloc(count*sizeof(struct iovec), GFP_KERNEL);
+               iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
                if (!iov)
-                       goto out_nofree;
+                       goto out;
        }
        ret = -EFAULT;
-       if (copy_from_user(iov, vector, count*sizeof(*vector)))
+       if (copy_from_user(iov, vector, nr_segs*sizeof(*vector)))
                goto out;
 
        /*
         * Single unix specification:
-        * We should -EINVAL if an element length is not >= 0 and fitting an ssize_t
-        * The total length is fitting an ssize_t
+        * We should -EINVAL if an element length is not >= 0 and fitting an
+        * ssize_t.  The total length is fitting an ssize_t
         *
         * Be careful here because iov_len is a size_t not an ssize_t
         */
-        
        tot_len = 0;
        ret = -EINVAL;
-       for (i = 0 ; i < count ; i++) {
+       for (seg = 0 ; seg < nr_segs; seg++) {
                ssize_t tmp = tot_len;
-               ssize_t len = (ssize_t)iov[i].iov_len;
+               ssize_t len = (ssize_t)iov[seg].iov_len;
                if (len < 0)    /* size_t not fitting an ssize_t .. */
                        goto out;
                tot_len += len;
                if (tot_len < tmp) /* maths overflow on the ssize_t */
                        goto out;
        }
+       if (tot_len == 0) {
+               ret = 0;
+               goto out;
+       }
 
        inode = file->f_dentry->d_inode;
        /* VERIFY_WRITE actually means a read, as we write to user space */
-       ret = locks_verify_area((type == VERIFY_WRITE
+       ret = locks_verify_area((type == READ 
                                 ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE),
                                inode, file, file->f_pos, tot_len);
-       if (ret) goto out;
+       if (ret)
+               goto out;
 
-       fnv = (type == VERIFY_WRITE ? file->f_op->readv : file->f_op->writev);
+       fnv = NULL;
+       if (type == READ) {
+               fn = file->f_op->read;
+               fnv = file->f_op->readv;
+       } else {
+               fn = (io_fn_t)file->f_op->write;
+               fnv = file->f_op->writev;
+       }
        if (fnv) {
-               ret = fnv(file, iov, count, &file->f_pos);
+               ret = fnv(file, iov, nr_segs, &file->f_pos);
                goto out;
        }
 
-       /* VERIFY_WRITE actually means a read, as we write to user space */
-       fn = (type == VERIFY_WRITE ? file->f_op->read :
-             (io_fn_t) file->f_op->write);
-
+       /* Do it by hand, with file-ops */
        ret = 0;
        vector = iov;
-       while (count > 0) {
+       while (nr_segs > 0) {
                void * base;
                size_t len;
                ssize_t nr;
@@ -370,7 +403,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
                base = vector->iov_base;
                len = vector->iov_len;
                vector++;
-               count--;
+               nr_segs--;
 
                nr = fn(file, base, len, &file->f_pos);
 
@@ -382,20 +415,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
                if (nr != len)
                        break;
        }
-
 out:
        if (iov != iovstack)
                kfree(iov);
-out_nofree:
-       /* VERIFY_WRITE actually means a read, as we write to user space */
-       if ((ret + (type == VERIFY_WRITE)) > 0)
+       if ((ret + (type == READ)) > 0)
                dnotify_parent(file->f_dentry,
-                       (type == VERIFY_WRITE) ? DN_MODIFY : DN_ACCESS);
+                               (type == READ) ? DN_MODIFY : DN_ACCESS);
        return ret;
 }
 
-asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector,
-                            unsigned long count)
+
+asmlinkage ssize_t
+sys_readv(unsigned long fd, const struct iovec *vector, unsigned long nr_segs)
 {
        struct file * file;
        ssize_t ret;
@@ -409,7 +440,7 @@ asmlinkage ssize_t sys_readv(unsigned long fd, const struct iovec * vector,
            (file->f_op->readv || file->f_op->read)) {
                ret = security_ops->file_permission (file, MAY_READ);
                if (!ret)
-                       ret = do_readv_writev(VERIFY_WRITE, file, vector, count);
+                       ret = do_readv_writev(READ, file, vector, nr_segs);
        }
        fput(file);
 
@@ -417,8 +448,8 @@ bad_file:
        return ret;
 }
 
-asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector,
-                             unsigned long count)
+asmlinkage ssize_t
+sys_writev(unsigned long fd, const struct iovec * vector, unsigned long nr_segs)
 {
        struct file * file;
        ssize_t ret;
@@ -432,7 +463,7 @@ asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector,
            (file->f_op->writev || file->f_op->write)) {
                ret = security_ops->file_permission (file, MAY_WRITE);
                if (!ret)
-                       ret = do_readv_writev(VERIFY_READ, file, vector, count);
+                       ret = do_readv_writev(WRITE, file, vector, nr_segs);
        }
        fput(file);
 
index d58cd8b88fa8984ce7cf0c4f2b480f8e69e7198d..622481a001151e2bc34b832454b286a9ecf310cf 100644 (file)
@@ -307,8 +307,7 @@ struct address_space_operations {
        int (*bmap)(struct address_space *, long);
        int (*invalidatepage) (struct page *, unsigned long);
        int (*releasepage) (struct page *, int);
-       int (*direct_IO)(int, struct inode *, char *buf,
-                               loff_t offset, size_t count);
+       int (*direct_IO)(int, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
 };
 
 struct backing_dev_info;
@@ -1245,14 +1244,18 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
 extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
-extern ssize_t generic_file_write_nolock(struct file *, const char *, size_t, loff_t *);
+ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
 extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
-ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
-                               loff_t offset, size_t count);
-int generic_direct_IO(int rw, struct inode *inode, char *buf,
-                       loff_t offset, size_t count, get_blocks_t *get_blocks);
-
+extern ssize_t generic_file_direct_IO(int rw, struct inode *inode, 
+       const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec 
+       *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
+extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
+       unsigned long nr_segs, loff_t *ppos);
+ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
+                       unsigned long nr_segs, loff_t *ppos);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
index beaafffd3cfb57cd8d88ec78f7c23391c2e3f4ba..ec098c8e67931389b8bc994384d95bb0658499dd 100644 (file)
@@ -34,4 +34,19 @@ struct iovec
                                 /* Beg pardon: BSD has 1024 --ANK */
 #endif
 
+/*
+ * Total number of bytes covered by an iovec
+ */
+static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
+{
+       unsigned long seg;
+       size_t ret = 0;
+
+       for (seg = 0; seg < nr_segs; seg++)
+               ret += iov[seg].iov_len;
+       return ret;
+}
+
+unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
+
 #endif
index 3b03394fe14d9a0760cfc534d3ef2802f747f7ff..d69272a391754aff7306692a61b79ebd61ea2e18 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/highuid.h>
 #include <linux/brlock.h>
 #include <linux/fs.h>
+#include <linux/uio.h>
 #include <linux/tty.h>
 #include <linux/in6.h>
 #include <linux/completion.h>
@@ -343,6 +344,9 @@ EXPORT_SYMBOL(register_disk);
 EXPORT_SYMBOL(read_dev_sector);
 EXPORT_SYMBOL(init_buffer);
 EXPORT_SYMBOL_GPL(generic_file_direct_IO);
+EXPORT_SYMBOL(generic_file_readv);
+EXPORT_SYMBOL(generic_file_writev);
+EXPORT_SYMBOL(iov_shorten);
 
 /* tty routines */
 EXPORT_SYMBOL(tty_hangup);
index f66ea3911500ac7288cf334048b49a62dfe4a3ac..ea1052accdf2a92f5b8a6e59e33e657b21084c3d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
+#include <linux/uio.h>
 #include <linux/iobuf.h>
 #include <linux/hash.h>
 #include <linux/writeback.h>
@@ -1121,14 +1122,18 @@ success:
  * This is the "read()" routine for all filesystems
  * that can use the page cache directly.
  */
-ssize_t
-generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
+static ssize_t
+__generic_file_read(struct file *filp, const struct iovec *iov,
+               unsigned long nr_segs, loff_t *ppos)
 {
        ssize_t retval;
+       unsigned long seg;
+       size_t count = iov_length(iov, nr_segs);
 
        if ((ssize_t) count < 0)
                return -EINVAL;
 
+       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t pos = *ppos, size;
                struct address_space *mapping;
@@ -1141,10 +1146,13 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
                        goto out; /* skip atime */
                size = inode->i_size;
                if (pos < size) {
-                       if (pos + count > size)
+                       if (pos + count > size) {
                                count = size - pos;
-                       retval = generic_file_direct_IO(READ, inode,
-                                                       buf, pos, count);
+                               nr_segs = iov_shorten((struct iovec *)iov,
+                                                       nr_segs, count);
+                       }
+                       retval = generic_file_direct_IO(READ, inode, 
+                                       iov, pos, nr_segs);
                        if (retval > 0)
                                *ppos = pos + retval;
                }
@@ -1152,27 +1160,42 @@ generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
                goto out;
        }
 
-       retval = -EFAULT;
-       if (access_ok(VERIFY_WRITE, buf, count)) {
-               retval = 0;
+       for (seg = 0; seg < nr_segs; seg++) {
+               if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len))
+                       return -EFAULT;
+       }
 
-               if (count) {
+       retval = 0;
+       if (count) {
+               for (seg = 0; seg < nr_segs; seg++) {
                        read_descriptor_t desc;
 
                        desc.written = 0;
-                       desc.count = count;
-                       desc.buf = buf;
+                       desc.buf = iov[seg].iov_base;
+                       desc.count = iov[seg].iov_len;
+                       if (desc.count == 0)
+                               continue;
                        desc.error = 0;
                        do_generic_file_read(filp,ppos,&desc,file_read_actor);
-                       retval = desc.written;
-                       if (!retval)
+                       retval += desc.written;
+                       if (!retval) {
                                retval = desc.error;
+                               break;
+                       }
                }
        }
 out:
        return retval;
 }
 
+ssize_t
+generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
+{
+       struct iovec local_iov = { .iov_base = buf, .iov_len = count };
+       /*
+        * A plain read() is the one-segment case of the vectored read:
+        * buf/count are wrapped in a single-entry iovec on the stack and
+        * handed to __generic_file_read(), whose result is returned as-is.
+        */
+       return __generic_file_read(filp, &local_iov, 1, ppos);
+}
+
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
        ssize_t written;
@@ -1926,11 +1949,14 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
  * it for writing by marking it dirty.
  *                                                     okir@monad.swb.de
  */
-ssize_t generic_file_write_nolock(struct file *file, const char *buf,
-                                 size_t count, loff_t *ppos)
+ssize_t
+generic_file_write_nolock(struct file *file, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t *ppos)
 {
        struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
        struct address_space_operations *a_ops = mapping->a_ops;
+       const size_t ocount = iov_length(iov, nr_segs);
+       size_t count =  ocount;
        struct inode    *inode = mapping->host;
        unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
        long            status = 0;
@@ -1942,12 +1968,19 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
        unsigned        bytes;
        time_t          time_now;
        struct pagevec  lru_pvec;
+       struct iovec    *cur_iov;
+       unsigned        iov_bytes;      /* Cumulative count to the end of the
+                                          current iovec */
+       unsigned long   seg;
+       char            *buf;
 
        if (unlikely((ssize_t)count < 0))
                return -EINVAL;
 
-       if (unlikely(!access_ok(VERIFY_READ, buf, count)))
-               return -EFAULT;
+       for (seg = 0; seg < nr_segs; seg++) {
+               if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len))
+                       return -EFAULT;
+       }
 
        pos = *ppos;
        if (unlikely(pos < 0))
@@ -2045,9 +2078,13 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
                mark_inode_dirty_sync(inode);
        }
 
+       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
-               written = generic_file_direct_IO(WRITE, inode,
-                                               (char *)buf, pos, count);
+               if (count != ocount)
+                       nr_segs = iov_shorten((struct iovec *)iov,
+                                               nr_segs, count);
+               written = generic_file_direct_IO(WRITE, inode, 
+                                       iov, pos, nr_segs);
                if (written > 0) {
                        loff_t end = pos + written;
                        if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
@@ -2065,6 +2102,9 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
                goto out_status;
        }
 
+       cur_iov = (struct iovec *)iov;
+       iov_bytes = cur_iov->iov_len;
+       buf = cur_iov->iov_base;
        do {
                unsigned long index;
                unsigned long offset;
@@ -2075,6 +2115,8 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
                bytes = PAGE_CACHE_SIZE - offset;
                if (bytes > count)
                        bytes = count;
+               if (bytes + written > iov_bytes)
+                       bytes = iov_bytes - written;
 
                /*
                 * Bring in the user page that we will copy from _first_.
@@ -2084,7 +2126,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
                 */
                fault_in_pages_readable(buf, bytes);
 
-               page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
+               page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                if (!page) {
                        status = -ENOMEM;
                        break;
@@ -2115,6 +2157,11 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf,
                                count -= status;
                                pos += status;
                                buf += status;
+                               if (written == iov_bytes && count) {
+                                       cur_iov++;
+                                       iov_bytes += cur_iov->iov_len;
+                                       buf = cur_iov->iov_base;
+                               }
                        }
                }
                if (!PageReferenced(page))
@@ -2151,10 +2198,29 @@ ssize_t generic_file_write(struct file *file, const char *buf,
 {
        struct inode    *inode = file->f_dentry->d_inode->i_mapping->host;
        int             err;
+       struct iovec local_iov = { .iov_base = (void *)buf, .iov_len = count };
 
        down(&inode->i_sem);
-       err = generic_file_write_nolock(file, buf, count, ppos);
+       err = generic_file_write_nolock(file, &local_iov, 1, ppos);
        up(&inode->i_sem);
 
        return err;
 }
+/*
+ * Vectored counterpart of generic_file_read(): the caller's iov/nr_segs
+ * are passed unchanged to __generic_file_read() and its return value is
+ * handed straight back.
+ */
+ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
+                       unsigned long nr_segs, loff_t *ppos)
+{
+       return __generic_file_read(filp, iov, nr_segs, ppos);
+}
+/*
+ * Vectored write entry point: holds the inode's i_sem across a single
+ * call to generic_file_write_nolock() for the whole iovec and returns
+ * that call's result.
+ *
+ * NOTE(review): generic_file_write() locks
+ * file->f_dentry->d_inode->i_mapping->host, while this locks
+ * file->f_dentry->d_inode directly.  Confirm the two always name the
+ * same inode for every file that uses this helper.
+ */
+ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
+                       unsigned long nr_segs, loff_t * ppos) 
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       ssize_t ret;
+
+       down(&inode->i_sem);
+       ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
+       up(&inode->i_sem);
+       return ret;
+}