[PATCH] dio: only call aio_complete() after returning -EIOCBQUEUED

author Zach Brown <zach.brown@oracle.com>

Sun, 10 Dec 2006 10:21:05 +0000 (02:21 -0800)

committer Linus Torvalds <torvalds@woody.osdl.org>

Sun, 10 Dec 2006 17:57:21 +0000 (09:57 -0800)
author Zach Brown <zach.brown@oracle.com>
Sun, 10 Dec 2006 10:21:05 +0000 (02:21 -0800)
committer Linus Torvalds <torvalds@woody.osdl.org>
Sun, 10 Dec 2006 17:57:21 +0000 (09:57 -0800)
diff --git a/fs/direct-io.c b/fs/direct-io.c

index f11f05dc9e61a9e6f3c7bb12e3948b68c8cc4d82..71f4aeac7632328ead009dbe7d2411f670e3cf2f 100644 (file)
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -226,6 +226,15 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
  {
         ssize_t transferred = 0;
  
+       /*
+        * AIO submission can race with bio completion to get here while
+        * expecting to have the last io completed by bio completion.
+        * In that case -EIOCBQUEUED is in fact not an error we want
+        * to preserve through this call.
+        */
+       if (ret == -EIOCBQUEUED)
+               ret = 0;
+
         if (dio->result) {
                 transferred = dio->result;
  
@@ -251,24 +260,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
         return ret;
  }
  
-/*
- * Called when a BIO has been processed.  If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
- */
-static void dio_complete_aio(struct dio *dio)
-{
-       int ret;
-
-       ret = dio_complete(dio, dio->iocb->ki_pos, 0);
-
-       /* Complete AIO later if falling back to buffered i/o */
-       if (dio->result == dio->size ||
-               ((dio->rw == READ) && dio->result)) {
-               aio_complete(dio->iocb, ret, 0);
-               kfree(dio);
-       }
-}
-
  static int dio_bio_complete(struct dio *dio, struct bio *bio);
  /*
   * Asynchronous IO callback. 
@@ -290,8 +281,11 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
         if (remaining == 1 && waiter_holds_ref)
                 wake_up_process(dio->waiter);
  
-       if (remaining == 0)
-               dio_complete_aio(dio);
+       if (remaining == 0) {
+               int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+               aio_complete(dio->iocb, ret, 0);
+               kfree(dio);
+       }
  
         return 0;
  }
@@ -1082,47 +1076,33 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                 mutex_unlock(&dio->inode->i_mutex);
  
         /*
-        * OK, all BIOs are submitted, so we can decrement bio_count to truly
-        * reflect the number of to-be-processed BIOs.
+        * The only time we want to leave bios in flight is when a successful
+        * partial aio read or full aio write has been set up.  In that case
+        * bio completion will call aio_complete.  The only time it's safe to
+        * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+        * This had *better* be the only place that raises -EIOCBQUEUED.
          */
-       if (dio->is_async) {
-               int should_wait = 0;
-
-               if (dio->result < dio->size && (rw & WRITE)) {
-                       dio->waiter = current;
-                       should_wait = 1;
-               }
-               if (ret == 0)
-                       ret = dio->result;
-
-               if (should_wait)
-                       dio_await_completion(dio);
-
-               /* this can free the dio */
-               if (atomic_dec_and_test(&dio->refcount))
-                       dio_complete_aio(dio);
+       BUG_ON(ret == -EIOCBQUEUED);
+       if (dio->is_async && ret == 0 && dio->result &&
+           ((rw & READ) || (dio->result == dio->size)))
+               ret = -EIOCBQUEUED;
  
-               if (should_wait)
-                       kfree(dio);
-       } else {
+       if (ret != -EIOCBQUEUED)
                 dio_await_completion(dio);
  
+       /*
+        * Sync will always be dropping the final ref and completing the
+        * operation.  AIO can too, if it was a broken operation as described
+        * above or if all the bios raced to complete before we got here.
+        * In that case dio_complete() translates the EIOCBQUEUED into
+        * the proper return code that the caller will hand to aio_complete().
+        */
+       if (atomic_dec_and_test(&dio->refcount)) {
                 ret = dio_complete(dio, offset, ret);
+               kfree(dio);
+       } else
+               BUG_ON(ret != -EIOCBQUEUED);
  
-               /* We could have also come here on an AIO file extend */
-               if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-                   ret >= 0 && dio->result == dio->size)
-                       /*
-                        * For AIO writes where we have completed the
-                        * i/o, we have to mark the the aio complete.
-                        */
-                       aio_complete(iocb, ret, 0);
-
-               if (atomic_dec_and_test(&dio->refcount))
-                       kfree(dio);
-               else
-                       BUG();
-       }
         return ret;
  }
  
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c

index 8e6b56fc1cad4b52504dd9c4055a3fda3a6d595d..b56eb754e2d23d6f87208a9cff5c9c759193f0d8 100644 (file)
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1406,7 +1406,7 @@ xfs_vm_direct_IO(
                         xfs_end_io_direct);
         }
  
-       if (unlikely(ret <= 0 && iocb->private))
+       if (unlikely(ret != -EIOCBQUEUED && iocb->private))
                 xfs_destroy_ioend(iocb->private);
         return ret;
  }
diff --git a/mm/filemap.c b/mm/filemap.c

index 606432f71b3a17dafd791b46361c29a996beec18..8332c77b1bd123fdd76b2a9b2d966711a45b124b 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                 if (pos < size) {
                         retval = generic_file_direct_IO(READ, iocb,
                                                 iov, pos, nr_segs);
-                       if (retval > 0 && !is_sync_kiocb(iocb))
-                               retval = -EIOCBQUEUED;
                         if (retval > 0)
                                 *ppos = pos + retval;
                 }
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
          * Sync the fs metadata but not the minor inode changes and
          * of course not the data as we did direct DMA for the IO.
          * i_mutex is held, which protects generic_osync_inode() from
-        * livelocking.
+        * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
          */
-       if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+       if ((written >= 0 || written == -EIOCBQUEUED) &&
+           ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
                 if (err < 0)
                         written = err;
         }
-       if (written == count && !is_sync_kiocb(iocb))
-               written = -EIOCBQUEUED;
         return written;
  }
  EXPORT_SYMBOL(generic_file_direct_write);
author	Zach Brown <zach.brown@oracle.com>
	Sun, 10 Dec 2006 10:21:05 +0000 (02:21 -0800)
committer	Linus Torvalds <torvalds@woody.osdl.org>
	Sun, 10 Dec 2006 17:57:21 +0000 (09:57 -0800)
fs/direct-io.c		patch \| blob \| history
fs/xfs/linux-2.6/xfs_aops.c		patch \| blob \| history
mm/filemap.c		patch \| blob \| history