

fdatasync can fail to wait on some pages due to a race.

If some task (e.g. pdflush) is flushing the same mapping, it can clear a page's
dirty tag without then marking that page as being under writeback, because
pdflush hits a locked buffer in __block_write_full_page().  This happens when
kjournald is writing the buffer.  In this situation __block_write_full_page()
will redirty the page so that fsync eventually notices it, but there is a
window during which the page carries neither the dirty tag nor the writeback
tag in the radix tree, and so eludes the radix-tree dirty-page walk.

Consequently, a concurrent fsync will fail to notice the page when walking the
radix tree's dirty pages.
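
The window, as an illustrative interleaving (a sketch, not a trace from a
real system):

  1. pdflush: test_clear_page_dirty(page) clears PG_dirty and the
     radix-tree dirty tag.
  2. pdflush: __block_write_full_page() finds a buffer locked by
     kjournald, so writeback is not started and no writeback tag is set.
  3. fsync:   walks PAGECACHE_TAG_DIRTY; the page is neither tagged dirty
     nor tagged writeback, so it is skipped.
  4. pdflush: redirties the page, but this fsync has already passed it by.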

The approach taken by this patch is to leave the page marked as dirty in the
radix tree while ->writepage is working out what to do with it.  This ensures
that a concurrent write-for-sync will successfully locate the page and will
then block in lock_page() until the non-write-for-sync code has finished
altering the page state.
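
As a rough sketch of the protocol this sets up for a ->writepage
implementation (illustrative only: example_writepage() and
can_write_all_buffers() are made-up names; the real decision is taken
inside __block_write_full_page() and friends):

	static int example_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		/*
		 * The caller (e.g. mpage_writepages) has already done
		 * lock_page() and clear_page_dirty_for_io(): PG_dirty is
		 * clear but the radix-tree dirty tag is still set.
		 */
		if (can_write_all_buffers(page)) {
			/*
			 * Sets the writeback tag and, with this patch,
			 * clears the now-stale dirty tag.
			 */
			set_page_writeback(page);
			/* ... submit the I/O ... */
		} else {
			/*
			 * Cannot write it out now: set_page_dirty() brings
			 * PG_dirty and the radix-tree dirty tag back into
			 * sync.
			 */
			set_page_dirty(page);
		}
		unlock_page(page);
		return 0;
	}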


---

 25-akpm/fs/mpage.c          |    2 +-
 25-akpm/include/linux/mm.h  |    1 +
 25-akpm/mm/page-writeback.c |   35 ++++++++++++++++++++++++++++++++++-
 25-akpm/mm/vmscan.c         |    2 +-
 4 files changed, 37 insertions(+), 3 deletions(-)

diff -puN fs/mpage.c~clear_page_dirty_for_io fs/mpage.c
--- 25/fs/mpage.c~clear_page_dirty_for_io	2004-04-03 03:00:16.454730056 -0800
+++ 25-akpm/fs/mpage.c	2004-04-03 03:00:16.461728992 -0800
@@ -643,7 +643,7 @@ mpage_writepages(struct address_space *m
 				wait_on_page_writeback(page);
 
 			if (page->mapping == mapping && !PageWriteback(page) &&
-						test_clear_page_dirty(page)) {
+						clear_page_dirty_for_io(page)) {
 				if (writepage) {
 					ret = (*writepage)(page, wbc);
 					if (ret) {
diff -puN mm/page-writeback.c~clear_page_dirty_for_io mm/page-writeback.c
--- 25/mm/page-writeback.c~clear_page_dirty_for_io	2004-04-03 03:00:16.455729904 -0800
+++ 25-akpm/mm/page-writeback.c	2004-04-03 03:00:16.462728840 -0800
@@ -472,7 +472,7 @@ int write_one_page(struct page *page, in
 	if (wait)
 		wait_on_page_writeback(page);
 
-	if (test_clear_page_dirty(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		page_cache_get(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
@@ -574,6 +574,36 @@ int test_clear_page_dirty(struct page *p
 EXPORT_SYMBOL(test_clear_page_dirty);
 
 /*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+/*
  * Clear a page's dirty flag while ignoring dirty memory accounting
  */
 int __clear_page_dirty(struct page *page)
@@ -629,6 +659,9 @@ int test_set_page_writeback(struct page 
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
diff -puN include/linux/mm.h~clear_page_dirty_for_io include/linux/mm.h
--- 25/include/linux/mm.h~clear_page_dirty_for_io	2004-04-03 03:00:16.456729752 -0800
+++ 25-akpm/include/linux/mm.h	2004-04-03 03:00:16.463728688 -0800
@@ -472,6 +472,7 @@ int get_user_pages(struct task_struct *t
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 int set_page_dirty_lock(struct page *page);
+int clear_page_dirty_for_io(struct page *page);
 
 /*
  * Prototype to add a shrinker callback for ageable caches.
diff -puN mm/vmscan.c~clear_page_dirty_for_io mm/vmscan.c
--- 25/mm/vmscan.c~clear_page_dirty_for_io	2004-04-03 03:00:16.458729448 -0800
+++ 25-akpm/mm/vmscan.c	2004-04-03 03:00:16.464728536 -0800
@@ -354,7 +354,7 @@ shrink_list(struct list_head *page_list,
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			if (test_clear_page_dirty(page)) {
+			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
 					.sync_mode = WB_SYNC_NONE,

_
